## Feature Engineering of Test Data Set

In [None]:
import pandas as pd
import datetime as dt 

In [44]:
# Load the three input tables from the shared data directory.
# NOTE(review): `path` is an absolute, machine-specific directory; it is also
# reused when the final CSV is written, so it must keep its trailing slash.
path = '/Users/reneehall/Documents/Learning/lighthouse_labs/coursework-lighthouse-labs/Mid-term project/colab_data/'

# fl_df   : test flights with the weather data from Ryan already included
# fl_df2  : flights sample table, used as the reference for mean-delay features
# pass_df : passengers table, used for the passenger/freight/airport-size features
fl_df, fl_df2, pass_df = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'flights_test_with_weather_data_Jan1_to_7_2020_v2.csv',
        'cleaned_balanced_sample.csv',
        'cleaned_passengers.csv',
    )
)

# The cells below apply to the test data the same feature-engineering process
# that was used to create the features of the training set.

In [93]:
# Target column index needed by the model, kept here as a reference checklist.
# The string below is a pasted `Index(...)` repr; it is not assigned to anything
# and is not used by any later cell.
"""
Index(['index', 'mkt_carrier', 'origin', 'dest', 'origin_fl_density',
       'dest_fl_density', 'day', 'month', 'year', 'weekday', 'crs_arr_hour',
       'crs_dep_hour', 'predicted_speed', 'origin_num_passengers',
       'dest_num_passengers', 'origin_num_freight', 'dest_num_freight',
       'dist_group', 'season', 'origin_airport_size', 'dest_airport_size',
       'mean_op_carrier_delay', 'mean_delay_origin_airport',
       'mean_delay_dest_airport', 'sunHour', 'cloudcover', 'precipMM',
       'hrly_bin', 'avg_hr_fl', 'avg_day_fl', 'dep_delay_lag',
       'ddl_rolling_mean', 'mean_carrier_arr_delay', 'tail_num_arr_delay',
       'totalSnow_mm', 'weather_type_Rainy', 'weather_type_Snowy',
       'weather_type_Sunny'],
"""

In [54]:
# 'origin_fl_density': number of flights per (date, origin airport) in the test set.
origin_density_keys = ['fl_date', 'origin_airport_id']

# Count the non-null `flights` entries for every date/origin pair.
origin_daily_flights = (
    fl_df.groupby(origin_density_keys)['flights']
    .count()
    .sort_values(ascending=False)
)

# Inner-join the counts back onto every flight row. Both sides carry a `flights`
# column, so pandas suffixes them: the frame's own column becomes `flights_x`
# (the next cell relies on that name) and the count arrives as `flights_y`.
fl_df = fl_df.merge(origin_daily_flights, on=origin_density_keys).rename(
    columns={'flights_y': 'origin_fl_density'}
)

In [61]:
# 'dest_fl_density': number of flights per (date, destination airport) in the test set.
dest_density_keys = ['fl_date', 'dest_airport_id']

# At this point the original `flights` column is called `flights_x`
# (a leftover of the origin-density merge), so count on that column.
dest_daily_flights = (
    fl_df.groupby(dest_density_keys)['flights_x']
    .count()
    .sort_values(ascending=False)
)

# Join on date and dest_airport_id. The name clash yields `flights_x_x` (the
# original column, restored to `flights`) and `flights_x_y` (the new count).
fl_df = fl_df.merge(dest_daily_flights, on=dest_density_keys).rename(
    columns={'flights_x_x': 'flights', 'flights_x_y': 'dest_fl_density'}
)

In [None]:
# Create the calendar features 'day', 'month', 'year' and 'weekday'.
# `fl_date` is parsed once and each date part is read from the same accessor.
flight_date_parts = pd.to_datetime(fl_df['fl_date']).dt
for date_part in ('day', 'month', 'year', 'weekday'):
    fl_df[date_part] = getattr(flight_date_parts, date_part)

In [None]:
# 'crs_arr_hour', 'crs_dep_hour'
def _hhmm_to_hour(hhmm):
    """Return the hour part of scheduled times encoded as HHMM numbers.

    Integer division by 100 drops the two minute digits, so 1430 -> 14,
    930 -> 9 and 45 -> 0. The previous approach sliced the last two characters
    off the string form, which gives the same result for integer columns but
    silently produces wrong hours when the column is float
    ('1430.0'[:-2] -> '1430').

    Parameters
    ----------
    hhmm : pandas.Series
        Scheduled times as HHMM values (numeric, or strings of digits).

    Returns
    -------
    pandas.Series
        Integer hour for each entry.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number, or is missing (NaN cannot be
        cast to int) -- missing values also raised with the string approach.
    """
    return (pd.to_numeric(hhmm) // 100).astype('int')


fl_df['crs_arr_hour'] = _hhmm_to_hour(fl_df['crs_arr_time'])
fl_df['crs_dep_hour'] = _hhmm_to_hour(fl_df['crs_dep_time'])

In [68]:
# 'predicted_speed': scheduled distance covered per unit of scheduled elapsed time.
# NOTE(review): units follow the source columns (presumably miles per minute --
# confirm). A zero `crs_elapsed_time` yields inf/NaN rather than an error.
fl_df['predicted_speed'] = fl_df['distance'].div(fl_df['crs_elapsed_time'])

In [72]:
# Airport size: total passengers per airport, bucketed into four size classes.
# Only `passengers` is needed, so aggregate just that column instead of summing
# every column of `pass_df` (a whole-frame sum is wasted work and, from
# pandas 2.0 on, concatenates or fails on non-numeric columns).
airport_size = (
    pass_df.groupby(by=['origin_airport_id'])['passengers']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)

# 1: small, 2: medium, 3: large, 4: extra large.
# Intervals are right-closed, so a total of exactly 0 or above 300,000,000
# falls outside every bin and gets a missing size.
bins = [0, 100000, 1000000, 10000000, 300000000]
labels = [1, 2, 3, 4]
airport_size['size'] = pd.cut(airport_size['passengers'], bins=bins, labels=labels)

# 'origin_airport_size'
# Inner join: flights whose origin airport is absent from `airport_size` are
# dropped here. NOTE(review): confirm that losing those test rows is intended.
fl_df = pd.merge(fl_df, airport_size, on=['origin_airport_id'])
# Keep only the size class; the passenger total was just the binning input.
fl_df = fl_df.drop(columns='passengers').rename(columns={'size': 'origin_airport_size'})

In [76]:
# 'dest_airport_size'
# Look up the destination airport in the same `airport_size` table, whose key
# is named `origin_airport_id`. The inner join drops flights whose destination
# is not in that table; the helper `passengers` column is discarded afterwards.
fl_df = (
    fl_df.merge(airport_size, left_on='dest_airport_id', right_on='origin_airport_id')
    .drop(columns='passengers')
    .rename(columns={'size': 'dest_airport_size'})
)

In [83]:
# 'season'
# 1 = winter (Dec-Feb), 2 = spring (Mar-May), 3 = summer (Jun-Aug), 4 = fall (Sep-Nov)
season_edges = [0, 2, 5, 8, 11]
season_codes = [1, 2, 3, 4]

# The right-closed bins cover months 1-11 only, so December (12) comes out of
# pd.cut as missing; filling with 1 puts it in winter with January/February.
fl_df['season'] = pd.cut(
    fl_df['month'], bins=season_edges, labels=season_codes
).fillna(1)

In [87]:
# 'dist_group'
# NOTE: 500-wide distance bands from 0 to 5500; label 1 is the shortest band
# and label 11 the longest.
distance_edges = list(range(0, 5501, 500))
distance_labels = list(range(1, 12))
fl_df['dist_group'] = pd.cut(
    fl_df['distance'], bins=distance_edges, labels=distance_labels
)

In [96]:
# 'mean_op_carrier_delay'
# Mean arrival delay per operating carrier, computed on the sample dataset
# (`fl_df2`) and applied to the test set.
op_carrier_mean_delay = (
    fl_df2.groupby('op_unique_carrier')['arr_delay']
    .mean()
    .sort_values(ascending=False)
)

# Inner join: test flights of a carrier that is absent from the sample are dropped.
fl_df = fl_df.merge(op_carrier_mean_delay, on='op_unique_carrier').rename(
    columns={'arr_delay': 'mean_op_carrier_delay'}
)

In [104]:
# 'mean_delay_origin_airport' and 'mean_delay_dest_airport'
# Mean arrival delay per origin airport, taken from the sample dataset
# (`fl_df2`) and applied to the test set.
origin_airport_mean_delay = fl_df2.groupby('origin_airport_id')['arr_delay'].mean()

# Attach the mean for each flight's origin airport (inner join).
fl_df = fl_df.merge(origin_airport_mean_delay, on='origin_airport_id').rename(
    columns={'arr_delay': 'mean_delay_origin_airport'}
)

# Attach a mean for each flight's destination airport (inner join).
# NOTE(review): this reuses the means grouped by *origin* airport, looked up by
# the destination id -- i.e. the delay of flights departing from that airport,
# not arriving at it. Confirm this matches how the training feature was built
# before changing it.
fl_df = fl_df.merge(
    origin_airport_mean_delay,
    left_on='dest_airport_id',
    right_on='origin_airport_id',
).rename(columns={'arr_delay': 'mean_delay_dest_airport'})

In [116]:
# 'origin_num_passengers'
# Total passengers per (origin airport, month) from the passengers table.
origin_passenger_keys = ['origin_airport_id', 'month']
pass_per_mo_origin_airport = (
    pass_df.groupby(origin_passenger_keys)['passengers']
    .sum()
    .sort_values(ascending=False)
)

# Left join keeps every flight; airport/month pairs without passenger data get NaN.
fl_df = fl_df.merge(
    pass_per_mo_origin_airport, how='left', on=origin_passenger_keys
).rename(columns={'passengers': 'origin_num_passengers'})

In [134]:
# Some passenger values are missing for a size-1 origin airport; replace them
# with the mean passenger count over flights from size-1 origin airports.
# Every missing value in the column receives this mean, whatever its airport size.
from_size_1_origin = fl_df['origin_airport_size'] == 1
size_1_mean = fl_df.loc[from_size_1_origin, 'origin_num_passengers'].mean()
fl_df['origin_num_passengers'] = fl_df['origin_num_passengers'].fillna(size_1_mean)

4096.382113821138

In [139]:
# 'dest_num_passengers'
# Total passengers per (destination airport, month) from the passengers table.
dest_passenger_keys = ['dest_airport_id', 'month']
pass_per_mo_dest_airport = (
    pass_df.groupby(dest_passenger_keys)['passengers']
    .sum()
    .sort_values(ascending=False)
)

# Left join keeps every flight; airport/month pairs without passenger data get NaN.
fl_df = fl_df.merge(
    pass_per_mo_dest_airport, how='left', on=dest_passenger_keys
).rename(columns={'passengers': 'dest_num_passengers'})

In [146]:
# Some passenger values are missing for a size-1 destination airport; replace
# them with `size_1_mean`. That mean was computed from size-1 *origin* airports
# and is reused here rather than recomputed for destinations.
missing_filled = fl_df['dest_num_passengers'].fillna(size_1_mean)
fl_df['dest_num_passengers'] = missing_filled

In [150]:
# 'origin_num_freight'
# Total freight per (origin airport, month) from the passengers table.
origin_freight_keys = ['origin_airport_id', 'month']
origin_mo_freight = (
    pass_df.groupby(origin_freight_keys)['freight']
    .sum()
    .sort_values(ascending=False)
)

# Left join keeps every flight; airport/month pairs without freight data get NaN.
fl_df = fl_df.merge(origin_mo_freight, how='left', on=origin_freight_keys).rename(
    columns={'freight': 'origin_num_freight'}
)

In [156]:
# Some freight values are missing for a size-1 origin airport; replace them
# with the mean freight over flights from size-1 origin airports.
# Every missing value in the column receives this mean, whatever its airport size.
freight_from_size_1_origin = fl_df['origin_airport_size'] == 1
size_1_fmean = fl_df.loc[freight_from_size_1_origin, 'origin_num_freight'].mean()
fl_df['origin_num_freight'] = fl_df['origin_num_freight'].fillna(size_1_fmean)

60044.55894308943

In [162]:
# 'dest_num_freight'
# Total freight per (destination airport, month) from the passengers table.
dest_freight_keys = ['dest_airport_id', 'month']
dest_mo_freight = (
    pass_df.groupby(dest_freight_keys)['freight']
    .sum()
    .sort_values(ascending=False)
)

# Left join keeps every flight; airport/month pairs without freight data get NaN.
fl_df = fl_df.merge(dest_mo_freight, how='left', on=dest_freight_keys).rename(
    columns={'freight': 'dest_num_freight'}
)

In [169]:
# Some freight values are missing for a size-1 destination airport; replace
# them with `size_1_fmean`. That mean was computed from size-1 *origin* airports
# and is reused here rather than recomputed for destinations.
freight_filled = fl_df['dest_num_freight'].fillna(size_1_fmean)
fl_df['dest_num_freight'] = freight_filled

In [None]:
# The remaining features are from Tim; this file will be passed on to him so he
# can incorporate them into the final test file:
#   'hrly_bin', 'avg_hr_fl', 'avg_day_fl', 'dep_delay_lag',
#   'ddl_rolling_mean', 'mean_carrier_arr_delay', 'tail_num_arr_delay'
# (Kept as comments: the earlier indented string literal made this cell fail
# with an IndentationError when executed.)

In [172]:
# Write the engineered test set next to the input files, without the row index.
final_test_csv = path + 'final_test_data.csv'
fl_df.to_csv(final_test_csv, index=False)