### __Data Preprocessing: Regression Model__

In [62]:
# import libraries
import numpy as np
import pandas as pd
import random
from datetime import datetime, timezone, timedelta
import datetime as dt
import time

# to be able to see multiple ouputs from sungle cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### __Preprocessing Data Steps__
> - Step 1. Remove insignificant variables:
    - string variables: branded_code_share, mkt_carrier, tail_num, origin, origin_city_name, dest, dest_city_name, dup, flights
    - variables with high correlation: mkt_carrier_fl_num, op_carrier_fl_num, distance (highly correlated to crs_elapsed_time)
> - Step 2. Create additional features: 
    - year, month and day_of_week from 'fl_date'
    - hours from both, 'crs_dep_time' and 'crs_arr_time'
> - Step 3. Get features from passengers and fuel
    - from passengers_summary: speed, passengers_mean, and passengers_sum
    - from fuel_summary: total_cost mean and sum, and total_gallons mean and sum
> - Setp 4. Create dummy variables based on the following features:
    - year
    - month
    - day_of_week
    - mkt_unique_carrier & op_unique_carrier
    - origin_airport_id & dest_airport_id
    - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

-------------------------------------------------

#### __Import Datasets__

In [90]:
# Import training csv file
df = pd.read_csv('reduced_train_df_65kv2.csv')
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65083 entries, 0 to 65082
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fl_date             65083 non-null  object 
 1   mkt_unique_carrier  65083 non-null  object 
 2   branded_code_share  65083 non-null  object 
 3   mkt_carrier         65083 non-null  object 
 4   mkt_carrier_fl_num  65083 non-null  int64  
 5   op_unique_carrier   65083 non-null  object 
 6   tail_num            65083 non-null  object 
 7   op_carrier_fl_num   65083 non-null  int64  
 8   origin_airport_id   65083 non-null  int64  
 9   origin              65083 non-null  object 
 10  origin_city_name    65083 non-null  object 
 11  dest_airport_id     65083 non-null  int64  
 12  dest                65083 non-null  object 
 13  dest_city_name      65083 non-null  object 
 14  crs_dep_time        65083 non-null  int64  
 15  crs_arr_time        65083 non-null  int64  
 16  dup 

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay,target_class
0,2019-05-01,DL,DL,DL,1515,DL,N929AT,1515,15380,TVC,...,DTW,"Detroit, MI",510,616,N,66.0,1.0,207.0,4.0,1
1,2019-05-01,DL,DL,DL,1700,DL,N896DN,1700,10397,ATL,...,SMF,"Sacramento, CA",857,1050,N,293.0,1.0,2092.0,10.0,1
2,2019-05-01,DL,DL,DL,1757,DL,N972AT,1757,13342,MKE,...,DTW,"Detroit, MI",530,742,N,72.0,1.0,237.0,2.0,1


-----------------------------------------

#### __Import Summary Files__
> - Passenger Summary
> - Fuel Summary

In [91]:
passengers = pd.read_csv('passengers_summary.csv')
passengers.info()
passengers.head()

fuel = pd.read_csv('fuel_summary.csv')
fuel.info()
fuel.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139480 entries, 0 to 139479
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   item_id           139480 non-null  object 
 1   carrier_month_id  139480 non-null  object 
 2   dest_airport_id   139480 non-null  int64  
 3   unique_carrier    139480 non-null  object 
 4   month             139480 non-null  int64  
 5   air_time          139480 non-null  float64
 6   distance          139480 non-null  float64
 7   passengers        139480 non-null  float64
 8   speed             139480 non-null  float64
 9   mean_passengers   139480 non-null  float64
 10  sum_passengers    139480 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 11.7+ MB


Unnamed: 0,item_id,carrier_month_id,dest_airport_id,unique_carrier,month,air_time,distance,passengers,speed,mean_passengers,sum_passengers
0,100055V10,5V10,10005,5V,10,954.0,1325.0,16.0,1.388889,10.0,8580
1,100145V10,5V10,10014,5V,10,82.0,70.0,0.0,0.853659,10.0,8580
2,100645V10,5V10,10064,5V,10,30.0,224.0,0.0,7.466667,10.0,8580
3,101395V10,5V10,10139,5V,10,91.0,323.0,0.0,3.549451,10.0,8580
4,101845V10,5V10,10184,5V,10,274.0,546.0,17.0,1.992701,10.0,8580


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   carrier_month_id    756 non-null    object 
 1   carrier             756 non-null    object 
 2   month               756 non-null    int64  
 3   total_cost:mean     756 non-null    float64
 4   total_cost:sum      756 non-null    float64
 5   total_gallons:mean  756 non-null    float64
 6   total_gallons:sum   756 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 41.5+ KB


Unnamed: 0,carrier_month_id,carrier,month,total_cost:mean,total_cost:sum,total_gallons:mean,total_gallons:sum
0,09Q1,09Q,1,2878729.0,8636187.0,1018414.0,3055241.0
1,09Q10,09Q,10,2892003.0,8676010.0,1210001.0,3630002.0
2,09Q11,09Q,11,2627896.0,7883689.0,1006282.0,3018846.0
3,09Q12,09Q,12,2743162.0,8229487.0,932129.0,2796387.0
4,09Q2,09Q,2,2266957.0,9067827.0,876080.5,3504322.0


-------------------------------------------

#### Step 1. Remove insignificant variables

In [92]:
# start time to measure the time of the program execution
start_time = time.time()

# drop columns
df = df.drop(columns=['branded_code_share', 'mkt_carrier', 'tail_num', 
                      'origin', 'origin_city_name', 'dest', 'dest_city_name', 
                      'dup', 'flights', 'mkt_carrier_fl_num', 
                      'op_carrier_fl_num', 'distance'
])

# rearrange columns
rearranged_columns = ['fl_date', 'mkt_unique_carrier', 'op_unique_carrier',
                      'origin_airport_id', 'dest_airport_id', 'crs_dep_time',
                     'crs_arr_time', 'crs_elapsed_time', 'target_class', 
                     'arr_delay']

df = df[rearranged_columns]

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65083 entries, 0 to 65082
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fl_date             65083 non-null  object 
 1   mkt_unique_carrier  65083 non-null  object 
 2   op_unique_carrier   65083 non-null  object 
 3   origin_airport_id   65083 non-null  int64  
 4   dest_airport_id     65083 non-null  int64  
 5   crs_dep_time        65083 non-null  int64  
 6   crs_arr_time        65083 non-null  int64  
 7   crs_elapsed_time    65083 non-null  float64
 8   target_class        65083 non-null  int64  
 9   arr_delay           65083 non-null  float64
dtypes: float64(2), int64(5), object(3)
memory usage: 5.0+ MB


Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,target_class,arr_delay
0,2019-05-01,DL,DL,15380,11433,510,616,66.0,1,4.0
1,2019-05-01,DL,DL,10397,14893,857,1050,293.0,1,10.0
2,2019-05-01,DL,DL,13342,11433,530,742,72.0,1,2.0
3,2019-05-01,UA,YX,11057,11618,1725,1928,123.0,1,11.0
4,2019-05-01,UA,YX,11292,10397,1335,1837,182.0,1,85.0




--- 0.07651710510253906 seconds ---


#### Step 2. Create additional features

In [93]:
# start time to measure the time of the program execution
start_time = time.time()

# convert 'fl_date' to datetime ('2019-05-01')
df['fl_date'] = pd.to_datetime(df['fl_date'], format='%Y-%m-%d')

# compute year, month and weekdays from 'fl_date' in df
month_train = [x.month for x in df['fl_date']]
year_train = [x.year for x in df['fl_date']]
dayofweek_train = df['fl_date'].dt.dayofweek

# compute hours frpm 'crs_dep_time' and crs_arr_time'
hours_dep = [int(x/100) for x in df['crs_dep_time']]
hours_arr = [int(x/100) for x in df['crs_arr_time']]

# insert new columns in df
df.insert(loc=1, column='year', value=year_train)
df.insert(loc=2, column='month', value=month_train)
df.insert(loc=3, column='day_of_week', value=dayofweek_train)
df.insert(loc=10, column='crs_dep_hours', value=hours_dep)
df.insert(loc=11, column='crs_arr_hours', value=hours_arr)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65083 entries, 0 to 65082
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fl_date             65083 non-null  datetime64[ns]
 1   year                65083 non-null  int64         
 2   month               65083 non-null  int64         
 3   day_of_week         65083 non-null  int64         
 4   mkt_unique_carrier  65083 non-null  object        
 5   op_unique_carrier   65083 non-null  object        
 6   origin_airport_id   65083 non-null  int64         
 7   dest_airport_id     65083 non-null  int64         
 8   crs_dep_time        65083 non-null  int64         
 9   crs_arr_time        65083 non-null  int64         
 10  crs_dep_hours       65083 non-null  int64         
 11  crs_arr_hours       65083 non-null  int64         
 12  crs_elapsed_time    65083 non-null  float64       
 13  target_class        65083 non-null  int64     

Unnamed: 0,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,target_class,arr_delay
0,2019-05-01,2019,5,2,DL,DL,15380,11433,510,616,5,6,66.0,1,4.0
1,2019-05-01,2019,5,2,DL,DL,10397,14893,857,1050,8,10,293.0,1,10.0
2,2019-05-01,2019,5,2,DL,DL,13342,11433,530,742,5,7,72.0,1,2.0
3,2019-05-01,2019,5,2,UA,YX,11057,11618,1725,1928,17,19,123.0,1,11.0
4,2019-05-01,2019,5,2,UA,YX,11292,10397,1335,1837,13,18,182.0,1,85.0




--- 2.795344829559326 seconds ---


#### Step 3. Get features from passengers and fuel

In [94]:
# start time to measure the time of the program execution
start_time = time.time()

# create new id variable to merge with passenger_summary table
item_id = []

for num in range(len(df)):
    item = str(df.iloc[num, 7]) + df.iloc[num, 5] + str(df.iloc[num, 2])
    item_id.append(item)

df.insert(loc=0, column='item_id', value=item_id)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65083 entries, 0 to 65082
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             65083 non-null  object        
 1   fl_date             65083 non-null  datetime64[ns]
 2   year                65083 non-null  int64         
 3   month               65083 non-null  int64         
 4   day_of_week         65083 non-null  int64         
 5   mkt_unique_carrier  65083 non-null  object        
 6   op_unique_carrier   65083 non-null  object        
 7   origin_airport_id   65083 non-null  int64         
 8   dest_airport_id     65083 non-null  int64         
 9   crs_dep_time        65083 non-null  int64         
 10  crs_arr_time        65083 non-null  int64         
 11  crs_dep_hours       65083 non-null  int64         
 12  crs_arr_hours       65083 non-null  int64         
 13  crs_elapsed_time    65083 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,target_class,arr_delay
0,11433DL5,2019-05-01,2019,5,2,DL,DL,15380,11433,510,616,5,6,66.0,1,4.0
1,14893DL5,2019-05-01,2019,5,2,DL,DL,10397,14893,857,1050,8,10,293.0,1,10.0
2,11433DL5,2019-05-01,2019,5,2,DL,DL,13342,11433,530,742,5,7,72.0,1,2.0
3,11618YX5,2019-05-01,2019,5,2,UA,YX,11057,11618,1725,1928,17,19,123.0,1,11.0
4,10397YX5,2019-05-01,2019,5,2,UA,YX,11292,10397,1335,1837,13,18,182.0,1,85.0




--- 9.855679035186768 seconds ---


In [95]:
# start time to measure the time of the program execution
start_time = time.time()

# remove columns from passenger_summary that should not be merged to df
pass_df = passengers.drop(columns=['carrier_month_id', 'dest_airport_id',
                                   'unique_carrier', 'month', 'air_time',
                                   'distance', 'passengers'])

# inner join sum_pass_df with carrier_pass
df = df.merge(pass_df, how='left', on='item_id')
df = df.rename({'speed': 'speed_passengers'}, axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             65083 non-null  object        
 1   fl_date             65083 non-null  datetime64[ns]
 2   year                65083 non-null  int64         
 3   month               65083 non-null  int64         
 4   day_of_week         65083 non-null  int64         
 5   mkt_unique_carrier  65083 non-null  object        
 6   op_unique_carrier   65083 non-null  object        
 7   origin_airport_id   65083 non-null  int64         
 8   dest_airport_id     65083 non-null  int64         
 9   crs_dep_time        65083 non-null  int64         
 10  crs_arr_time        65083 non-null  int64         
 11  crs_dep_hours       65083 non-null  int64         
 12  crs_arr_hours       65083 non-null  int64         
 13  crs_elapsed_time    65083 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,sum_passengers
0,11433DL5,2019-05-01,2019,5,2,DL,DL,15380,11433,510,616,5,6,66.0,1,4.0,0.321053,5.0,79895
1,14893DL5,2019-05-01,2019,5,2,DL,DL,10397,14893,857,1050,8,10,293.0,1,10.0,0.218119,5.0,79895
2,11433DL5,2019-05-01,2019,5,2,DL,DL,13342,11433,530,742,5,7,72.0,1,2.0,0.321053,5.0,79895
3,11618YX5,2019-05-01,2019,5,2,UA,YX,11057,11618,1725,1928,17,19,123.0,1,11.0,0.290223,5.0,22455
4,10397YX5,2019-05-01,2019,5,2,UA,YX,11292,10397,1335,1837,13,18,182.0,1,85.0,0.188597,5.0,22455




--- 0.19785714149475098 seconds ---


In [96]:
# start time to measure the time of the program execution
start_time = time.time()

# for three new columns merged from passenger_summary,
#  if there are any missing values, fill missing values with min values
df['speed_passengers'] = df['speed_passengers'].fillna(df['speed_passengers'].min())
df['mean_passengers'] = df['mean_passengers'].fillna(df['mean_passengers'].min())
df['sum_passengers'] = df['sum_passengers'].fillna(df['sum_passengers'].min())

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             65083 non-null  object        
 1   fl_date             65083 non-null  datetime64[ns]
 2   year                65083 non-null  int64         
 3   month               65083 non-null  int64         
 4   day_of_week         65083 non-null  int64         
 5   mkt_unique_carrier  65083 non-null  object        
 6   op_unique_carrier   65083 non-null  object        
 7   origin_airport_id   65083 non-null  int64         
 8   dest_airport_id     65083 non-null  int64         
 9   crs_dep_time        65083 non-null  int64         
 10  crs_arr_time        65083 non-null  int64         
 11  crs_dep_hours       65083 non-null  int64         
 12  crs_arr_hours       65083 non-null  int64         
 13  crs_elapsed_time    65083 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,sum_passengers
0,11433DL5,2019-05-01,2019,5,2,DL,DL,15380,11433,510,616,5,6,66.0,1,4.0,0.321053,5.0,79895
1,14893DL5,2019-05-01,2019,5,2,DL,DL,10397,14893,857,1050,8,10,293.0,1,10.0,0.218119,5.0,79895
2,11433DL5,2019-05-01,2019,5,2,DL,DL,13342,11433,530,742,5,7,72.0,1,2.0,0.321053,5.0,79895
3,11618YX5,2019-05-01,2019,5,2,UA,YX,11057,11618,1725,1928,17,19,123.0,1,11.0,0.290223,5.0,22455
4,10397YX5,2019-05-01,2019,5,2,UA,YX,11292,10397,1335,1837,13,18,182.0,1,85.0,0.188597,5.0,22455




--- 0.08615350723266602 seconds ---


In [97]:
# start time to measure the time of the program execution
start_time = time.time()

# create new id variable to merge with passenger_summary table
carrier_month_id = []

for num in range(len(df)):
    item = df.iloc[num, 6] + str(df.iloc[num, 3])
    carrier_month_id.append(item)

df.insert(loc=1, column='carrier_month_id', value=carrier_month_id)

# remove columns from fuel_summary that should not be merged to df
fuel_df = fuel.drop(columns=['carrier', 'month'])

# inner join sum_pass_df with carrier_pass
df = df.merge(fuel_df, how='left', on='carrier_month_id')
df = df.rename({'total_cost:mean': 'total_cost:mean_fuel',
               'total_cost:sum': 'total_cost:sum_fuel',
               'total_gallons:mean': 'total_gallons:mean_fuel',
               'total_gallons:sum': 'total_gallons:sum_fuel'}, axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   item_id                  65083 non-null  object        
 1   carrier_month_id         65083 non-null  object        
 2   fl_date                  65083 non-null  datetime64[ns]
 3   year                     65083 non-null  int64         
 4   month                    65083 non-null  int64         
 5   day_of_week              65083 non-null  int64         
 6   mkt_unique_carrier       65083 non-null  object        
 7   op_unique_carrier        65083 non-null  object        
 8   origin_airport_id        65083 non-null  int64         
 9   dest_airport_id          65083 non-null  int64         
 10  crs_dep_time             65083 non-null  int64         
 11  crs_arr_time             65083 non-null  int64         
 12  crs_dep_hours            65083 n

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,...,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,11433DL5,DL5,2019-05-01,2019,5,2,DL,DL,15380,11433,...,66.0,1,4.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
1,14893DL5,DL5,2019-05-01,2019,5,2,DL,DL,10397,14893,...,293.0,1,10.0,0.218119,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
2,11433DL5,DL5,2019-05-01,2019,5,2,DL,DL,13342,11433,...,72.0,1,2.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
3,11618YX5,YX5,2019-05-01,2019,5,2,UA,YX,11057,11618,...,123.0,1,11.0,0.290223,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0
4,10397YX5,YX5,2019-05-01,2019,5,2,UA,YX,11292,10397,...,182.0,1,85.0,0.188597,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0




--- 6.7503674030303955 seconds ---


In [98]:
# for four new columns merged from fuel_summary,
#  if there are any missing values, fill missing values with 0
df['total_cost:mean_fuel'] = df['total_cost:mean_fuel'].fillna(df['total_cost:mean_fuel'].min())
df['total_cost:sum_fuel'] = df['total_cost:sum_fuel'].fillna(df['total_cost:sum_fuel'].min())
df['total_gallons:mean_fuel'] = df['total_gallons:mean_fuel'].fillna(df['total_gallons:mean_fuel'].min())
df['total_gallons:sum_fuel'] = df['total_gallons:sum_fuel'].fillna(df['total_gallons:sum_fuel'].min())

# check
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   item_id                  65083 non-null  object        
 1   carrier_month_id         65083 non-null  object        
 2   fl_date                  65083 non-null  datetime64[ns]
 3   year                     65083 non-null  int64         
 4   month                    65083 non-null  int64         
 5   day_of_week              65083 non-null  int64         
 6   mkt_unique_carrier       65083 non-null  object        
 7   op_unique_carrier        65083 non-null  object        
 8   origin_airport_id        65083 non-null  int64         
 9   dest_airport_id          65083 non-null  int64         
 10  crs_dep_time             65083 non-null  int64         
 11  crs_arr_time             65083 non-null  int64         
 12  crs_dep_hours            65083 n

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,...,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,11433DL5,DL5,2019-05-01,2019,5,2,DL,DL,15380,11433,...,66.0,1,4.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
1,14893DL5,DL5,2019-05-01,2019,5,2,DL,DL,10397,14893,...,293.0,1,10.0,0.218119,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
2,11433DL5,DL5,2019-05-01,2019,5,2,DL,DL,13342,11433,...,72.0,1,2.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0
3,11618YX5,YX5,2019-05-01,2019,5,2,UA,YX,11057,11618,...,123.0,1,11.0,0.290223,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0
4,10397YX5,YX5,2019-05-01,2019,5,2,UA,YX,11292,10397,...,182.0,1,85.0,0.188597,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0


#### Setp 4. Create dummy variables based on the following features
> - year
> - month
> - day_of_week
> - mkt_unique_carrier & op_unique_carrier
> - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

In [99]:
# start time to measure the time of the program execution
start_time = time.time()

# convert integer to string values for converting them to dummy variables
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)
df['day_of_week'] = df['day_of_week'].astype(str)
df['crs_dep_hours'] = df['crs_dep_hours'].astype(str)
df['crs_arr_hours'] = df['crs_arr_hours'].astype(str)
df['origin_airport_id'] = df['origin_airport_id'].astype(str)
df['dest_airport_id'] = df['dest_airport_id'].astype(str)

# create dummy variables
dummy_features = ['year', 'month', 'day_of_week', 'mkt_unique_carrier',
                 'origin_airport_id', 'dest_airport_id', 
                 'op_unique_carrier', 'crs_dep_hours', 'crs_arr_hours',
                 ]

df_dummy = pd.get_dummies(df[dummy_features])
df_dummy

# remove variables related to dummy_features from work_df
df = df.drop(dummy_features, axis=1)

# combine dummy variables with work_df
df = pd.concat([df, df_dummy], axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,year_2018,year_2019,month_1,month_10,month_11,month_12,month_2,month_3,month_4,month_5,...,crs_arr_hours_21,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65078,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65079,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
65080,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65081,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Columns: 857 entries, item_id to crs_arr_hours_9
dtypes: datetime64[ns](1), float64(8), int64(4), object(2), uint8(842)
memory usage: 60.2+ MB


Unnamed: 0,item_id,carrier_month_id,fl_date,crs_dep_time,crs_arr_time,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,...,crs_arr_hours_21,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,11433DL5,DL5,2019-05-01,510,616,66.0,1,4.0,0.321053,5.0,...,0,0,0,0,0,0,1,0,0,0
1,14893DL5,DL5,2019-05-01,857,1050,293.0,1,10.0,0.218119,5.0,...,0,0,0,0,0,0,0,0,0,0
2,11433DL5,DL5,2019-05-01,530,742,72.0,1,2.0,0.321053,5.0,...,0,0,0,0,0,0,0,1,0,0
3,11618YX5,YX5,2019-05-01,1725,1928,123.0,1,11.0,0.290223,5.0,...,0,0,0,0,0,0,0,0,0,0
4,10397YX5,YX5,2019-05-01,1335,1837,182.0,1,85.0,0.188597,5.0,...,0,0,0,0,0,0,0,0,0,0




--- 2.3064377307891846 seconds ---


In [100]:
# start time to measure the time of the program execution
start_time = time.time()

# remove variables related to dummy_features from work_df
df = df.drop(['item_id', 'carrier_month_id', 'crs_dep_time', 'crs_arr_time'], axis=1)

# check
df.info()
df.head(3)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Columns: 853 entries, fl_date to crs_arr_hours_9
dtypes: datetime64[ns](1), float64(8), int64(2), uint8(842)
memory usage: 58.2 MB


Unnamed: 0,fl_date,crs_elapsed_time,target_class,arr_delay,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,...,crs_arr_hours_21,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,2019-05-01,66.0,1,4.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,...,0,0,0,0,0,0,1,0,0,0
1,2019-05-01,293.0,1,10.0,0.218119,5.0,79895,598183733.4,2990919000.0,303232966.2,...,0,0,0,0,0,0,0,0,0,0
2,2019-05-01,72.0,1,2.0,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,...,0,0,0,0,0,0,0,1,0,0




--- 0.3230423927307129 seconds ---


--------------------------------------------

#### __Only for Training Dataset: Skip this process for Testing Dataset__

In [101]:
# start time to measure the time of the program execution
start_time = time.time()

# Move 'target_class' variable to the last column
target = df['arr_delay']
target

# remove variables related to dummy_features from work_df
df = df.drop(['arr_delay'], axis=1)

# add back the target variable to the last column'
df['target'] = target

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

0         4.0
1        10.0
2         2.0
3        11.0
4        85.0
         ... 
65078   -12.0
65079   -23.0
65080   -14.0
65081    -7.0
65082   -19.0
Name: arr_delay, Length: 65083, dtype: float64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65083 entries, 0 to 65082
Columns: 853 entries, fl_date to target
dtypes: datetime64[ns](1), float64(8), int64(2), uint8(842)
memory usage: 58.2 MB


Unnamed: 0,fl_date,crs_elapsed_time,target_class,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel,...,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9,target
0,2019-05-01,66.0,1,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0,...,0,0,0,0,0,1,0,0,0,4.0
1,2019-05-01,293.0,1,0.218119,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0,...,0,0,0,0,0,0,0,0,0,10.0
2,2019-05-01,72.0,1,0.321053,5.0,79895,598183733.4,2990919000.0,303232966.2,1516165000.0,...,0,0,0,0,0,0,1,0,0,2.0
3,2019-05-01,123.0,1,0.290223,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0,...,0,0,0,0,0,0,0,0,0,11.0
4,2019-05-01,182.0,1,0.188597,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0,...,0,0,0,0,0,0,0,0,0,85.0




--- 0.2691071033477783 seconds ---


-------------------------------------------------------

In [102]:
# export to csv
df.to_csv('training_for_regmodel_65k.csv', index=False)