### __Data Preprocessing: Testing Dataset__

In [2]:
# import libraries
import numpy as np
import pandas as pd
import random
from datetime import datetime, timezone, timedelta
import datetime as dt
import time

# to be able to see multiple ouputs from sungle cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### __Preprocessing Data Steps__
> - Step 1. Remove insignificant variables:
    - string variables: branded_code_share, mkt_carrier, tail_num, origin, origin_city_name, dest, dest_city_name, dup, flights
    - variables with high correlation: mkt_carrier_fl_num, op_carrier_fl_num, distance (highly correlated to crs_elapsed_time)
> - Step 2. Create additional features: 
    - year, month and day_of_week from 'fl_date'
    - hours from both, 'crs_dep_time' and 'crs_arr_time'
> - Step 3. Get features from passengers and fuel
    - from passengers_summary: speed, passengers_mean, and passengers_sum
    - from fuel_summary: total_cost mean and sum, and total_gallons mean and sum
> - Setp 4. Create dummy variables based on the following features:
    - year
    - month
    - day_of_week
    - mkt_unique_carrier & op_unique_carrier
    - origin_airport_id & dest_airport_id
    - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

-------------------------------------------------

#### __Import Datasets__

In [18]:
# Import training csv file
df = pd.read_csv('flights_test.csv')
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660556 entries, 0 to 660555
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   fl_date             660556 non-null  object
 1   mkt_unique_carrier  660556 non-null  object
 2   branded_code_share  660556 non-null  object
 3   mkt_carrier         660556 non-null  object
 4   mkt_carrier_fl_num  660556 non-null  int64 
 5   op_unique_carrier   660556 non-null  object
 6   tail_num            659057 non-null  object
 7   op_carrier_fl_num   660556 non-null  int64 
 8   origin_airport_id   660556 non-null  int64 
 9   origin              660556 non-null  object
 10  origin_city_name    660556 non-null  object
 11  dest_airport_id     660556 non-null  int64 
 12  dest                660556 non-null  object
 13  dest_city_name      660556 non-null  object
 14  crs_dep_time        660556 non-null  int64 
 15  crs_arr_time        660556 non-null  int64 
 16  du

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333


-----------------------------------------

#### __Import Summary Files__
> - Passenger Summary
> - Fuel Summary

In [19]:
passengers = pd.read_csv('passengers_summary.csv')
passengers.info()
passengers.head()

fuel = pd.read_csv('fuel_summary.csv')
fuel.info()
fuel.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139480 entries, 0 to 139479
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   item_id           139480 non-null  object 
 1   carrier_month_id  139480 non-null  object 
 2   dest_airport_id   139480 non-null  int64  
 3   unique_carrier    139480 non-null  object 
 4   month             139480 non-null  int64  
 5   air_time          139480 non-null  float64
 6   distance          139480 non-null  float64
 7   passengers        139480 non-null  float64
 8   speed             139480 non-null  float64
 9   mean_passengers   139480 non-null  float64
 10  sum_passengers    139480 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 11.7+ MB


Unnamed: 0,item_id,carrier_month_id,dest_airport_id,unique_carrier,month,air_time,distance,passengers,speed,mean_passengers,sum_passengers
0,100055V10,5V10,10005,5V,10,954.0,1325.0,16.0,1.388889,10.0,8580
1,100145V10,5V10,10014,5V,10,82.0,70.0,0.0,0.853659,10.0,8580
2,100645V10,5V10,10064,5V,10,30.0,224.0,0.0,7.466667,10.0,8580
3,101395V10,5V10,10139,5V,10,91.0,323.0,0.0,3.549451,10.0,8580
4,101845V10,5V10,10184,5V,10,274.0,546.0,17.0,1.992701,10.0,8580


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   carrier_month_id    756 non-null    object 
 1   carrier             756 non-null    object 
 2   month               756 non-null    int64  
 3   total_cost:mean     756 non-null    float64
 4   total_cost:sum      756 non-null    float64
 5   total_gallons:mean  756 non-null    float64
 6   total_gallons:sum   756 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 41.5+ KB


Unnamed: 0,carrier_month_id,carrier,month,total_cost:mean,total_cost:sum,total_gallons:mean,total_gallons:sum
0,09Q1,09Q,1,2878729.0,8636187.0,1018414.0,3055241.0
1,09Q10,09Q,10,2892003.0,8676010.0,1210001.0,3630002.0
2,09Q11,09Q,11,2627896.0,7883689.0,1006282.0,3018846.0
3,09Q12,09Q,12,2743162.0,8229487.0,932129.0,2796387.0
4,09Q2,09Q,2,2266957.0,9067827.0,876080.5,3504322.0


-------------------------------------------

#### __Set Aside columns for creating the submission file later__

In [6]:
df_submission_col = df[['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num', 'origin', 'dest']]

df_submission_col.info()
df_submission_col.head()

# export to csv
df_submission_col.to_csv('submission.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660556 entries, 0 to 660555
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   fl_date             660556 non-null  object
 1   mkt_carrier         660556 non-null  object
 2   mkt_carrier_fl_num  660556 non-null  int64 
 3   origin              660556 non-null  object
 4   dest                660556 non-null  object
dtypes: int64(1), object(4)
memory usage: 25.2+ MB


Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest
0,2020-01-01,WN,5888,ONT,SFO
1,2020-01-01,WN,6276,ONT,SFO
2,2020-01-01,WN,4598,ONT,SJC
3,2020-01-01,WN,4761,ONT,SJC
4,2020-01-01,WN,5162,ONT,SJC


#### Step 1. Remove insignificant variables

In [20]:
# start time to measure the time of the program execution
start_time = time.time()

# drop columns
df = df.drop(columns=['branded_code_share', 'mkt_carrier', 'tail_num', 
                      'origin', 'origin_city_name', 'dest', 'dest_city_name', 
                      'dup', 'flights', 'mkt_carrier_fl_num', 
                      'op_carrier_fl_num', 'distance'
])

# rearrange columns
rearranged_columns = ['fl_date', 'mkt_unique_carrier', 'op_unique_carrier',
                      'origin_airport_id', 'dest_airport_id', 'crs_dep_time',
                     'crs_arr_time', 'crs_elapsed_time']

df = df[rearranged_columns]

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660556 entries, 0 to 660555
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   fl_date             660556 non-null  object
 1   mkt_unique_carrier  660556 non-null  object
 2   op_unique_carrier   660556 non-null  object
 3   origin_airport_id   660556 non-null  int64 
 4   dest_airport_id     660556 non-null  int64 
 5   crs_dep_time        660556 non-null  int64 
 6   crs_arr_time        660556 non-null  int64 
 7   crs_elapsed_time    660556 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 40.3+ MB


Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time
0,2020-01-01,WN,WN,13891,14771,1810,1945,95
1,2020-01-01,WN,WN,13891,14771,1150,1320,90
2,2020-01-01,WN,WN,13891,14831,2020,2130,70
3,2020-01-01,WN,WN,13891,14831,1340,1455,75
4,2020-01-01,WN,WN,13891,14831,915,1035,80




--- 0.7337121963500977 seconds ---


#### Step 2. Create additional features

In [21]:
# start time to measure the time of the program execution
start_time = time.time()

# convert 'fl_date' to datetime ('2019-05-01')
df['fl_date'] = pd.to_datetime(df['fl_date'], format='%Y-%m-%d')

# compute year, month and weekdays from 'fl_date' in df
month_train = [x.month for x in df['fl_date']]
year_train = [x.year for x in df['fl_date']]
dayofweek_train = df['fl_date'].dt.dayofweek

# compute hours frpm 'crs_dep_time' and crs_arr_time'
hours_dep = [int(x/100) for x in df['crs_dep_time']]
hours_arr = [int(x/100) for x in df['crs_arr_time']]

# insert new columns in df
df.insert(loc=1, column='year', value=year_train)
df.insert(loc=2, column='month', value=month_train)
df.insert(loc=3, column='day_of_week', value=dayofweek_train)
df.insert(loc=10, column='crs_dep_hours', value=hours_dep)
df.insert(loc=11, column='crs_arr_hours', value=hours_arr)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660556 entries, 0 to 660555
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             660556 non-null  datetime64[ns]
 1   year                660556 non-null  int64         
 2   month               660556 non-null  int64         
 3   day_of_week         660556 non-null  int64         
 4   mkt_unique_carrier  660556 non-null  object        
 5   op_unique_carrier   660556 non-null  object        
 6   origin_airport_id   660556 non-null  int64         
 7   dest_airport_id     660556 non-null  int64         
 8   crs_dep_time        660556 non-null  int64         
 9   crs_arr_time        660556 non-null  int64         
 10  crs_dep_hours       660556 non-null  int64         
 11  crs_arr_hours       660556 non-null  int64         
 12  crs_elapsed_time    660556 non-null  int64         
dtypes: datetime64[ns](1), int64(1

Unnamed: 0,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time
0,2020-01-01,2020,1,2,WN,WN,13891,14771,1810,1945,18,19,95
1,2020-01-01,2020,1,2,WN,WN,13891,14771,1150,1320,11,13,90
2,2020-01-01,2020,1,2,WN,WN,13891,14831,2020,2130,20,21,70
3,2020-01-01,2020,1,2,WN,WN,13891,14831,1340,1455,13,14,75
4,2020-01-01,2020,1,2,WN,WN,13891,14831,915,1035,9,10,80




--- 11.208613872528076 seconds ---


#### Step 3. Get features from passengers and fuel

In [22]:
# start time to measure the time of the program execution
start_time = time.time()

# create new id variable to merge with passenger_summary table
item_id = []

for num in range(len(df)):
    item = str(df.iloc[num, 7]) + df.iloc[num, 5] + str(df.iloc[num, 2])
    item_id.append(item)

df.insert(loc=0, column='item_id', value=item_id)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660556 entries, 0 to 660555
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   item_id             660556 non-null  object        
 1   fl_date             660556 non-null  datetime64[ns]
 2   year                660556 non-null  int64         
 3   month               660556 non-null  int64         
 4   day_of_week         660556 non-null  int64         
 5   mkt_unique_carrier  660556 non-null  object        
 6   op_unique_carrier   660556 non-null  object        
 7   origin_airport_id   660556 non-null  int64         
 8   dest_airport_id     660556 non-null  int64         
 9   crs_dep_time        660556 non-null  int64         
 10  crs_arr_time        660556 non-null  int64         
 11  crs_dep_hours       660556 non-null  int64         
 12  crs_arr_hours       660556 non-null  int64         
 13  crs_elapsed_time    660556 no

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time
0,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1810,1945,18,19,95
1,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1150,1320,11,13,90
2,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,2020,2130,20,21,70
3,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,1340,1455,13,14,75
4,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,915,1035,9,10,80




--- 98.11545300483704 seconds ---


In [23]:
# start time to measure the time of the program execution
start_time = time.time()

# remove columns from passenger_summary that should not be merged to df
pass_df = passengers.drop(columns=['carrier_month_id', 'dest_airport_id',
                                   'unique_carrier', 'month', 'air_time',
                                   'distance', 'passengers'])

# inner join sum_pass_df with carrier_pass
df = df.merge(pass_df, how='left', on='item_id')
df = df.rename({'speed': 'speed_passengers'}, axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   item_id             660556 non-null  object        
 1   fl_date             660556 non-null  datetime64[ns]
 2   year                660556 non-null  int64         
 3   month               660556 non-null  int64         
 4   day_of_week         660556 non-null  int64         
 5   mkt_unique_carrier  660556 non-null  object        
 6   op_unique_carrier   660556 non-null  object        
 7   origin_airport_id   660556 non-null  int64         
 8   dest_airport_id     660556 non-null  int64         
 9   crs_dep_time        660556 non-null  int64         
 10  crs_arr_time        660556 non-null  int64         
 11  crs_dep_hours       660556 non-null  int64         
 12  crs_arr_hours       660556 non-null  int64         
 13  crs_elapsed_time    660556 no

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers
0,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1810,1945,18,19,95,0.173843,1.0,15744.0
1,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1150,1320,11,13,90,0.173843,1.0,15744.0
2,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,2020,2130,20,21,70,0.177136,1.0,15744.0
3,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,1340,1455,13,14,75,0.177136,1.0,15744.0
4,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,915,1035,9,10,80,0.177136,1.0,15744.0




--- 1.6354105472564697 seconds ---


In [24]:
# start time to measure the time of the program execution
start_time = time.time()

# for three new columns merged from passenger_summary,
#  if there are any missing values, fill missing values with min values
df['speed_passengers'] = df['speed_passengers'].fillna(df['speed_passengers'].min())
df['mean_passengers'] = df['mean_passengers'].fillna(df['mean_passengers'].min())
df['sum_passengers'] = df['sum_passengers'].fillna(df['sum_passengers'].min())

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   item_id             660556 non-null  object        
 1   fl_date             660556 non-null  datetime64[ns]
 2   year                660556 non-null  int64         
 3   month               660556 non-null  int64         
 4   day_of_week         660556 non-null  int64         
 5   mkt_unique_carrier  660556 non-null  object        
 6   op_unique_carrier   660556 non-null  object        
 7   origin_airport_id   660556 non-null  int64         
 8   dest_airport_id     660556 non-null  int64         
 9   crs_dep_time        660556 non-null  int64         
 10  crs_arr_time        660556 non-null  int64         
 11  crs_dep_hours       660556 non-null  int64         
 12  crs_arr_hours       660556 non-null  int64         
 13  crs_elapsed_time    660556 no

Unnamed: 0,item_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_dep_hours,crs_arr_hours,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers
0,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1810,1945,18,19,95,0.173843,1.0,15744.0
1,14771WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,1150,1320,11,13,90,0.173843,1.0,15744.0
2,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,2020,2130,20,21,70,0.177136,1.0,15744.0
3,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,1340,1455,13,14,75,0.177136,1.0,15744.0
4,14831WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,915,1035,9,10,80,0.177136,1.0,15744.0




--- 0.5168569087982178 seconds ---


In [25]:
# start time to measure the time of the program execution
start_time = time.time()

# create new id variable to merge with passenger_summary table
carrier_month_id = []

for num in range(len(df)):
    item = df.iloc[num, 6] + str(df.iloc[num, 3])
    carrier_month_id.append(item)

df.insert(loc=1, column='carrier_month_id', value=carrier_month_id)

# remove columns from fuel_summary that should not be merged to df
fuel_df = fuel.drop(columns=['carrier', 'month'])

# inner join sum_pass_df with carrier_pass
df = df.merge(fuel_df, how='left', on='carrier_month_id')
df = df.rename({'total_cost:mean': 'total_cost:mean_fuel',
               'total_cost:sum': 'total_cost:sum_fuel',
               'total_gallons:mean': 'total_gallons:mean_fuel',
               'total_gallons:sum': 'total_gallons:sum_fuel'}, axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   item_id                  660556 non-null  object        
 1   carrier_month_id         660556 non-null  object        
 2   fl_date                  660556 non-null  datetime64[ns]
 3   year                     660556 non-null  int64         
 4   month                    660556 non-null  int64         
 5   day_of_week              660556 non-null  int64         
 6   mkt_unique_carrier       660556 non-null  object        
 7   op_unique_carrier        660556 non-null  object        
 8   origin_airport_id        660556 non-null  int64         
 9   dest_airport_id          660556 non-null  int64         
 10  crs_dep_time             660556 non-null  int64         
 11  crs_arr_time             660556 non-null  int64         
 12  crs_dep_hours   

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,...,crs_dep_hours,crs_arr_hours,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,14771WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,...,18,19,95,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
1,14771WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,...,11,13,90,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
2,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,20,21,70,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
3,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,13,14,75,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
4,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,9,10,80,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0




--- 77.14658951759338 seconds ---


In [26]:
# for four new columns merged from fuel_summary,
#  if there are any missing values, fill missing values with 0
df['total_cost:mean_fuel'] = df['total_cost:mean_fuel'].fillna(df['total_cost:mean_fuel'].min())
df['total_cost:sum_fuel'] = df['total_cost:sum_fuel'].fillna(df['total_cost:sum_fuel'].min())
df['total_gallons:mean_fuel'] = df['total_gallons:mean_fuel'].fillna(df['total_gallons:mean_fuel'].min())
df['total_gallons:sum_fuel'] = df['total_gallons:sum_fuel'].fillna(df['total_gallons:sum_fuel'].min())

# check
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   item_id                  660556 non-null  object        
 1   carrier_month_id         660556 non-null  object        
 2   fl_date                  660556 non-null  datetime64[ns]
 3   year                     660556 non-null  int64         
 4   month                    660556 non-null  int64         
 5   day_of_week              660556 non-null  int64         
 6   mkt_unique_carrier       660556 non-null  object        
 7   op_unique_carrier        660556 non-null  object        
 8   origin_airport_id        660556 non-null  int64         
 9   dest_airport_id          660556 non-null  int64         
 10  crs_dep_time             660556 non-null  int64         
 11  crs_arr_time             660556 non-null  int64         
 12  crs_dep_hours   

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,day_of_week,mkt_unique_carrier,op_unique_carrier,origin_airport_id,dest_airport_id,...,crs_dep_hours,crs_arr_hours,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,14771WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,...,18,19,95,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
1,14771WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14771,...,11,13,90,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
2,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,20,21,70,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
3,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,13,14,75,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0
4,14831WN1,WN1,2020-01-01,2020,1,2,WN,WN,13891,14831,...,9,10,80,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0


#### Setp 4. Create dummy variables based on the following features
> - year
> - month
> - day_of_week
> - mkt_unique_carrier & op_unique_carrier
> - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

In [27]:
# start time to measure the time of the program execution
start_time = time.time()

# convert integer to string values for converting them to dummy variables
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)
df['day_of_week'] = df['day_of_week'].astype(str)
df['crs_dep_hours'] = df['crs_dep_hours'].astype(str)
df['crs_arr_hours'] = df['crs_arr_hours'].astype(str)
df['origin_airport_id'] = df['origin_airport_id'].astype(str)
df['dest_airport_id'] = df['dest_airport_id'].astype(str)

# create dummy variables
dummy_features = ['year', 'month', 'day_of_week', 'mkt_unique_carrier',
                 'origin_airport_id', 'dest_airport_id', 
                 'op_unique_carrier', 'crs_dep_hours', 'crs_arr_hours',
                 ]

df_dummy = pd.get_dummies(df[dummy_features])
df_dummy

# remove variables related to dummy_features from work_df
df = df.drop(dummy_features, axis=1)

# combine dummy variables with work_df
df = pd.concat([df, df_dummy], axis=1)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,year_2020,month_1,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,mkt_unique_carrier_AA,...,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_24,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660551,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660552,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660553,1,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
660554,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Columns: 836 entries, item_id to crs_arr_hours_9
dtypes: datetime64[ns](1), float64(7), int64(3), object(2), uint8(823)
memory usage: 589.0+ MB


Unnamed: 0,item_id,carrier_month_id,fl_date,crs_dep_time,crs_arr_time,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,...,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_24,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,14771WN1,WN1,2020-01-01,1810,1945,95,0.173843,1.0,15744.0,296682409.0,...,0,0,0,0,0,0,0,0,0,0
1,14771WN1,WN1,2020-01-01,1150,1320,90,0.173843,1.0,15744.0,296682409.0,...,0,0,0,0,0,0,0,0,0,0
2,14831WN1,WN1,2020-01-01,2020,2130,70,0.177136,1.0,15744.0,296682409.0,...,0,0,0,0,0,0,0,0,0,0
3,14831WN1,WN1,2020-01-01,1340,1455,75,0.177136,1.0,15744.0,296682409.0,...,0,0,0,0,0,0,0,0,0,0
4,14831WN1,WN1,2020-01-01,915,1035,80,0.177136,1.0,15744.0,296682409.0,...,0,0,0,0,0,0,0,0,0,0




--- 21.340233087539673 seconds ---


In [28]:
# start time to measure the time of the program execution
start_time = time.time()

# remove variables related to dummy_features from work_df
df = df.drop(['fl_date', 'item_id', 'carrier_month_id', 'crs_dep_time', 'crs_arr_time'], axis=1)

# check
df.info()
df.head(3)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660556 entries, 0 to 660555
Columns: 831 entries, crs_elapsed_time to crs_arr_hours_9
dtypes: float64(7), int64(1), uint8(823)
memory usage: 563.8 MB


Unnamed: 0,crs_elapsed_time,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel,year_2020,month_1,...,crs_arr_hours_22,crs_arr_hours_23,crs_arr_hours_24,crs_arr_hours_3,crs_arr_hours_4,crs_arr_hours_5,crs_arr_hours_6,crs_arr_hours_7,crs_arr_hours_8,crs_arr_hours_9
0,95,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,90,0.173843,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,70,0.177136,1.0,15744.0,296682409.0,1483412000.0,157934010.0,789670050.0,1,1,...,0,0,0,0,0,0,0,0,0,0




--- 5.713189363479614 seconds ---


--------------------------------------------

-------------------------------------------------------

In [29]:
# export to csv
df.to_csv('testing_data.csv', index=False)