### __Data Preprocessing: Regression Model__

In [21]:
# import libraries
import numpy as np
import pandas as pd
import random
from datetime import datetime, timezone, timedelta
import datetime as dt
import time

# import matplotlib
from matplotlib import pyplot
import matplotlib.pyplot as plt

# show the output of every top-level expression in a cell (not only the last
# one), so that a single cell can display multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### __Preprocessing Data Steps__
> - Step 1. Remove insignificant variables:
    - string variables: branded_code_share, mkt_carrier, tail_num, origin, origin_city_name, dest, dest_city_name, dup, flights
    - variables with high correlation: mkt_carrier_fl_num, op_carrier_fl_num, distance (highly correlated to crs_elapsed_time)
> - Step 2. Create additional features: 
    - year, month and day_of_week from 'fl_date'
    - hours from both, 'crs_dep_time' and 'crs_arr_time'
> - Step 3. Get features from passengers and fuel
    - from passengers_summary: speed, passengers_mean, and passengers_sum
    - from fuel_summary: total_cost mean and sum, and total_gallons mean and sum
> - Step 4. Create dummy variables based on the following features:
    - year
    - month
    - day_of_week
    - mkt_unique_carrier & op_unique_carrier
    - origin_airport_id & dest_airport_id
    - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

-------------------------------------------------

#### __Import Datasets__

In [9]:
# Read the reduced training sample into `df`, then show its schema and the
# first three rows as a quick sanity check
df = pd.read_csv(filepath_or_buffer='reduced_train_df_16kv2.csv')
df.info()
df.head(n=3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fl_date             16269 non-null  object 
 1   mkt_unique_carrier  16269 non-null  object 
 2   branded_code_share  16269 non-null  object 
 3   mkt_carrier         16269 non-null  object 
 4   mkt_carrier_fl_num  16269 non-null  int64  
 5   op_unique_carrier   16269 non-null  object 
 6   tail_num            16269 non-null  object 
 7   op_carrier_fl_num   16269 non-null  int64  
 8   origin_airport_id   16269 non-null  int64  
 9   origin              16269 non-null  object 
 10  origin_city_name    16269 non-null  object 
 11  dest_airport_id     16269 non-null  int64  
 12  dest                16269 non-null  object 
 13  dest_city_name      16269 non-null  object 
 14  crs_dep_time        16269 non-null  int64  
 15  crs_arr_time        16269 non-null  int64  
 16  dup 

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay,target_class
0,2019-05-01,UA,UA,UA,3720,YX,N736YX,3720,11292,DEN,...,ATL,"Atlanta, GA",1335,1837,N,182.0,1.0,1199.0,85.0,1
1,2019-05-01,UA,UA,UA,230,UA,N66837,230,13930,ORD,...,EWR,"Newark, NJ",1415,1729,N,134.0,1.0,719.0,10.0,1
2,2019-05-01,UA,UA,UA,4598,G7,N169GJ,4598,13930,ORD,...,MEM,"Memphis, TN",1325,1521,N,116.0,1.0,491.0,27.0,1


-----------------------------------------

#### __Import Summary Files__
> - Passenger Summary
> - Fuel Summary

In [3]:
# Passenger summary: loaded into `passengers`, then schema and first rows shown
passengers = pd.read_csv(filepath_or_buffer='passengers_summary.csv')
passengers.info()
passengers.head(n=5)

# Fuel summary: loaded into `fuel`, then schema and first rows shown
fuel = pd.read_csv(filepath_or_buffer='fuel_summary.csv')
fuel.info()
fuel.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139480 entries, 0 to 139479
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   item_id           139480 non-null  object 
 1   carrier_month_id  139480 non-null  object 
 2   dest_airport_id   139480 non-null  int64  
 3   unique_carrier    139480 non-null  object 
 4   month             139480 non-null  int64  
 5   air_time          139480 non-null  float64
 6   distance          139480 non-null  float64
 7   passengers        139480 non-null  float64
 8   speed             139480 non-null  float64
 9   mean_passengers   139480 non-null  float64
 10  sum_passengers    139480 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 11.7+ MB


Unnamed: 0,item_id,carrier_month_id,dest_airport_id,unique_carrier,month,air_time,distance,passengers,speed,mean_passengers,sum_passengers
0,100055V10,5V10,10005,5V,10,954.0,1325.0,16.0,1.388889,10.0,8580
1,100145V10,5V10,10014,5V,10,82.0,70.0,0.0,0.853659,10.0,8580
2,100645V10,5V10,10064,5V,10,30.0,224.0,0.0,7.466667,10.0,8580
3,101395V10,5V10,10139,5V,10,91.0,323.0,0.0,3.549451,10.0,8580
4,101845V10,5V10,10184,5V,10,274.0,546.0,17.0,1.992701,10.0,8580


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   carrier_month_id    756 non-null    object 
 1   carrier             756 non-null    object 
 2   month               756 non-null    int64  
 3   total_cost:mean     756 non-null    float64
 4   total_cost:sum      756 non-null    float64
 5   total_gallons:mean  756 non-null    float64
 6   total_gallons:sum   756 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 41.5+ KB


Unnamed: 0,carrier_month_id,carrier,month,total_cost:mean,total_cost:sum,total_gallons:mean,total_gallons:sum
0,09Q1,09Q,1,2878729.0,8636187.0,1018414.0,3055241.0
1,09Q10,09Q,10,2892003.0,8676010.0,1210001.0,3630002.0
2,09Q11,09Q,11,2627896.0,7883689.0,1006282.0,3018846.0
3,09Q12,09Q,12,2743162.0,8229487.0,932129.0,2796387.0
4,09Q2,09Q,2,2266957.0,9067827.0,876080.5,3504322.0


-------------------------------------------

#### __Set Aside columns for creating the submission file later__

In [None]:
# Keep a copy of the identifying columns needed for the submission file.
# This has to run before the "Step 1" cell, which drops 'mkt_carrier' from df.
submission_columns = ['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num', 'origin', 'dest']
df_submission_col = df[submission_columns]

df_submission_col.info()
df_submission_col.head()

# export to csv (fixed: the method is `to_csv`; `tp_csv` raised AttributeError)
df_submission_col.to_csv('df_submission_columns.csv', index=False)

#### Step 1. Remove insignificant variables

In [10]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# discard the columns that are not used as model inputs
df = df.drop(labels=['branded_code_share', 'mkt_carrier', 'tail_num',
                     'origin_city_name', 'dest_city_name', 'dup',
                     'origin_airport_id',
                     'op_carrier_fl_num', 'target_class'],
             axis=1)

# final column order for the working frame
# NOTE(review): 'mkt_carrier_fl_num' and 'distance' are kept here, although the
# Step 1 notes list them for removal and the saved output of this cell shows
# only 11 columns - confirm which version is intended.
rearranged_columns = ['fl_date', 'dest_airport_id', 'mkt_unique_carrier', 'op_unique_carrier',
                      'mkt_carrier_fl_num', 'origin', 'dest', 'crs_dep_time', 'crs_arr_time',
                      'crs_elapsed_time', 'flights', 'distance', 'arr_delay']
df = df.loc[:, rearranged_columns]

# inspect the result
df.info()
df.head()

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fl_date             16269 non-null  object 
 1   dest_airport_id     16269 non-null  int64  
 2   mkt_unique_carrier  16269 non-null  object 
 3   op_unique_carrier   16269 non-null  object 
 4   origin              16269 non-null  object 
 5   dest                16269 non-null  object 
 6   crs_dep_time        16269 non-null  int64  
 7   crs_arr_time        16269 non-null  int64  
 8   crs_elapsed_time    16269 non-null  float64
 9   flights             16269 non-null  float64
 10  arr_delay           16269 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 1.4+ MB


Unnamed: 0,fl_date,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay
0,2019-05-01,10397,UA,YX,DEN,ATL,1335,1837,182.0,1.0,85.0
1,2019-05-01,11618,UA,UA,ORD,EWR,1415,1729,134.0,1.0,10.0
2,2019-05-01,13244,UA,G7,ORD,MEM,1325,1521,116.0,1.0,27.0
3,2019-05-01,11292,AA,AA,DFW,DEN,1627,1732,125.0,1.0,10.0
4,2019-05-01,14771,F9,F9,DEN,SFO,2210,2352,162.0,1.0,26.0




--- 0.043433427810668945 seconds ---


#### Step 2. Create additional features

In [11]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# parse 'fl_date' strings such as '2019-05-01' into datetime values
df['fl_date'] = pd.to_datetime(df['fl_date'], format='%Y-%m-%d')

# extract calendar month and year from 'fl_date'
# (only these two are derived in this cell; no weekday or hour feature is built)
fl_dates = df['fl_date']
month_train = fl_dates.dt.month.tolist()
year_train = fl_dates.dt.year.tolist()

# place the new columns directly after 'fl_date'
df.insert(loc=1, column='year', value=year_train)
df.insert(loc=2, column='month', value=month_train)

# inspect the result
df.info()
df.head()

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fl_date             16269 non-null  datetime64[ns]
 1   year                16269 non-null  int64         
 2   month               16269 non-null  int64         
 3   dest_airport_id     16269 non-null  int64         
 4   mkt_unique_carrier  16269 non-null  object        
 5   op_unique_carrier   16269 non-null  object        
 6   origin              16269 non-null  object        
 7   dest                16269 non-null  object        
 8   crs_dep_time        16269 non-null  int64         
 9   crs_arr_time        16269 non-null  int64         
 10  crs_elapsed_time    16269 non-null  float64       
 11  flights             16269 non-null  float64       
 12  arr_delay           16269 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(5), ob

Unnamed: 0,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay
0,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,1335,1837,182.0,1.0,85.0
1,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,1415,1729,134.0,1.0,10.0
2,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,1325,1521,116.0,1.0,27.0
3,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,1627,1732,125.0,1.0,10.0
4,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,2210,2352,162.0,1.0,26.0




--- 0.10860896110534668 seconds ---


#### Step 3. Get features from passengers and fuel

In [12]:
# start time to measure the time of the program execution
start_time = time.time()

# Build the key used to join with the passenger summary table:
#   <dest_airport_id><op_unique_carrier><month>, e.g. '10397YX5'
# The columns are addressed by name and concatenated in one vectorised
# operation. The previous version looped over every row with positional
# df.iloc[num, k] lookups, which was slow and would silently build wrong keys
# if the column order ever changed.
item_id = (
    df['dest_airport_id'].astype(str)
    + df['op_unique_carrier']
    + df['month'].astype(str)
).tolist()

# put the key in the first column
df.insert(loc=0, column='item_id', value=item_id)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             16269 non-null  object        
 1   fl_date             16269 non-null  datetime64[ns]
 2   year                16269 non-null  int64         
 3   month               16269 non-null  int64         
 4   dest_airport_id     16269 non-null  int64         
 5   mkt_unique_carrier  16269 non-null  object        
 6   op_unique_carrier   16269 non-null  object        
 7   origin              16269 non-null  object        
 8   dest                16269 non-null  object        
 9   crs_dep_time        16269 non-null  int64         
 10  crs_arr_time        16269 non-null  int64         
 11  crs_elapsed_time    16269 non-null  float64       
 12  flights             16269 non-null  float64       
 13  arr_delay           16269 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay
0,10397YX5,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,1335,1837,182.0,1.0,85.0
1,11618UA5,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,1415,1729,134.0,1.0,10.0
2,13244G75,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,1325,1521,116.0,1.0,27.0
3,11292AA5,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,1627,1732,125.0,1.0,10.0
4,14771F95,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,2210,2352,162.0,1.0,26.0




--- 2.8409507274627686 seconds ---


In [13]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# strip the passenger summary of the columns that must not be merged into df
pass_df = passengers.drop(labels=['carrier_month_id', 'dest_airport_id',
                                  'unique_carrier', 'month', 'air_time',
                                  'distance', 'passengers'],
                          axis=1)

# left join on 'item_id' (rows of df without a match are kept and get NaN),
# then rename 'speed' so its origin is clear
df = (
    df.merge(pass_df, how='left', on='item_id')
      .rename(columns={'speed': 'speed_passengers'})
)

# inspect the result
df.info()
df.head()

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             16269 non-null  object        
 1   fl_date             16269 non-null  datetime64[ns]
 2   year                16269 non-null  int64         
 3   month               16269 non-null  int64         
 4   dest_airport_id     16269 non-null  int64         
 5   mkt_unique_carrier  16269 non-null  object        
 6   op_unique_carrier   16269 non-null  object        
 7   origin              16269 non-null  object        
 8   dest                16269 non-null  object        
 9   crs_dep_time        16269 non-null  int64         
 10  crs_arr_time        16269 non-null  int64         
 11  crs_elapsed_time    16269 non-null  float64       
 12  flights             16269 non-null  float64       
 13  arr_delay           16269 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay,speed_passengers,mean_passengers,sum_passengers
0,10397YX5,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,1335,1837,182.0,1.0,85.0,0.188597,5.0,22455
1,11618UA5,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,1415,1729,134.0,1.0,10.0,0.294716,5.0,75760
2,13244G75,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,1325,1521,116.0,1.0,27.0,0.217448,5.0,7595
3,11292AA5,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,1627,1732,125.0,1.0,10.0,0.158707,5.0,54465
4,14771F95,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,2210,2352,162.0,1.0,26.0,0.401629,5.0,17460




--- 0.20148015022277832 seconds ---


In [14]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# the three columns merged from the passenger summary may contain missing
# values; each gap is filled with the minimum of its own column
for passenger_col in ('speed_passengers', 'mean_passengers', 'sum_passengers'):
    column_min = df[passenger_col].min()
    df[passenger_col] = df[passenger_col].fillna(column_min)

# inspect the result
df.info()
df.head()

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   item_id             16269 non-null  object        
 1   fl_date             16269 non-null  datetime64[ns]
 2   year                16269 non-null  int64         
 3   month               16269 non-null  int64         
 4   dest_airport_id     16269 non-null  int64         
 5   mkt_unique_carrier  16269 non-null  object        
 6   op_unique_carrier   16269 non-null  object        
 7   origin              16269 non-null  object        
 8   dest                16269 non-null  object        
 9   crs_dep_time        16269 non-null  int64         
 10  crs_arr_time        16269 non-null  int64         
 11  crs_elapsed_time    16269 non-null  float64       
 12  flights             16269 non-null  float64       
 13  arr_delay           16269 non-null  float64   

Unnamed: 0,item_id,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay,speed_passengers,mean_passengers,sum_passengers
0,10397YX5,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,1335,1837,182.0,1.0,85.0,0.188597,5.0,22455
1,11618UA5,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,1415,1729,134.0,1.0,10.0,0.294716,5.0,75760
2,13244G75,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,1325,1521,116.0,1.0,27.0,0.217448,5.0,7595
3,11292AA5,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,1627,1732,125.0,1.0,10.0,0.158707,5.0,54465
4,14771F95,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,2210,2352,162.0,1.0,26.0,0.401629,5.0,17460




--- 0.06107497215270996 seconds ---


In [15]:
# start time to measure the time of the program execution
start_time = time.time()

# Build the key used to join with the fuel summary table:
#   <op_unique_carrier><month>, e.g. 'YX5'
# The columns are addressed by name and concatenated in one vectorised
# operation. The previous version looped over every row with positional
# df.iloc[num, k] lookups, which was slow and would silently build wrong keys
# if the column order ever changed.
carrier_month_id = (df['op_unique_carrier'] + df['month'].astype(str)).tolist()

# put the key right after 'item_id'
df.insert(loc=1, column='carrier_month_id', value=carrier_month_id)

# remove columns from fuel_summary that should not be merged to df
fuel_df = fuel.drop(columns=['carrier', 'month'])

# left join df with the fuel features (rows of df without a match get NaN),
# then tag the merged columns with a '_fuel' suffix
df = df.merge(fuel_df, how='left', on='carrier_month_id')
df = df.rename(columns={'total_cost:mean': 'total_cost:mean_fuel',
                        'total_cost:sum': 'total_cost:sum_fuel',
                        'total_gallons:mean': 'total_gallons:mean_fuel',
                        'total_gallons:sum': 'total_gallons:sum_fuel'})

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   item_id                  16269 non-null  object        
 1   carrier_month_id         16269 non-null  object        
 2   fl_date                  16269 non-null  datetime64[ns]
 3   year                     16269 non-null  int64         
 4   month                    16269 non-null  int64         
 5   dest_airport_id          16269 non-null  int64         
 6   mkt_unique_carrier       16269 non-null  object        
 7   op_unique_carrier        16269 non-null  object        
 8   origin                   16269 non-null  object        
 9   dest                     16269 non-null  object        
 10  crs_dep_time             16269 non-null  int64         
 11  crs_arr_time             16269 non-null  int64         
 12  crs_elapsed_time         16269 n

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,...,crs_elapsed_time,flights,arr_delay,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,10397YX5,YX5,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,...,182.0,1.0,85.0,0.188597,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0
1,11618UA5,UA5,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,...,134.0,1.0,10.0,0.294716,5.0,75760,532297103.2,2661486000.0,291973202.2,1459866000.0
2,13244G75,G75,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,...,116.0,1.0,27.0,0.217448,5.0,7595,0.0,0.0,0.0,0.0
3,11292AA5,AA5,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,...,125.0,1.0,10.0,0.158707,5.0,54465,527459997.0,2637300000.0,296278877.4,1481394000.0
4,14771F95,F95,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,...,162.0,1.0,26.0,0.401629,5.0,17460,36462558.6,182312800.0,20203098.2,101015500.0




--- 1.8598172664642334 seconds ---


In [16]:
# the four columns merged from the fuel summary may contain missing values;
# each gap is filled with the minimum of its own column
fuel_feature_cols = ('total_cost:mean_fuel', 'total_cost:sum_fuel',
                     'total_gallons:mean_fuel', 'total_gallons:sum_fuel')
for fuel_col in fuel_feature_cols:
    column_min = df[fuel_col].min()
    df[fuel_col] = df[fuel_col].fillna(column_min)

# inspect the result
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   item_id                  16269 non-null  object        
 1   carrier_month_id         16269 non-null  object        
 2   fl_date                  16269 non-null  datetime64[ns]
 3   year                     16269 non-null  int64         
 4   month                    16269 non-null  int64         
 5   dest_airport_id          16269 non-null  int64         
 6   mkt_unique_carrier       16269 non-null  object        
 7   op_unique_carrier        16269 non-null  object        
 8   origin                   16269 non-null  object        
 9   dest                     16269 non-null  object        
 10  crs_dep_time             16269 non-null  int64         
 11  crs_arr_time             16269 non-null  int64         
 12  crs_elapsed_time         16269 n

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,dest_airport_id,mkt_unique_carrier,op_unique_carrier,origin,dest,...,crs_elapsed_time,flights,arr_delay,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,total_cost:sum_fuel,total_gallons:mean_fuel,total_gallons:sum_fuel
0,10397YX5,YX5,2019-05-01,2019,5,10397,UA,YX,DEN,ATL,...,182.0,1.0,85.0,0.188597,5.0,22455,8618120.0,43090600.0,5318166.6,26590830.0
1,11618UA5,UA5,2019-05-01,2019,5,11618,UA,UA,ORD,EWR,...,134.0,1.0,10.0,0.294716,5.0,75760,532297103.2,2661486000.0,291973202.2,1459866000.0
2,13244G75,G75,2019-05-01,2019,5,13244,UA,G7,ORD,MEM,...,116.0,1.0,27.0,0.217448,5.0,7595,0.0,0.0,0.0,0.0
3,11292AA5,AA5,2019-05-01,2019,5,11292,AA,AA,DFW,DEN,...,125.0,1.0,10.0,0.158707,5.0,54465,527459997.0,2637300000.0,296278877.4,1481394000.0
4,14771F95,F95,2019-05-01,2019,5,14771,F9,F9,DEN,SFO,...,162.0,1.0,26.0,0.401629,5.0,17460,36462558.6,182312800.0,20203098.2,101015500.0


#### Step 4. Create dummy variables based on the following features
> - year
> - month
> - day_of_week
> - mkt_unique_carrier & op_unique_carrier
> - two hours variables (from 'crs_dep_time' and 'crs_arr_time')

In [17]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# categorical carrier columns to be turned into indicator columns
dummy_features = ['mkt_unique_carrier', 'op_unique_carrier']

# one indicator column per distinct value of each carrier column
df_dummy = pd.get_dummies(df[dummy_features])
df_dummy

# swap the original carrier columns for their indicator columns
df = pd.concat([df.drop(columns=dummy_features), df_dummy], axis=1)

# inspect the result
df.info()
df.head()

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,mkt_unique_carrier_AA,mkt_unique_carrier_AS,mkt_unique_carrier_B6,mkt_unique_carrier_DL,mkt_unique_carrier_F9,mkt_unique_carrier_G4,mkt_unique_carrier_HA,mkt_unique_carrier_NK,mkt_unique_carrier_UA,mkt_unique_carrier_VX,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16264,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16265,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
16266,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   item_id                  16269 non-null  object        
 1   carrier_month_id         16269 non-null  object        
 2   fl_date                  16269 non-null  datetime64[ns]
 3   year                     16269 non-null  int64         
 4   month                    16269 non-null  int64         
 5   dest_airport_id          16269 non-null  int64         
 6   origin                   16269 non-null  object        
 7   dest                     16269 non-null  object        
 8   crs_dep_time             16269 non-null  int64         
 9   crs_arr_time             16269 non-null  int64         
 10  crs_elapsed_time         16269 non-null  float64       
 11  flights                  16269 non-null  float64       
 12  arr_delay                16269 n

Unnamed: 0,item_id,carrier_month_id,fl_date,year,month,dest_airport_id,origin,dest,crs_dep_time,crs_arr_time,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,10397YX5,YX5,2019-05-01,2019,5,10397,DEN,ATL,1335,1837,...,0,0,0,0,0,0,0,0,1,0
1,11618UA5,UA5,2019-05-01,2019,5,11618,ORD,EWR,1415,1729,...,0,0,0,0,1,0,0,0,0,0
2,13244G75,G75,2019-05-01,2019,5,13244,ORD,MEM,1325,1521,...,0,0,0,0,0,0,0,0,0,0
3,11292AA5,AA5,2019-05-01,2019,5,11292,DFW,DEN,1627,1732,...,0,0,0,0,0,0,0,0,0,0
4,14771F95,F95,2019-05-01,2019,5,14771,DEN,SFO,2210,2352,...,0,0,0,0,0,0,0,0,0,0




--- 0.07867193222045898 seconds ---


In [18]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# the two helper key columns were only needed for the merges; remove them
df = df.drop(columns=['item_id', 'carrier_month_id'])

# inspect the result
df.info()
df.head(3)

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 57 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   fl_date                  16269 non-null  datetime64[ns]
 1   year                     16269 non-null  int64         
 2   month                    16269 non-null  int64         
 3   dest_airport_id          16269 non-null  int64         
 4   origin                   16269 non-null  object        
 5   dest                     16269 non-null  object        
 6   crs_dep_time             16269 non-null  int64         
 7   crs_arr_time             16269 non-null  int64         
 8   crs_elapsed_time         16269 non-null  float64       
 9   flights                  16269 non-null  float64       
 10  arr_delay                16269 non-null  float64       
 11  speed_passengers         16269 non-null  float64       
 12  mean_passengers          16269 n

Unnamed: 0,fl_date,year,month,dest_airport_id,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,2019-05-01,2019,5,10397,DEN,ATL,1335,1837,182.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,2019-05-01,2019,5,11618,ORD,EWR,1415,1729,134.0,1.0,...,0,0,0,0,1,0,0,0,0,0
2,2019-05-01,2019,5,13244,ORD,MEM,1325,1521,116.0,1.0,...,0,0,0,0,0,0,0,0,0,0




--- 0.05816936492919922 seconds ---


#### __Create Ordinal Variables based on Flights__
> - origin
> - dest

#### __origin__

In [26]:
# start time to measure the time of the program execution
start_time = time.time()

# summary table: total 'flights' per origin airport.
# aggfunc is given by name ('sum'); passing the np.sum callable is deprecated
# in recent pandas releases and yields the same result.
origin_table = pd.pivot_table(df, values='flights', index=['origin'],
                              aggfunc='sum')

# turn the 'origin' index back into a regular column
origin_table = origin_table.reset_index(drop=False)
origin_table

# smallest and largest flight count over all origins
min_1 = origin_table['flights'].min()
max_1 = origin_table['flights'].max()

print(f"Max Value: {max_1} | Min Value: {min_1}")

# the encoded value ('class') of an origin is simply its integer flight count
# (the earlier, commented-out five-bin grouping and its unused helper list
# were removed)
origin_table['class'] = origin_table['flights'].astype(int)

# check
origin_table
origin_table['class'].nunique()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,origin,flights
0,ABE,9.0
1,ABI,4.0
2,ABQ,62.0
3,ABR,3.0
4,ABY,4.0
...,...,...
333,WRG,2.0
334,XNA,22.0
335,YAK,1.0
336,YKM,4.0


Max Value: 843.0 | Min Value: 1.0


Unnamed: 0,origin,flights,class
0,ABE,9.0,9
1,ABI,4.0,4
2,ABQ,62.0,62
3,ABR,3.0,3
4,ABY,4.0,4
...,...,...,...
333,WRG,2.0,2
334,XNA,22.0,22
335,YAK,1.0,1
336,YKM,4.0,4


94



--- 0.17020773887634277 seconds ---


In [27]:
# record the start time so the cell's runtime can be reported at the end
start_time = time.time()

# keep only the origin code and its class; these become the dictionary's
# keys and values
origin_df = origin_table.drop(columns=['flights'])
origin_df

# dictionary: first column (origin code) -> second column (class)
dict1 = dict(zip(origin_df.iloc[:, 0].to_numpy(),
                 origin_df.iloc[:, 1].to_numpy()))

# show the dictionary
dict1

# replace every origin code in df by its class value
df = df.replace({"origin" : dict1})

# inspect the result
df.info()
df.head(3)

# report how long the cell took
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,origin,class
0,ABE,9
1,ABI,4
2,ABQ,62
3,ABR,3
4,ABY,4
...,...,...
333,WRG,2
334,XNA,22
335,YAK,1
336,YKM,4


{'ABE': 9,
 'ABI': 4,
 'ABQ': 62,
 'ABR': 3,
 'ABY': 4,
 'ACK': 3,
 'ACT': 4,
 'ACV': 3,
 'ACY': 1,
 'ADK': 1,
 'AEX': 10,
 'AGS': 11,
 'ALB': 30,
 'ALO': 1,
 'ALW': 2,
 'AMA': 7,
 'ANC': 35,
 'APN': 4,
 'ASE': 11,
 'ATL': 759,
 'ATW': 19,
 'ATY': 1,
 'AUS': 133,
 'AVL': 19,
 'AVP': 11,
 'AZA': 15,
 'AZO': 6,
 'BDL': 66,
 'BFF': 3,
 'BFL': 8,
 'BFM': 1,
 'BGM': 2,
 'BGR': 11,
 'BHM': 33,
 'BIL': 4,
 'BIS': 9,
 'BLI': 5,
 'BLV': 2,
 'BMI': 9,
 'BNA': 144,
 'BOI': 38,
 'BOS': 306,
 'BPT': 3,
 'BQK': 2,
 'BQN': 4,
 'BRO': 5,
 'BTM': 1,
 'BTR': 11,
 'BTV': 16,
 'BUF': 57,
 'BUR': 61,
 'BWI': 214,
 'BZN': 17,
 'CAE': 20,
 'CAK': 20,
 'CDC': 2,
 'CDV': 3,
 'CGI': 3,
 'CHA': 19,
 'CHO': 10,
 'CHS': 38,
 'CID': 27,
 'CIU': 1,
 'CKB': 1,
 'CLE': 108,
 'CLL': 6,
 'CLT': 555,
 'CMH': 108,
 'CMI': 3,
 'CMX': 3,
 'CNY': 2,
 'COD': 1,
 'COS': 24,
 'COU': 5,
 'CPR': 11,
 'CRP': 7,
 'CRW': 17,
 'CSG': 3,
 'CVG': 94,
 'CWA': 9,
 'CYS': 1,
 'DAB': 5,
 'DAL': 164,
 'DAY': 29,
 'DBQ': 3,
 'DCA': 282,
 'DE

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 57 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   fl_date                  16269 non-null  datetime64[ns]
 1   year                     16269 non-null  int64         
 2   month                    16269 non-null  int64         
 3   dest_airport_id          16269 non-null  int64         
 4   origin                   16269 non-null  int32         
 5   dest                     16269 non-null  object        
 6   crs_dep_time             16269 non-null  int64         
 7   crs_arr_time             16269 non-null  int64         
 8   crs_elapsed_time         16269 non-null  float64       
 9   flights                  16269 non-null  float64       
 10  arr_delay                16269 non-null  float64       
 11  speed_passengers         16269 non-null  float64       
 12  mean_passengers          16269 n

Unnamed: 0,fl_date,year,month,dest_airport_id,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,2019-05-01,2019,5,10397,607,ATL,1335,1837,182.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,2019-05-01,2019,5,11618,843,EWR,1415,1729,134.0,1.0,...,0,0,0,0,1,0,0,0,0,0
2,2019-05-01,2019,5,13244,843,MEM,1325,1521,116.0,1.0,...,0,0,0,0,0,0,0,0,0,0




--- 0.5044844150543213 seconds ---


In [28]:
# distinct encoded values now present in the 'origin' column
pd.unique(df['origin'])

array([607, 843, 629, 390, 312,  38,  33, 317, 320, 311, 759,  10, 194,
       164, 187, 343,  58, 372, 214, 282, 516,  59,  57, 366, 133, 306,
       555, 126, 105,  46,  19,  53, 148,  96, 359, 200, 108,  27, 167,
       363, 227,  40,  50,   1,  62,   7,  91, 144,   9,  90,  66,   2,
         3,  17, 155,  43,  28,   6,   5,  94,  55, 256, 195,  18, 237,
        24,  37,  16,  13,  22, 171, 114,   4,  29,  93, 129, 119,  51,
        73,   8,  14,  23,  21,  49,  30,  35,  12,  61,  11,  85,  69,
        15,  36,  20])

#### __dest__

In [29]:
# start time to measure the time of the program execution
start_time = time.time()

# summary table: total 'flights' per destination airport.
# aggfunc is given by name ('sum'); passing the np.sum callable is deprecated
# in recent pandas releases and yields the same result.
dest_table = pd.pivot_table(df, values='flights', index=['dest'],
                            aggfunc='sum')

# turn the 'dest' index back into a regular column
dest_table = dest_table.reset_index(drop=False)
dest_table

# smallest and largest flight count over all destinations
min_3 = dest_table['flights'].min()
max_3 = dest_table['flights'].max()

print(f"Max Value: {max_3} | Min Value: {min_3}")

# the encoded value ('class') of a destination is simply its integer flight
# count (the earlier, commented-out five-bin grouping was removed)
dest_table['class'] = dest_table['flights'].astype(int)

# check
dest_table
dest_table['class'].nunique()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,dest,flights
0,ABE,13.0
1,ABI,1.0
2,ABQ,46.0
3,ABR,2.0
4,ACK,1.0
...,...,...
335,WYS,2.0
336,XNA,39.0
337,XWA,1.0
338,YAK,2.0


Max Value: 806.0 | Min Value: 1.0


Unnamed: 0,dest,flights,class
0,ABE,13.0,13
1,ABI,1.0,1
2,ABQ,46.0,46
3,ABR,2.0,2
4,ACK,1.0,1
...,...,...,...
335,WYS,2.0,2
336,XNA,39.0,39
337,XWA,1.0,1
338,YAK,2.0,2


99



--- 0.04003024101257324 seconds ---


In [30]:
# start time to measure the time of the program execution
start_time = time.time()

# keep only the 'dest' and 'class' columns; this two-column table is the
# source of the lookup dictionary built below
dest_df = dest_table.drop(columns=['flights'])
dest_df

# lookup dictionary: key = value of the first column ('dest'),
# value = value of the second column ('class')
dict3 = dict(zip(dest_df.iloc[:, 0].to_numpy(), dest_df.iloc[:, 1].to_numpy()))

# check dictionary created from above
dict3

# swap every 'dest' value in df for its class, using the dictionary above
df = df.replace(to_replace={"dest" : dict3})

# check
df.info()
df.head(3)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Unnamed: 0,dest,class
0,ABE,13
1,ABI,1
2,ABQ,46
3,ABR,2
4,ACK,1
...,...,...
335,WYS,2
336,XNA,39
337,XWA,1
338,YAK,2


{'ABE': 13,
 'ABI': 1,
 'ABQ': 46,
 'ABR': 2,
 'ACK': 1,
 'ACT': 2,
 'ACV': 5,
 'ACY': 8,
 'ADQ': 2,
 'AEX': 13,
 'AGS': 14,
 'ALB': 32,
 'ALO': 2,
 'ALW': 2,
 'AMA': 9,
 'ANC': 44,
 'APN': 1,
 'ASE': 12,
 'ATL': 750,
 'ATW': 15,
 'ATY': 2,
 'AUS': 112,
 'AVL': 25,
 'AVP': 18,
 'AZA': 7,
 'AZO': 9,
 'BDL': 56,
 'BET': 2,
 'BFF': 2,
 'BFL': 7,
 'BGM': 1,
 'BGR': 12,
 'BHM': 33,
 'BIL': 17,
 'BIS': 12,
 'BLI': 11,
 'BLV': 4,
 'BMI': 2,
 'BNA': 147,
 'BOI': 53,
 'BOS': 299,
 'BPT': 3,
 'BQK': 2,
 'BQN': 2,
 'BRD': 1,
 'BRO': 7,
 'BRW': 2,
 'BTM': 1,
 'BTR': 15,
 'BTV': 33,
 'BUF': 58,
 'BUR': 55,
 'BWI': 214,
 'BZN': 15,
 'CAE': 23,
 'CAK': 9,
 'CDC': 2,
 'CDV': 2,
 'CGI': 1,
 'CHA': 25,
 'CHO': 20,
 'CHS': 63,
 'CID': 24,
 'CKB': 1,
 'CLE': 106,
 'CLL': 8,
 'CLT': 519,
 'CMH': 81,
 'CMI': 3,
 'CMX': 1,
 'COD': 2,
 'COS': 15,
 'COU': 8,
 'CPR': 2,
 'CRP': 8,
 'CRW': 10,
 'CSG': 5,
 'CVG': 118,
 'CWA': 3,
 'CYS': 1,
 'DAB': 9,
 'DAL': 170,
 'DAY': 29,
 'DBQ': 2,
 'DCA': 280,
 'DEN': 573,
 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 57 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   fl_date                  16269 non-null  datetime64[ns]
 1   year                     16269 non-null  int64         
 2   month                    16269 non-null  int64         
 3   dest_airport_id          16269 non-null  int64         
 4   origin                   16269 non-null  int32         
 5   dest                     16269 non-null  int32         
 6   crs_dep_time             16269 non-null  int64         
 7   crs_arr_time             16269 non-null  int64         
 8   crs_elapsed_time         16269 non-null  float64       
 9   flights                  16269 non-null  float64       
 10  arr_delay                16269 non-null  float64       
 11  speed_passengers         16269 non-null  float64       
 12  mean_passengers          16269 n

Unnamed: 0,fl_date,year,month,dest_airport_id,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,2019-05-01,2019,5,10397,607,750,1335,1837,182.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,2019-05-01,2019,5,11618,843,319,1415,1729,134.0,1.0,...,0,0,0,0,1,0,0,0,0,0
2,2019-05-01,2019,5,13244,843,69,1325,1521,116.0,1.0,...,0,0,0,0,0,0,0,0,0,0




--- 0.4791843891143799 seconds ---


In [31]:
# distinct encoded values of 'dest', in order of first appearance
pd.unique(df['dest'])

array([750, 319,  69, 573, 387,  27,  12, 299, 519, 363, 383,  40, 107,
         8, 806,  25, 289,  45, 214,  56,  32,  24,  21, 170, 101,  53,
       141, 160, 118, 206,  62,  46, 147, 360,  33, 315,  17,  41, 254,
        63, 168,   3, 113, 133, 103,   1,  31,  18, 112, 106,  74, 516,
        26,  81, 361, 177,  34,  51, 608, 161,   7,  42, 219,  99, 235,
        92, 409, 120,  20,  48,   9,  60,  36,  22, 304, 102,  13,  15,
       110,  39,  58,   4,   2,  55,   6, 280,  23,  14,  71,  19,  16,
        10,  44,   5,  11,  28,  57,  47,  29])

#### __Remove 'fl_date', 'year', 'month', 'dest_airport_id' & 'flights' from df__

In [32]:
# start time to measure the time of the program execution
start_time = time.time()

# discard the date, year, month, destination airport id and flights columns
df = df.drop(
    labels=['fl_date', 'year', 'month', 'dest_airport_id', 'flights'],
    axis='columns',
)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   origin                   16269 non-null  int32  
 1   dest                     16269 non-null  int32  
 2   crs_dep_time             16269 non-null  int64  
 3   crs_arr_time             16269 non-null  int64  
 4   crs_elapsed_time         16269 non-null  float64
 5   flights                  16269 non-null  float64
 6   arr_delay                16269 non-null  float64
 7   speed_passengers         16269 non-null  float64
 8   mean_passengers          16269 non-null  float64
 9   sum_passengers           16269 non-null  int64  
 10  total_cost:mean_fuel     16269 non-null  float64
 11  total_cost:sum_fuel      16269 non-null  float64
 12  total_gallons:mean_fuel  16269 non-null  float64
 13  total_gallons:sum_fuel   16269 non-null  float64
 14  mkt_unique_carrier_AA 

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,arr_delay,speed_passengers,mean_passengers,sum_passengers,...,op_unique_carrier_OH,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW
0,607,750,1335,1837,182.0,1.0,85.0,0.188597,5.0,22455,...,0,0,0,0,0,0,0,0,1,0
1,843,319,1415,1729,134.0,1.0,10.0,0.294716,5.0,75760,...,0,0,0,0,1,0,0,0,0,0
2,843,69,1325,1521,116.0,1.0,27.0,0.217448,5.0,7595,...,0,0,0,0,0,0,0,0,0,0
3,629,573,1627,1732,125.0,1.0,10.0,0.158707,5.0,54465,...,0,0,0,0,0,0,0,0,0,0
4,607,387,2210,2352,162.0,1.0,26.0,0.401629,5.0,17460,...,0,0,0,0,0,0,0,0,0,0




--- 0.09528493881225586 seconds ---


--------------------------------------------

#### __Only for Training Dataset: Skip this process for Testing Dataset__

In [33]:
# start time to measure the time of the program execution
start_time = time.time()

# pull out 'arr_delay' so it can be re-attached as the last column
target = df.loc[:, 'arr_delay']
target

# take 'arr_delay' out of df, then append the same values again
# as the final column under the name 'target'
df = df.drop(columns='arr_delay').assign(target=target)

# check
df.info()
df.head()

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

0        85.0
1        10.0
2        27.0
3        10.0
4        26.0
         ... 
16264   -26.0
16265    -5.0
16266     0.0
16267   -12.0
16268    -7.0
Name: arr_delay, Length: 16269, dtype: float64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16269 entries, 0 to 16268
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   origin                   16269 non-null  int32  
 1   dest                     16269 non-null  int32  
 2   crs_dep_time             16269 non-null  int64  
 3   crs_arr_time             16269 non-null  int64  
 4   crs_elapsed_time         16269 non-null  float64
 5   flights                  16269 non-null  float64
 6   speed_passengers         16269 non-null  float64
 7   mean_passengers          16269 non-null  float64
 8   sum_passengers           16269 non-null  int64  
 9   total_cost:mean_fuel     16269 non-null  float64
 10  total_cost:sum_fuel      16269 non-null  float64
 11  total_gallons:mean_fuel  16269 non-null  float64
 12  total_gallons:sum_fuel   16269 non-null  float64
 13  mkt_unique_carrier_AA    16269 non-null  uint8  
 14  mkt_unique_carrier_AS 

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,speed_passengers,mean_passengers,sum_passengers,total_cost:mean_fuel,...,op_unique_carrier_OO,op_unique_carrier_PT,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW,target
0,607,750,1335,1837,182.0,1.0,0.188597,5.0,22455,8618120.0,...,0,0,0,0,0,0,0,1,0,85.0
1,843,319,1415,1729,134.0,1.0,0.294716,5.0,75760,532297103.2,...,0,0,0,1,0,0,0,0,0,10.0
2,843,69,1325,1521,116.0,1.0,0.217448,5.0,7595,0.0,...,0,0,0,0,0,0,0,0,0,27.0
3,629,573,1627,1732,125.0,1.0,0.158707,5.0,54465,527459997.0,...,0,0,0,0,0,0,0,0,0,10.0
4,607,387,2210,2352,162.0,1.0,0.401629,5.0,17460,36462558.6,...,0,0,0,0,0,0,0,0,0,26.0




--- 0.05893826484680176 seconds ---


-------------------------------------------------------

In [35]:
# write the prepared frame to disk as csv, leaving out the row index
df.to_csv(path_or_buf='training_for_regmodel_16k.csv', index=False)

In [36]:
# raw difference between scheduled arrival and departure for the row labelled 0
# NOTE(review): the values look like HHMM-encoded clock times, in which case
# this difference is not a duration in minutes — confirm before relying on it
df.loc[0, 'crs_arr_time'] - df.loc[0, 'crs_dep_time']

502

In [38]:
# target value of the row labelled 0
df.loc[0, 'target']

85.0