In [68]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# IPython helpers used for notebook display.
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

#### Read and pre-process data

In [69]:
# Load the departure records into main_data and preview the first rows.
# The path uses a forward slash: the previous backslash form contained the
# invalid escape sequence "\m" (a SyntaxWarning on recent Python versions)
# and only resolved to the flight_data folder on Windows.
main_data = pd.read_csv('flight_data/mco_syr_sw_dep_22_23.csv')
main_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,WN,1/1/2022,35,N8524Z,STL,6:30,6:31,155,145,1,6:39,8,0,0,0,0,0
1,WN,1/1/2022,53,N276WN,PVD,22:15,22:44,155,161,29,22:54,10,5,0,6,0,24
2,WN,1/1/2022,59,N8695D,ROC,11:55,12:18,160,142,23,12:28,10,0,0,0,0,0
3,WN,1/1/2022,66,N7715E,IND,19:30,20:07,145,145,37,20:23,16,6,0,0,0,31
4,WN,1/1/2022,488,N409WN,MKE,10:30,11:45,175,157,75,11:56,11,26,0,0,0,31


In [70]:
# Inspect the column dtypes of the freshly loaded departure frame.
main_data.dtypes

Carrier Code                                object
Date (MM/DD/YYYY)                           object
Flight Number                                int64
Tail Number                                 object
Destination Airport                         object
Scheduled departure time                    object
Actual departure time                       object
Scheduled elapsed time (Minutes)             int64
Actual elapsed time (Minutes)                int64
Departure delay (Minutes)                    int64
Wheels-off time                             object
Taxi-Out time (Minutes)                      int64
Delay Carrier (Minutes)                      int64
Delay Weather (Minutes)                      int64
Delay National Aviation System (Minutes)     int64
Delay Security (Minutes)                     int64
Delay Late Aircraft Arrival (Minutes)        int64
dtype: object

In [71]:
# Row count of the full departure frame, before any filtering.
len(main_data)

78198

In [72]:
# Replace the raw CSV headers with underscore-separated names. The five
# delay-cause columns get a "dep_" prefix so they can be told apart from the
# identically named columns of the arrival file.
main_data = main_data.rename(
    columns={
        'Carrier Code': 'Carrier_Code',
        'Date (MM/DD/YYYY)': 'Date',
        'Flight Number': 'Flight_Number',
        'Tail Number': 'Tail_Number',
        'Destination Airport': 'Destination_Airport',
        'Delay Carrier (Minutes)': 'dep_Delay_Carrier',
        'Delay Weather (Minutes)': 'dep_Delay_Weather',
        'Delay National Aviation System (Minutes)': 'dep_Delay_National_Aviation_System',
        'Delay Security (Minutes)': 'dep_Delay_Security',
        'Delay Late Aircraft Arrival (Minutes)': 'dep_Delay_Late_Aircraft_Arrival',
    }
)

In [73]:
# Re-check the dtypes to confirm the new column names.
main_data.dtypes

Carrier_Code                          object
Date                                  object
Flight_Number                          int64
Tail_Number                           object
Destination_Airport                   object
Scheduled departure time              object
Actual departure time                 object
Scheduled elapsed time (Minutes)       int64
Actual elapsed time (Minutes)          int64
Departure delay (Minutes)              int64
Wheels-off time                       object
Taxi-Out time (Minutes)                int64
dep_Delay_Carrier                      int64
dep_Delay_Weather                      int64
dep_Delay_National_Aviation_System     int64
dep_Delay_Security                     int64
dep_Delay_Late_Aircraft_Arrival        int64
dtype: object

In [74]:
# Keep only the departures whose destination airport is SYR.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments made on main_data in later cells do not trigger pandas'
# SettingWithCopyWarning.
main_data = main_data[main_data['Destination_Airport'] == 'SYR'].copy()
len(main_data)

179

In [75]:
# Scheduled departure hour: the text before the ":" converted to an integer.
main_data['dep_hour'] = (
    main_data['Scheduled departure time']
    .str.split(":")
    .str[0]
    .astype('int64')
)

# Parse the MM/DD/YYYY strings into real timestamps, then derive the weekday
# number (Monday = 0) and the calendar year from them.
main_data['Date'] = pd.to_datetime(main_data['Date'], format="%m/%d/%Y")
main_data['dep_day'] = main_data['Date'].dt.day_of_week.astype('int64')
main_data['dep_year'] = main_data['Date'].dt.year.astype('int64')

main_data.dtypes

Carrier_Code                                  object
Date                                  datetime64[ns]
Flight_Number                                  int64
Tail_Number                                   object
Destination_Airport                           object
Scheduled departure time                      object
Actual departure time                         object
Scheduled elapsed time (Minutes)               int64
Actual elapsed time (Minutes)                  int64
Departure delay (Minutes)                      int64
Wheels-off time                               object
Taxi-Out time (Minutes)                        int64
dep_Delay_Carrier                              int64
dep_Delay_Weather                              int64
dep_Delay_National_Aviation_System             int64
dep_Delay_Security                             int64
dep_Delay_Late_Aircraft_Arrival                int64
dep_hour                                       int64
dep_day                                       

In [76]:
# Disabled filter: if uncommented, it would keep only the 2022 departures.
# main_data = main_data[main_data['dep_year'] == 2022]
# main_data.head()

In [77]:
# Number of flights per scheduled departure hour.
main_data['dep_hour'].value_counts()

dep_hour
11    77
10    46
12    38
13    14
9      3
8      1
Name: count, dtype: int64

In [78]:
# Number of flights per calendar year.
main_data['dep_year'].value_counts()

dep_year
2023    128
2022     51
Name: count, dtype: int64

In [79]:
# Add a constant 'dep_order' column: every row receives the label 'latter'.
# NOTE(review): the purpose of this label is not visible in this part of the
# notebook — confirm where it is consumed.
main_data['dep_order'] = 'latter'

In [80]:
# Preview the departure frame with the derived columns added.
main_data.head()

Unnamed: 0,Carrier_Code,Date,Flight_Number,Tail_Number,Destination_Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),...,Taxi-Out time (Minutes),dep_Delay_Carrier,dep_Delay_Weather,dep_Delay_National_Aviation_System,dep_Delay_Security,dep_Delay_Late_Aircraft_Arrival,dep_hour,dep_day,dep_year,dep_order
32,WN,2022-01-01,1596,N730SW,SYR,10:30,10:32,170,142,2,...,6,0,0,0,0,0,10,5,2022,latter
1411,WN,2023-01-07,2695,N8318F,SYR,11:05,11:05,160,161,0,...,12,0,0,0,0,0,11,5,2023,latter
2782,WN,2023-01-14,2695,N8555Z,SYR,11:05,11:02,160,180,-3,...,27,0,0,17,0,0,11,5,2023,latter
4159,WN,2023-01-21,2695,N500WR,SYR,11:05,11:01,160,156,-4,...,14,0,0,0,0,0,11,5,2023,latter
4297,WN,2022-01-22,3479,N8690A,SYR,12:00,12:08,165,147,8,...,17,0,0,0,0,0,12,5,2022,latter


In [81]:
# Load the arrival records into arr_data and preview the first rows.
# The path uses a forward slash: the previous backslash form contained the
# invalid escape sequence "\m" (a SyntaxWarning on recent Python versions)
# and only resolved to the flight_data folder on Windows.
arr_data = pd.read_csv('flight_data/mco_syr_sw_arr_22_23.csv')
arr_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),Wheels-on Time,Taxi-In time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,WN,1/1/2022,777,N284WN,BWI,23:20,3:04,75,50,224,3:01,3,39,0,0,0,185
1,WN,1/1/2022,1596,N730SW,MCO,13:20,12:54,170,142,-26,12:51,3,0,0,0,0,0
2,WN,1/1/2022,2494,N709SW,BWI,11:15,11:10,75,56,-5,11:08,2,0,0,0,0,0
3,WN,1/1/2022,2496,N709SW,BWI,16:10,16:12,70,59,2,16:10,2,0,0,0,0,0
4,WN,1/1/2023,133,N8628A,BWI,23:00,22:59,70,65,-1,22:55,4,0,0,0,0,0


In [82]:
# Replace the raw CSV headers with underscore-separated names. The five
# delay-cause columns get an "arr_" prefix so they can be told apart from the
# identically named columns of the departure file.
arr_data = arr_data.rename(
    columns={
        'Carrier Code': 'Carrier_Code',
        'Date (MM/DD/YYYY)': 'Date',
        'Flight Number': 'Flight_Number',
        'Tail Number': 'Tail_Number',
        'Origin Airport': 'Origin_Airport',
        'Delay Carrier (Minutes)': 'arr_Delay_Carrier',
        'Delay Weather (Minutes)': 'arr_Delay_Weather',
        'Delay National Aviation System (Minutes)': 'arr_Delay_National_Aviation_System',
        'Delay Security (Minutes)': 'arr_Delay_Security',
        'Delay Late Aircraft Arrival (Minutes)': 'arr_Delay_Late_Aircraft_Arrival',
    }
)

In [83]:
# Check the dtypes of the arrival frame after renaming.
arr_data.dtypes

Carrier_Code                          object
Date                                  object
Flight_Number                          int64
Tail_Number                           object
Origin_Airport                        object
Scheduled Arrival Time                object
Actual Arrival Time                   object
Scheduled Elapsed Time (Minutes)       int64
Actual Elapsed Time (Minutes)          int64
Arrival Delay (Minutes)                int64
Wheels-on Time                        object
Taxi-In time (Minutes)                 int64
arr_Delay_Carrier                      int64
arr_Delay_Weather                      int64
arr_Delay_National_Aviation_System     int64
arr_Delay_Security                     int64
arr_Delay_Late_Aircraft_Arrival        int64
dtype: object

In [84]:
# Row count of the full arrival frame, before any filtering.
len(arr_data)

2352

In [85]:
# Keep only the arrivals whose origin airport is MCO.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments and the in-place drop applied to arr_data in the next cell do
# not trigger pandas' SettingWithCopyWarning.
arr_data = arr_data[arr_data['Origin_Airport'] == 'MCO'].copy()
len(arr_data)

179

In [86]:
# Scheduled arrival hour: the text before the ":" converted to an integer.
arr_data['arr_hour'] = (
    arr_data['Scheduled Arrival Time']
    .str.split(":")
    .str[0]
    .astype('int64')
)

# Parse the MM/DD/YYYY strings into real timestamps, then derive the weekday
# number (Monday = 0) and the calendar year from them.
arr_data['Date'] = pd.to_datetime(arr_data['Date'], format="%m/%d/%Y")
arr_data['arr_day'] = arr_data['Date'].dt.day_of_week.astype('int64')
arr_data['arr_year'] = arr_data['Date'].dt.year.astype('int64')

# Remove the two elapsed-time columns from the arrival frame.
arr_data.drop(
    columns=[
        'Scheduled Elapsed Time (Minutes)',
        'Actual Elapsed Time (Minutes)',
    ],
    inplace=True,
)

arr_data.dtypes

Carrier_Code                                  object
Date                                  datetime64[ns]
Flight_Number                                  int64
Tail_Number                                   object
Origin_Airport                                object
Scheduled Arrival Time                        object
Actual Arrival Time                           object
Arrival Delay (Minutes)                        int64
Wheels-on Time                                object
Taxi-In time (Minutes)                         int64
arr_Delay_Carrier                              int64
arr_Delay_Weather                              int64
arr_Delay_National_Aviation_System             int64
arr_Delay_Security                             int64
arr_Delay_Late_Aircraft_Arrival                int64
arr_hour                                       int64
arr_day                                        int64
arr_year                                       int64
dtype: object

In [87]:
# Number of flights per scheduled arrival hour.
arr_data['arr_hour'].value_counts()

arr_hour
13    77
14    62
15    22
16    14
12     3
10     1
Name: count, dtype: int64

In [88]:
# Number of flights per calendar year in the arrival frame.
arr_data['arr_year'].value_counts()

arr_year
2023    128
2022     51
Name: count, dtype: int64

In [89]:
# Pair each departure row with its arrival row. Rows are matched on carrier,
# date, flight number and tail number; the inner join keeps only flights
# present in both frames.
# NOTE(review): nothing here checks that the key combination is unique in
# each frame — consider validate='one_to_one' once that is confirmed.
merged_df = pd.merge(
    left=main_data,
    right=arr_data,
    how='inner',
    on=[
        'Carrier_Code',
        'Date',
        'Flight_Number',
        'Tail_Number',
    ],
)
merged_df.head()

Unnamed: 0,Carrier_Code,Date,Flight_Number,Tail_Number,Destination_Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),...,Wheels-on Time,Taxi-In time (Minutes),arr_Delay_Carrier,arr_Delay_Weather,arr_Delay_National_Aviation_System,arr_Delay_Security,arr_Delay_Late_Aircraft_Arrival,arr_hour,arr_day,arr_year
0,WN,2022-01-01,1596,N730SW,SYR,10:30,10:32,170,142,2,...,12:51,3,0,0,0,0,0,13,5,2022
1,WN,2023-01-07,2695,N8318F,SYR,11:05,11:05,160,161,0,...,13:43,3,0,0,0,0,0,13,5,2023
2,WN,2023-01-14,2695,N8555Z,SYR,11:05,11:02,160,180,-3,...,13:59,3,0,0,17,0,0,13,5,2023
3,WN,2023-01-21,2695,N500WR,SYR,11:05,11:01,160,156,-4,...,13:35,2,0,0,0,0,0,13,5,2023
4,WN,2022-01-22,3479,N8690A,SYR,12:00,12:08,165,147,8,...,14:32,3,0,0,0,0,0,14,5,2022


In [90]:
# Inspect the dtypes of the combined departure/arrival frame.
merged_df.dtypes


Carrier_Code                                  object
Date                                  datetime64[ns]
Flight_Number                                  int64
Tail_Number                                   object
Destination_Airport                           object
Scheduled departure time                      object
Actual departure time                         object
Scheduled elapsed time (Minutes)               int64
Actual elapsed time (Minutes)                  int64
Departure delay (Minutes)                      int64
Wheels-off time                               object
Taxi-Out time (Minutes)                        int64
dep_Delay_Carrier                              int64
dep_Delay_Weather                              int64
dep_Delay_National_Aviation_System             int64
dep_Delay_Security                             int64
dep_Delay_Late_Aircraft_Arrival                int64
dep_hour                                       int64
dep_day                                       

In [91]:
# Row count after the inner merge.
len(merged_df)

179

In [92]:
# Persist the merged departure/arrival table as CSV.
# Raw string: the path value is byte-for-byte what it was before, but the
# backslash is no longer parsed as the invalid escape sequence "\m"
# (a SyntaxWarning on recent Python versions). The value is deliberately left
# unchanged because a later cell reads the file back from this exact path.
# NOTE(review): a backslash separator only resolves to the flight_data folder
# on Windows.
# NOTE(review): the index is written as well (no index=False), so it comes
# back as an extra 'Unnamed: 0' column when the file is re-read.
merged_df.to_csv(r'flight_data\mco_syr_sw_22_23.csv')

In [93]:
# Reload the merged table from disk and show its row count and first rows.
# Raw string: the path value is byte-for-byte what it was before, but the
# backslash is no longer parsed as the invalid escape sequence "\m"
# (a SyntaxWarning on recent Python versions). The value is deliberately left
# unchanged so it keeps matching the path the file was written to.
# NOTE(review): a backslash separator only resolves to the flight_data folder
# on Windows.
# NOTE(review): no parse_dates is passed, so 'Date' is presumably read back as
# text rather than datetime — confirm before using .dt accessors on it.
new_df = pd.read_csv(r'flight_data\mco_syr_sw_22_23.csv')
len(new_df)
new_df.head()

179

Unnamed: 0.1,Unnamed: 0,Carrier_Code,Date,Flight_Number,Tail_Number,Destination_Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),...,Wheels-on Time,Taxi-In time (Minutes),arr_Delay_Carrier,arr_Delay_Weather,arr_Delay_National_Aviation_System,arr_Delay_Security,arr_Delay_Late_Aircraft_Arrival,arr_hour,arr_day,arr_year
0,0,WN,2022-01-01,1596,N730SW,SYR,10:30,10:32,170,142,...,12:51,3,0,0,0,0,0,13,5,2022
1,1,WN,2023-01-07,2695,N8318F,SYR,11:05,11:05,160,161,...,13:43,3,0,0,0,0,0,13,5,2023
2,2,WN,2023-01-14,2695,N8555Z,SYR,11:05,11:02,160,180,...,13:59,3,0,0,17,0,0,13,5,2023
3,3,WN,2023-01-21,2695,N500WR,SYR,11:05,11:01,160,156,...,13:35,2,0,0,0,0,0,13,5,2023
4,4,WN,2022-01-22,3479,N8690A,SYR,12:00,12:08,165,147,...,14:32,3,0,0,0,0,0,14,5,2022
