In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
# fetch data 
airline_data = pd.read_csv('airline_dataset_2022_23.csv')
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des
0,9E,1/1/23,5190,N325PQ,JFK,22:41:00,22:13,91,68,-28,...,1672631100,0.0,11,804,c04n,Overcast clouds,285,3.4,1.5,SYR
1,9E,1/2/22,5531,N678CA,JFK,14:12:00,17:12,77,84,180,...,1641150900,0.5,12,c04d,Overcast clouds,804,305,10.9,4.97,SYR
2,9E,1/2/23,5190,N195PQ,JFK,22:41:00,22:46,91,77,5,...,1672717500,0.0,16,804,c04n,Overcast clouds,75,2.4,0.5,SYR
3,9E,1/3/22,5531,N602LR,JFK,14:12:00,14:24,77,81,12,...,1641237300,1.1,16,c03d,Broken clouds,803,320,8.4,2.6,SYR
4,9E,1/3/23,5190,N303PQ,JFK,22:41:00,22:23,91,78,-18,...,1672803900,0.0,4,804,c04n,Overcast clouds,70,4.0,2.1,SYR


In [3]:
airline_data.shape

(5623, 90)

In [4]:
pd.set_option('display.max_columns', None)
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [5]:
#Unique carrier codes in the dataset
OP_CARRIER_unique_values = airline_data['Carrier Code'].unique()
print(OP_CARRIER_unique_values)

['9E' 'MQ' 'B6' 'WN' 'UA']


In [6]:
airline_data.isna().sum()

Carrier Code          0
Date (MM/DD/YYYY)     0
Flight Number         0
Tail Number          11
Origin Airport        0
                     ..
weather.code_des      0
wind_dir_des          0
wind_gust_spd_des     0
wind_spd_des          0
Location_des          0
Length: 90, dtype: int64

In [7]:
airline_data.dropna(inplace=True)

In [8]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
weather.code_des     0
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Length: 90, dtype: int64

In [9]:
airline_data['Arrival Delay (Minutes)']=airline_data['Arrival Delay (Minutes)'].astype(np.int64)

In [10]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5612 entries, 0 to 5622
Data columns (total 90 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carrier Code                                5612 non-null   object 
 1   Date (MM/DD/YYYY)                           5612 non-null   object 
 2   Flight Number                               5612 non-null   int64  
 3   Tail Number                                 5612 non-null   object 
 4   Origin Airport                              5612 non-null   object 
 5   Scheduled Arrival Time                      5612 non-null   object 
 6   Actual Arrival Time                         5612 non-null   object 
 7   Scheduled Elapsed Time (Minutes)            5612 non-null   int64  
 8   Actual Elapsed Time (Minutes)               5612 non-null   int64  
 9   Arrival Delay (Minutes)                     5612 non-null   int64  
 10  Wheels-on Time   

In [11]:
#function to set status of flight based on Arrival Delay
def set_dependent_variable(time):
    if time<-5:
        return 'Early'
    elif (time>=-5 and time<=5):
        return 'On-time'
    else:
        return 'Late'

In [12]:
airline_data['Status']=airline_data['Arrival Delay (Minutes)'].apply(set_dependent_variable)

In [13]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [14]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Status               0
Length: 91, dtype: int64

In [15]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [16]:
airline_data.drop(columns=['Tail Number','Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x', 'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y', 'app_temp', 'azimuth', 'clouds',
       'dewpt', 'dhi', 'dni', 'elev_angle', 'ghi', 'pod', 'precip_rate',
       'pres', 'revision_status', 'rh', 'slp', 'snow_rate', 'solar_rad',
       'temp', 'timestamp_local', 'timestamp_utc', 'ts', 'uv', 'vis',
       'weather.code', 'weather.icon', 'weather.description', 'wind_dir',
       'wind_gust_spd', 'wind_spd', 'Location'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,Arrival_time,departure_time,app_temp_des,azimuth_des,clouds_des,dewpt_des,dhi_des,dni_des,elev_angle_des,ghi_des,pod_des,precip_rate_des,pres_des,revision_status_des,rh_des,slp_des,snow_rate_des,solar_rad_des,temp_des,timestamp_local_des,timestamp_utc_des,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des,Status
0,9E,1/1/23,5190,JFK,22:41:00,SYR,21:10:00,1/1/23 22:45,1/1/23 21:15,2.7,314.0,100,2.1,0,0,-63.3,0,n,0.0,1000,final,88,1015,0.0,0,3.9,1/1/23 22:45,1/2/23 3:45,1672631100,0.0,11,804,c04n,Overcast clouds,285,3.4,1.5,SYR,Early
1,9E,1/2/22,5531,JFK,14:12:00,SYR,12:55:00,1/2/22 14:15,1/2/22 13:00,-11.1,210.4,100,-6.2,75,657,17.6,268,d,0.0,996,final,91,1011,0.0,102,-5.0,1/2/22 14:15,1/2/22 19:15,1641150900,0.5,12,c04d,Overcast clouds,804,305,10.9,4.97,SYR,Late
2,9E,1/2/23,5190,JFK,22:41:00,SYR,21:10:00,1/2/23 22:45,1/2/23 21:15,5.1,313.9,100,0.6,0,0,-63.1,0,n,0.0,1005,final,77,1020,0.0,0,4.2,1/2/23 22:45,1/3/23 3:45,1672717500,0.0,16,804,c04n,Overcast clouds,75,2.4,0.5,SYR,On-time
3,9E,1/3/22,5531,JFK,14:12:00,SYR,12:55:00,1/3/22 14:15,1/3/22 13:00,-11.5,210.3,43,-15.2,75,659,17.8,271,d,0.0,1009,final,52,1025,0.0,257,-7.1,1/3/22 14:15,1/3/22 19:15,1641237300,1.1,16,c03d,Broken clouds,803,320,8.4,2.6,SYR,Late
4,9E,1/3/23,5190,JFK,22:41:00,SYR,21:10:00,1/3/23 22:45,1/3/23 21:15,3.3,313.8,100,3.8,0,0,-63.0,0,n,0.0,991,final,92,1006,0.0,0,5.0,1/3/23 22:45,1/4/23 3:45,1672803900,0.0,4,804,c04n,Overcast clouds,70,4.0,2.1,SYR,Early


In [17]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Origin Airport',
       'Scheduled Arrival Time', 'Destination Airport',
       'Scheduled departure time', 'Arrival_time', 'departure_time',
       'app_temp_des', 'azimuth_des', 'clouds_des', 'dewpt_des', 'dhi_des',
       'dni_des', 'elev_angle_des', 'ghi_des', 'pod_des', 'precip_rate_des',
       'pres_des', 'revision_status_des', 'rh_des', 'slp_des', 'snow_rate_des',
       'solar_rad_des', 'temp_des', 'timestamp_local_des', 'timestamp_utc_des',
       'ts_des', 'uv_des', 'vis_des', 'weather.icon_des',
       'weather.description_des', 'weather.code_des', 'wind_dir_des',
       'wind_gust_spd_des', 'wind_spd_des', 'Location_des', 'Status'],
      dtype='object')

In [18]:
len(airline_data.columns)

39

In [19]:
airline_data.drop(columns=['Arrival_time', 'departure_time',
                            'timestamp_local_des', 'timestamp_utc_des', 'Location_des',
                            'elev_angle_des', 'ts_des', 'app_temp_des',
                          'azimuth_des', 'dhi_des', 'dni_des', 'elev_angle_des', 
                           'ghi_des', 'pod_des', 'revision_status_des', 'weather.code_des',
                           'weather.icon_des', 'weather.description_des', 'solar_rad_des',
                           'vis_des'], 
                  inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status
0,9E,1/1/23,5190,JFK,22:41:00,SYR,21:10:00,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,Early
1,9E,1/2/22,5531,JFK,14:12:00,SYR,12:55:00,100,-6.2,0.0,996,91,1011,0.0,-5.0,0.5,305,10.9,4.97,Late
2,9E,1/2/23,5190,JFK,22:41:00,SYR,21:10:00,100,0.6,0.0,1005,77,1020,0.0,4.2,0.0,75,2.4,0.5,On-time
3,9E,1/3/22,5531,JFK,14:12:00,SYR,12:55:00,43,-15.2,0.0,1009,52,1025,0.0,-7.1,1.1,320,8.4,2.6,Late
4,9E,1/3/23,5190,JFK,22:41:00,SYR,21:10:00,100,3.8,0.0,991,92,1006,0.0,5.0,0.0,70,4.0,2.1,Early


In [20]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [21]:
numeric_columns = airline_data.select_dtypes(include=['float64', 'int64'])

correl = numeric_columns.corr()

trace = go.Heatmap(z=correl.values,
                  x=correl.index.values,
                  y=correl.columns.values)
data=[trace]
plotly.offline.iplot(data, filename='Airline data heatmap')

In [22]:
len(airline_data.columns)

20

In [23]:
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status
0,9E,1/1/23,5190,JFK,22:41:00,SYR,21:10:00,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,Early
1,9E,1/2/22,5531,JFK,14:12:00,SYR,12:55:00,100,-6.2,0.0,996,91,1011,0.0,-5.0,0.5,305,10.9,4.97,Late
2,9E,1/2/23,5190,JFK,22:41:00,SYR,21:10:00,100,0.6,0.0,1005,77,1020,0.0,4.2,0.0,75,2.4,0.5,On-time
3,9E,1/3/22,5531,JFK,14:12:00,SYR,12:55:00,43,-15.2,0.0,1009,52,1025,0.0,-7.1,1.1,320,8.4,2.6,Late
4,9E,1/3/23,5190,JFK,22:41:00,SYR,21:10:00,100,3.8,0.0,991,92,1006,0.0,5.0,0.0,70,4.0,2.1,Early


In [24]:



# Sort the dataset based on "Carrier Code" and "Date (MM/DD/YYYY)"
airline_data.sort_values(by=["Carrier Code", "Date (MM/DD/YYYY)", "Scheduled Arrival Time"], inplace=True)

# Add a new column to store the status of the previous flight
airline_data['Previous Flight Status'] = airline_data.groupby(['Carrier Code', 'Date (MM/DD/YYYY)'])['Status'].shift(1)

# Handle the first flight within each group
first_flight_mask = airline_data['Previous Flight Status'].isna()
airline_data.loc[first_flight_mask, 'Previous Flight Status'] = 'Unknown'  # Replace "N/A" with "Unknown"

# Print the updated dataset
airline_data.head()


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status
0,9E,1/1/23,5190,JFK,22:41:00,SYR,21:10:00,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,Early,Unknown
21,9E,1/10/22,5446,JFK,13:45:00,SYR,12:30:00,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,On-time,Unknown
24,9E,1/10/23,5350,JFK,11:15:00,SYR,9:50:00,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,Early,Unknown
23,9E,1/10/23,5224,JFK,18:01:00,SYR,16:35:00,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,Early,Early
22,9E,1/10/23,5009,JFK,23:54:00,SYR,22:30:00,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,On-time,Early


In [25]:
airline_data.dtypes

Carrier Code                 object
Date (MM/DD/YYYY)            object
Flight Number                 int64
Origin Airport               object
Scheduled Arrival Time       object
Destination Airport          object
Scheduled departure time     object
clouds_des                    int64
dewpt_des                   float64
precip_rate_des             float64
pres_des                      int64
rh_des                        int64
slp_des                       int64
snow_rate_des               float64
temp_des                    float64
uv_des                      float64
wind_dir_des                  int64
wind_gust_spd_des           float64
wind_spd_des                float64
Status                       object
Previous Flight Status       object
dtype: object

In [26]:
#spliting the Arrival and Departure times into Hours and minutes for easy understanding to model
airline_data['Scheduled Arrival Hour']= pd.to_datetime(airline_data['Scheduled Arrival Time']).dt.hour
airline_data['Scheduled Arrival Minutes']= pd.to_datetime(airline_data['Scheduled Arrival Time']).dt.minute

airline_data['Scheduled departure Hour']= pd.to_datetime(airline_data['Scheduled departure time']).dt.hour
airline_data['Scheduled departure Minutes']= pd.to_datetime(airline_data['Scheduled departure time']).dt.minute

airline_data


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,9E,1/1/23,5190,JFK,22:41:00,SYR,21:10:00,100,2.10,0.00,1000,88,1015,0.00,3.90,0.00,285,3.40,1.50,Early,Unknown,22,41,21,10
21,9E,1/10/22,5446,JFK,13:45:00,SYR,12:30:00,78,-13.70,0.00,1003,52,1018,0.00,-5.40,0.90,240,14.80,11.30,On-time,Unknown,13,45,12,30
24,9E,1/10/23,5350,JFK,11:15:00,SYR,9:50:00,100,-5.00,0.00,1002,72,1018,0.00,-0.60,0.60,340,7.00,3.10,Early,Unknown,11,15,9,50
23,9E,1/10/23,5224,JFK,18:01:00,SYR,16:35:00,100,-7.90,0.00,1004,68,1019,0.00,-2.80,0.00,310,3.60,3.60,Early,Early,18,1,16,35
22,9E,1/10/23,5009,JFK,23:54:00,SYR,22:30:00,25,-10.00,0.00,1006,77,1021,0.00,-6.70,0.00,300,2.10,2.10,On-time,Early,23,54,22,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,WN,9/23/23,2885,MCO,15:05:00,SYR,12:25:00,87,8.10,0.00,1008,57,1022,0.00,16.70,1.80,90,7.20,3.10,Early,Unknown,15,5,12,25
4658,WN,9/24/22,3057,MCO,14:55:00,SYR,12:15:00,87,6.70,0.00,1000,48,1014,0.00,17.80,1.80,270,6.80,3.60,Early,Unknown,14,55,12,15
4652,WN,9/3/22,4667,MCO,13:55:00,SYR,11:10:00,87,17.10,0.00,1006,54,1020,0.00,27.20,2.80,135,6.80,2.80,Early,Unknown,13,55,11,10
4659,WN,9/30/23,2885,MCO,15:05:00,SYR,12:25:00,50,13.20,0.00,1007,48,1022,0.00,25.00,2.70,350,6.40,3.60,On-time,Unknown,15,5,12,25


In [27]:
airline_data.drop(columns=['Scheduled Arrival Time'],inplace=True)
airline_data.drop(columns=['Scheduled departure time'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,9E,1/1/23,5190,JFK,SYR,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,Early,Unknown,22,41,21,10
21,9E,1/10/22,5446,JFK,SYR,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,On-time,Unknown,13,45,12,30
24,9E,1/10/23,5350,JFK,SYR,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,Early,Unknown,11,15,9,50
23,9E,1/10/23,5224,JFK,SYR,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,Early,Early,18,1,16,35
22,9E,1/10/23,5009,JFK,SYR,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,On-time,Early,23,54,22,30


In [28]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5612 entries, 0 to 4653
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Carrier Code                 5612 non-null   object 
 1   Date (MM/DD/YYYY)            5612 non-null   object 
 2   Flight Number                5612 non-null   int64  
 3   Origin Airport               5612 non-null   object 
 4   Destination Airport          5612 non-null   object 
 5   clouds_des                   5612 non-null   int64  
 6   dewpt_des                    5612 non-null   float64
 7   precip_rate_des              5612 non-null   float64
 8   pres_des                     5612 non-null   int64  
 9   rh_des                       5612 non-null   int64  
 10  slp_des                      5612 non-null   int64  
 11  snow_rate_des                5612 non-null   float64
 12  temp_des                     5612 non-null   float64
 13  uv_des                 

In [29]:
airline_data['Date'] = airline_data['Date (MM/DD/YYYY)'].astype('datetime64[ns]')


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [30]:
airline_data.drop(columns=['Date (MM/DD/YYYY)'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
0,9E,5190,JFK,SYR,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,Early,Unknown,22,41,21,10,2023-01-01
21,9E,5446,JFK,SYR,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,On-time,Unknown,13,45,12,30,2022-01-10
24,9E,5350,JFK,SYR,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,Early,Unknown,11,15,9,50,2023-01-10
23,9E,5224,JFK,SYR,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,Early,Early,18,1,16,35,2023-01-10
22,9E,5009,JFK,SYR,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,On-time,Early,23,54,22,30,2023-01-10


In [31]:
airline_data['Status'].replace("Early",0,inplace=True)
airline_data['Status'].replace("Late",1,inplace=True)
airline_data['Status'].replace("On-time",2,inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
0,9E,5190,JFK,SYR,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,0,Unknown,22,41,21,10,2023-01-01
21,9E,5446,JFK,SYR,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,2,Unknown,13,45,12,30,2022-01-10
24,9E,5350,JFK,SYR,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,0,Unknown,11,15,9,50,2023-01-10
23,9E,5224,JFK,SYR,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,0,Early,18,1,16,35,2023-01-10
22,9E,5009,JFK,SYR,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,2,Early,23,54,22,30,2023-01-10


In [32]:
set(airline_data['Origin Airport'])

{'JFK', 'MCO', 'ORD'}

In [33]:
len(airline_data.columns)

23

In [34]:
# Filter the dataset to create two subsets
first_flights = airline_data[airline_data['Previous Flight Status'] == 'Unknown']
subsequent_flights = airline_data[airline_data['Previous Flight Status'] != 'Unknown']
first_flights.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
0,9E,5190,JFK,SYR,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,0,Unknown,22,41,21,10,2023-01-01
21,9E,5446,JFK,SYR,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,2,Unknown,13,45,12,30,2022-01-10
24,9E,5350,JFK,SYR,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,0,Unknown,11,15,9,50,2023-01-10
25,9E,5446,JFK,SYR,50,-18.3,0.0,1018,66,1034,0.0,-13.3,1.2,260,5.7,3.1,0,Unknown,13,45,12,30,2022-01-11
28,9E,5350,JFK,SYR,90,-6.9,0.0,1007,80,1023,0.0,-4.0,0.8,80,6.2,3.84,1,Unknown,11,15,9,50,2023-01-11


In [35]:
# first_flights = airline_data[airline_data['Previous Flight Status'] == 'Unknown']
# subsequent_flights = airline_data[airline_data['Previous Flight Status'] != 'Unknown']
# first_flights.head()

In [36]:
subsequent_flights.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
23,9E,5224,JFK,SYR,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,0,Early,18,1,16,35,2023-01-10
22,9E,5009,JFK,SYR,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,2,Early,23,54,22,30,2023-01-10
27,9E,5224,JFK,SYR,100,-6.2,0.0,1004,63,1019,0.0,0.0,0.0,80,7.6,6.2,1,Late,18,1,16,35,2023-01-11
26,9E,5009,JFK,SYR,25,-5.0,0.0,1003,72,1018,0.0,-0.6,0.0,70,10.8,3.6,1,Late,23,54,22,30,2023-01-11
31,9E,5224,JFK,SYR,100,2.7,4.0,992,92,1007,0.0,3.9,0.0,70,9.6,4.09,2,Early,18,1,16,35,2023-01-12


In [37]:
first_flights.drop(columns=['Previous Flight Status'],inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
first_flights = pd.get_dummies(first_flights, drop_first = True)
first_flights.head()

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,5190,100,2.1,0.0,1000,88,1015,0.0,3.9,0.0,285,3.4,1.5,0,22,41,21,10,2023-01-01,False,False,False,False,False,False
21,5446,78,-13.7,0.0,1003,52,1018,0.0,-5.4,0.9,240,14.8,11.3,2,13,45,12,30,2022-01-10,False,False,False,False,False,False
24,5350,100,-5.0,0.0,1002,72,1018,0.0,-0.6,0.6,340,7.0,3.1,0,11,15,9,50,2023-01-10,False,False,False,False,False,False
25,5446,50,-18.3,0.0,1018,66,1034,0.0,-13.3,1.2,260,5.7,3.1,0,13,45,12,30,2022-01-11,False,False,False,False,False,False
28,5350,90,-6.9,0.0,1007,80,1023,0.0,-4.0,0.8,80,6.2,3.84,1,11,15,9,50,2023-01-11,False,False,False,False,False,False


In [39]:
subsequent_flights['Previous Flight Status'].replace("Early",0,inplace=True)
subsequent_flights['Previous Flight Status'].replace("Late",1,inplace=True)
subsequent_flights['Previous Flight Status'].replace("On-time",2,inplace=True)
subsequent_flights.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
23,9E,5224,JFK,SYR,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,0,0,18,1,16,35,2023-01-10
22,9E,5009,JFK,SYR,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,2,0,23,54,22,30,2023-01-10
27,9E,5224,JFK,SYR,100,-6.2,0.0,1004,63,1019,0.0,0.0,0.0,80,7.6,6.2,1,1,18,1,16,35,2023-01-11
26,9E,5009,JFK,SYR,25,-5.0,0.0,1003,72,1018,0.0,-0.6,0.0,70,10.8,3.6,1,1,23,54,22,30,2023-01-11
31,9E,5224,JFK,SYR,100,2.7,4.0,992,92,1007,0.0,3.9,0.0,70,9.6,4.09,2,0,18,1,16,35,2023-01-12


In [40]:
subsequent_flights = pd.get_dummies(subsequent_flights, drop_first = True)
subsequent_flights.head()

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
23,5224,100,-7.9,0.0,1004,68,1019,0.0,-2.8,0.0,310,3.6,3.6,0,0,18,1,16,35,2023-01-10,False,False,False,False,False,False
22,5009,25,-10.0,0.0,1006,77,1021,0.0,-6.7,0.0,300,2.1,2.1,2,0,23,54,22,30,2023-01-10,False,False,False,False,False,False
27,5224,100,-6.2,0.0,1004,63,1019,0.0,0.0,0.0,80,7.6,6.2,1,1,18,1,16,35,2023-01-11,False,False,False,False,False,False
26,5009,25,-5.0,0.0,1003,72,1018,0.0,-0.6,0.0,70,10.8,3.6,1,1,23,54,22,30,2023-01-11,False,False,False,False,False,False
31,5224,100,2.7,4.0,992,92,1007,0.0,3.9,0.0,70,9.6,4.09,2,0,18,1,16,35,2023-01-12,False,False,False,False,False,False


In [41]:
print(len(first_flights.columns))
print(len(subsequent_flights.columns))

25
26


In [42]:
X_train, X_test, y_train, y_test = train_test_split(first_flights.drop(columns= ['Date','Status'], axis=1), first_flights['Status'], stratify = first_flights['Status'], test_size=0.20, random_state = 35)
X_train.head()
X_test.head()
y_train.head()
y_test.head()


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
2479,3319,87,3.9,0.0,988,68,1003,0.0,9.6,0.9,250,9.0,3.35,16,12,13,15,False,True,False,False,False,True
5395,333,100,11.0,1.0,997,89,1011,0.0,12.8,0.0,205,6.9,2.35,21,11,18,15,False,False,True,False,False,True
1815,3518,87,-5.3,0.0,998,39,1013,0.0,7.8,0.5,40,3.35,3.35,16,15,13,21,False,True,False,False,False,True
4603,491,100,-4.4,1.0,992,57,1006,0.0,3.3,1.8,270,14.4,11.8,13,40,11,0,False,False,False,True,True,False
5289,1559,50,16.4,0.0,1001,49,1016,0.0,28.2,2.3,330,5.2,2.6,16,50,13,55,False,False,True,False,False,True


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
1174,5183,100,12.0,0.0,999,66,1014,0.0,18.4,1.9,180,4.4,1.95,12,14,10,59,False,False,False,False,False,False
4834,1094,25,-4.1,0.0,1008,56,1023,0.0,3.9,0.0,120,7.6,2.6,20,59,18,10,False,False,True,False,False,True
4039,398,100,7.0,0.0,998,44,1013,0.0,19.5,0.5,145,8.9,3.85,17,44,16,30,True,False,False,False,False,False
4923,538,100,2.8,0.0,983,82,997,0.0,5.6,0.0,270,15.4,10.8,21,34,18,40,False,False,True,False,False,True
3543,116,25,7.8,0.0,1003,22,1017,0.0,32.2,7.0,155,6.4,0.64,14,42,13,30,True,False,False,False,False,False


2479    2
5395    2
1815    0
4603    1
5289    0
Name: Status, dtype: int64

1174    2
4834    0
4039    0
4923    2
3543    0
Name: Status, dtype: int64

In [43]:
#scaling the data using standard scaler

if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
    X_train
    X_test
    y_train
    y_test

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
2479,0.30,0.35,-0.04,-0.21,-1.60,0.30,-1.54,-0.10,-0.22,-0.19,0.46,0.45,-0.28,0.51,-0.87,-0.32,-0.85,-0.59,2.19,-0.55,-0.26,-0.41,1.20
5395,-1.21,0.81,0.70,0.84,-0.44,1.48,-0.55,-0.10,0.09,-0.78,-0.05,-0.17,-0.71,1.24,-0.92,0.71,-0.85,-0.59,-0.46,1.80,-0.26,-0.41,1.20
1815,0.40,0.35,-1.00,-0.21,-0.31,-1.33,-0.30,-0.10,-0.39,-0.45,-1.90,-1.20,-0.28,0.51,-0.70,-0.32,-0.52,-0.59,2.19,-0.55,-0.26,-0.41,1.20
4603,-1.13,0.81,-0.91,0.84,-1.09,-0.32,-1.17,-0.10,-0.83,0.40,0.68,2.03,3.36,0.07,0.67,-0.74,-1.66,-0.59,-0.46,-0.55,3.82,2.45,-0.83
5289,-0.59,-0.94,1.27,-0.21,0.07,-0.76,0.08,-0.10,1.59,0.73,1.35,-0.66,-0.60,0.51,1.22,-0.32,1.32,-0.59,-0.46,1.80,-0.26,-0.41,1.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2356,0.75,0.81,0.99,-0.21,-0.19,0.24,-0.30,-0.10,0.80,-0.39,0.85,0.07,0.10,0.65,1.00,-0.12,1.38,-0.59,2.19,-0.55,-0.26,-0.41,1.20
2584,0.40,0.81,-0.69,-0.21,1.62,0.36,1.58,-0.10,-0.88,-0.45,0.57,0.04,-0.17,0.36,1.55,-0.32,-1.66,-0.59,2.19,-0.55,-0.26,-0.41,1.20
1705,1.15,-0.31,-1.32,-0.21,1.49,0.30,1.58,-0.10,-1.47,-0.78,0.17,-0.49,-0.49,-1.84,-0.04,1.54,1.05,-0.59,-0.46,-0.55,-0.26,-0.41,-0.83
3854,-1.05,0.81,1.55,-0.21,-0.31,0.58,-0.30,-0.10,1.21,0.21,0.62,0.22,-0.12,-0.37,1.22,-1.36,-1.28,1.68,-0.46,-0.55,-0.26,2.45,-0.83


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
1174,1.25,0.81,0.81,-0.21,-0.19,0.19,-0.17,-0.10,0.64,0.47,-0.33,-0.90,-0.88,-0.08,-0.76,-0.94,1.54,-0.59,-0.46,-0.55,-0.26,-0.41,-0.83
4834,-0.83,-1.81,-0.87,-0.21,0.97,-0.37,0.95,-0.10,-0.77,-0.78,-1.01,0.04,-0.60,1.09,1.72,0.71,-1.12,-0.59,-0.46,1.80,-0.26,-0.41,1.20
4039,-1.18,0.81,0.28,-0.21,-0.31,-1.05,-0.30,-0.10,0.75,-0.45,-0.72,0.42,-0.06,0.65,0.89,0.30,-0.03,1.68,-0.46,-0.55,-0.26,-0.41,-0.83
4923,-1.11,0.81,-0.15,-0.21,-2.24,1.08,-2.29,-0.10,-0.61,-0.78,0.68,2.32,2.93,1.24,0.34,0.71,0.51,-0.59,-0.46,1.80,-0.26,-0.41,1.20
3543,-1.32,-1.81,0.37,-0.21,0.33,-2.28,0.20,-0.10,1.98,3.82,-0.61,-0.31,-1.45,0.21,0.78,-0.32,-0.03,1.68,-0.46,-0.55,-0.26,-0.41,-0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4207,-1.05,0.67,1.06,-0.21,0.07,0.02,0.08,-0.10,0.97,-0.12,-1.62,0.98,-0.63,-0.37,-0.48,-1.57,0.24,1.68,-0.46,-0.55,-0.26,2.45,-0.83
1710,1.26,0.04,-1.04,-0.21,1.87,0.02,1.95,-0.10,-1.09,-0.19,-0.56,-0.66,-1.39,-0.23,-0.76,-1.15,1.05,-0.59,-0.46,-0.55,-0.26,-0.41,-0.83
3243,-1.38,0.81,-0.46,-0.21,-1.60,1.25,-1.67,-0.10,-0.94,-0.78,1.24,1.74,1.81,-1.84,-1.31,1.54,0.78,1.68,-0.46,-0.55,-0.26,-0.41,-0.83
1885,0.71,0.35,-0.95,-0.21,-0.06,-2.11,-0.05,-0.10,0.31,0.80,0.12,0.83,1.60,-0.23,-1.14,-1.36,-0.41,-0.59,2.19,-0.55,-0.26,-0.41,1.20


2479    2
5395    2
1815    0
4603    1
5289    0
       ..
2356    0
2584    0
1705    0
3854    0
89      2
Name: Status, Length: 2240, dtype: int64

1174    2
4834    0
4039    0
4923    2
3543    0
       ..
4207    2
1710    2
3243    2
1885    2
5495    1
Name: Status, Length: 561, dtype: int64

In [44]:
len(X_train)
len(y_train)
len(X_test)
len(y_test)

2240

2240

561

561

In [45]:
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import StratifiedKFold

# # Define classifiers with different parameters
# classifiers = {
#     'Random Forest': RandomForestClassifier(random_state=50, min_samples_leaf=6, max_features="sqrt", n_estimators=1000),
#     'Bagging': BaggingClassifier(base_estimator=RandomForestClassifier(random_state=50), n_estimators=100, max_samples=100),
#     'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),
#     'Logistic Regression': LogisticRegression(),
# #     'Support Vector Machine': SVC(),
# #     'K-Nearest Neighbors': KNeighborsClassifier()
# }

# # Initialize lists to store results
# results = {}
# std_devs = {}

# # Define cross-validation strategy
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

# # Perform cross-validation for each classifier
# for clf_name, clf in classifiers.items():
#     scores = cross_val_score(clf,X_train, y_train, cv=cv, scoring='accuracy')
#     results[clf_name] = scores.mean()
#     std_devs[clf_name] = scores.std()

# # Print results
# print("Mean Accuracy Scores:")
# for clf_name, mean_acc in results.items():
#     print(f"{clf_name}: {mean_acc:.4f} Â± {std_devs[clf_name]:.4f}")

# # Select classifier with highest mean accuracy
# best_clf = max(results, key=results.get)
# print(f"\nBest Classifier: {best_clf}")


<h6>used the above code to select the best model based on accuarcy score</h6>

In [46]:
# Define parameter grid
# param_grid = {
#     'n_estimators': [100, 1000, 150],
#     'max_depth': [None, 5, 10],
#     'min_samples_leaf': [1, 3, 5],
#     'max_features': [None, 'sqrt', 'log2']
# }

# best_accuracy = 0  # Initialize with 0
# best_params = None

# # Perform grid search
# for n_estimators in param_grid['n_estimators']:
#     for max_depth in param_grid['max_depth']:
#         for min_samples_leaf in param_grid['min_samples_leaf']:
#             for max_features in param_grid['max_features']:
#                 # Create Random Forest Classifier with current parameters
#                 rf = RandomForestClassifier(n_estimators=n_estimators, 
#                                              max_depth=max_depth, 
#                                              min_samples_leaf=min_samples_leaf,
#                                              max_features=max_features,
#                                              random_state=50)
        
#                 # Perform cross-validation to get predicted values
#                 y_pred = cross_val_predict(rf, X_train, y_train, cv=5)
        
#                 # Calculate accuracy
#                 accuracy = accuracy_score(y_train, y_pred)
        
#                 # Check if current parameters yield a better accuracy
#                 if accuracy > best_accuracy:
#                     best_accuracy = accuracy
#                     best_params = {'n_estimators': n_estimators, 
#                                    'max_depth': max_depth, 
#                                    'min_samples_leaf': min_samples_leaf,
#                                    'max_features': max_features}
        
#                 print(f"Parameters: n_estimators={n_estimators}, max_depth={max_depth}, min_samples_leaf={min_samples_leaf}, max_features={max_features}, Accuracy: {accuracy}")

# print("Best Parameters:", best_params)
# print("Best Accuracy:", best_accuracy)


<h6>Tried execution of selected model with different parameters for parameter tuning</h6>

In [47]:
rf_1 = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_leaf=3, max_features="sqrt", random_state=42)
rf_1 = rf_1.fit(X_train, y_train)

In [48]:
from sklearn.metrics import accuracy_score

# Predict the labels for the test data
y_pred_test = rf_1.predict(X_test)

# Calculate the accuracy score on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)

print("Accuracy on test data:", accuracy_test)

Accuracy on test data: 0.5383244206773619


In [49]:
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(subsequent_flights.drop(columns= ['Date', 'Status'], axis=1), subsequent_flights['Status'], stratify = subsequent_flights['Status'], test_size=0.20, random_state = 35)
X_train_sub.head()
X_test_sub.head()
y_train_sub.head()
y_test_sub.head()


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
829,5118,100,18.0,0.0,998,60,1012,0.0,26.4,2.3,135,4.8,2.04,1,15,23,13,59,False,False,False,False,False,False
2301,3545,0,15.4,0.0,1004,80,1019,0.0,18.9,0.0,115,4.2,1.65,2,23,11,20,20,False,True,False,False,False,True
677,4730,87,14.1,0.0,999,46,1014,0.0,26.6,2.0,180,11.3,8.25,1,16,25,15,10,False,False,False,False,False,False
1720,5090,100,-15.7,0.0,979,86,994,0.0,-13.9,0.0,220,13.4,8.8,2,23,58,22,30,False,False,False,False,False,False
3550,2516,87,13.8,0.0,998,61,1013,0.0,21.6,0.0,280,3.1,1.3,0,23,19,21,59,True,False,False,False,False,False


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
4289,16,87,-6.0,0.0,1000,58,1015,0.0,1.4,0.9,220,11.2,6.45,2,9,20,8,10,True,False,False,False,False,False
2096,3402,93,20.0,0.0,999,75,1013,0.0,24.7,0.0,305,2.8,2.6,2,21,32,18,36,False,True,False,False,False,True
5094,1503,100,20.4,1.5,996,84,1010,0.0,23.3,0.0,210,7.6,4.09,0,23,59,21,5,False,False,True,False,False,True
4472,656,25,-5.1,0.0,1013,50,1029,0.0,4.4,0.0,210,8.1,4.34,0,19,13,16,25,True,False,False,False,True,False
2250,3911,66,14.9,0.0,993,70,1007,0.0,20.6,0.0,240,8.4,4.09,1,20,0,17,13,False,True,False,False,False,True


829     2
2301    1
677     0
1720    1
3550    1
Name: Status, dtype: int64

4289    0
2096    0
5094    2
4472    1
2250    0
Name: Status, dtype: int64

In [50]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train_sub = pd.DataFrame(sc.fit_transform(X_train_sub), columns = X_train_sub.columns, index = X_train_sub.index)
    X_test_sub = pd.DataFrame(sc.transform(X_test_sub), columns = X_test_sub.columns, index = X_test_sub.index)
    X_train_sub
    X_test_sub
    y_train_sub
    y_test_sub

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
829,1.08,0.83,1.51,-0.09,-0.34,-0.21,-0.44,-0.12,1.49,1.10,-0.82,-0.66,-0.77,0.31,-0.82,-0.37,-0.85,1.75,-0.81,-0.47,-0.24,0.00,-0.40,-0.56
2301,0.26,-2.50,1.23,-0.09,0.43,0.86,0.42,-0.12,0.75,-0.62,-1.04,-0.84,-0.95,1.57,1.14,-0.98,0.86,-0.47,-0.81,2.11,-0.24,0.00,-0.40,1.78
677,0.88,0.39,1.09,-0.09,-0.21,-0.96,-0.20,-0.12,1.51,0.88,-0.31,1.25,2.06,0.31,-0.58,-0.27,-0.36,-1.05,-0.81,-0.47,-0.24,0.00,-0.40,-0.56
1720,1.07,0.83,-2.15,-0.09,-2.75,1.18,-2.67,-0.12,-2.52,-0.62,0.13,1.86,2.32,1.57,1.14,1.41,1.36,0.10,-0.81,-0.47,-0.24,0.00,-0.40,-0.56
3550,-0.28,0.39,1.06,-0.09,-0.34,-0.16,-0.32,-0.12,1.01,-0.62,0.80,-1.16,-1.11,-0.95,1.14,-0.57,1.11,1.75,1.23,-0.47,-0.24,0.00,-0.40,-0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,0.45,0.39,1.73,-0.09,-0.72,1.39,-0.82,-0.12,1.02,-0.62,0.69,-1.60,-1.52,0.31,0.40,-1.54,0.13,-0.88,-0.81,2.11,-0.24,0.00,-0.40,1.78
3435,-0.28,-0.84,0.01,-0.09,0.43,-0.27,0.42,-0.12,0.05,-0.62,-0.65,-0.78,-1.54,0.31,1.14,-0.57,1.11,1.75,1.23,-0.47,-0.24,0.00,-0.40,-0.56
2047,0.19,0.39,1.24,-0.09,-0.59,1.12,-0.69,-0.12,0.67,-0.62,-0.31,-1.25,-1.02,-0.95,0.65,0.09,0.37,0.44,-0.81,2.11,-0.24,0.00,-0.40,1.78
1438,1.12,0.83,-0.12,-0.09,0.68,-0.75,0.67,-0.12,0.18,0.13,0.64,-0.40,-1.54,-0.95,-1.56,-0.57,-1.59,1.75,-0.81,-0.47,-0.24,0.00,-0.40,-0.56


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
4289,-1.59,0.39,-1.09,-0.09,-0.08,-0.32,-0.07,-0.12,-1.00,0.06,0.13,1.22,1.24,1.57,-2.29,-0.52,-2.08,-1.05,1.23,-0.47,-0.24,0.00,-0.40,-0.56
2096,0.19,0.59,1.73,-0.09,-0.21,0.59,-0.32,-0.12,1.32,-0.62,1.08,-1.25,-0.52,1.57,0.65,0.09,0.37,0.44,-0.81,2.11,-0.24,0.00,-0.40,1.78
5094,-0.81,0.83,1.77,0.52,-0.59,1.07,-0.69,-0.12,1.18,-0.62,0.02,0.16,0.16,-0.95,1.14,1.46,1.11,-1.33,-0.81,-0.47,4.10,0.00,-0.40,1.78
4472,-1.25,-1.67,-1.00,-0.09,1.57,-0.75,1.66,-0.12,-0.70,-0.62,0.02,0.31,0.28,-0.95,0.16,-0.88,-0.12,-0.19,1.23,-0.47,-0.24,0.00,2.48,-0.56
2250,0.45,-0.31,1.17,-0.09,-0.97,0.32,-1.06,-0.12,0.91,-0.62,0.36,0.40,0.16,0.31,0.40,-1.54,0.13,-0.88,-0.81,2.11,-0.24,0.00,-0.40,1.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,1.02,-1.67,1.25,-0.09,0.68,1.07,0.55,-0.12,0.69,-0.62,-0.76,-1.45,-0.75,-0.95,1.14,1.46,1.36,0.27,-0.81,-0.47,-0.24,0.00,-0.40,-0.56
1540,0.95,0.83,-1.17,-0.09,1.06,-0.64,1.16,-0.12,-0.93,0.06,-1.37,0.69,0.57,0.31,-1.56,-0.62,-1.59,1.47,-0.81,-0.47,-0.24,0.00,-0.40,-0.56
3610,-0.28,0.39,0.98,-0.09,0.30,0.16,0.30,-0.12,0.79,-0.62,-1.60,-0.89,-0.43,1.57,1.14,-1.43,1.11,0.50,1.23,-0.47,-0.24,0.00,-0.40,-0.56
3277,-1.25,0.39,-0.14,-0.09,-1.74,0.22,-1.68,-0.12,-0.31,1.55,1.03,1.10,-0.29,0.31,-1.31,1.31,-1.34,-0.99,1.23,-0.47,-0.24,0.00,2.48,-0.56


829     2
2301    1
677     0
1720    1
3550    1
       ..
2192    1
3435    1
2047    1
1438    0
337     1
Name: Status, Length: 2248, dtype: int64

4289    0
2096    0
5094    2
4472    1
2250    0
       ..
1268    0
1540    2
3610    1
3277    0
3988    1
Name: Status, Length: 563, dtype: int64

In [51]:
rf_2 = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_leaf=3, max_features="sqrt", random_state=42)
rf_2 = rf_2.fit(X_train_sub, y_train_sub)

In [52]:
from sklearn.metrics import accuracy_score

# Predict the labels for the test data
y_pred_test_sub = rf_2.predict(X_test_sub)

# Calculate the accuracy score on the test data
accuracy_test = accuracy_score(y_test_sub, y_pred_test_sub)

print("Accuracy on test data:", accuracy_test)

Accuracy on test data: 0.5293072824156305


In [53]:
test_sub_output = pd.DataFrame(rf_2.predict(X_test_sub), index = X_test_sub.index, columns = ['pred_arr_status'])
test_sub_output.head(50)

Unnamed: 0,pred_arr_status
4289,2
2096,1
5094,1
4472,0
2250,1
2881,0
3357,1
1308,0
1668,0
1069,0


In [54]:
set(test_sub_output['pred_arr_status'])

{0, 1, 2}

In [55]:
value_counts = test_sub_output['pred_arr_status'].value_counts()
value_counts

pred_arr_status
0    319
1    208
2     36
Name: count, dtype: int64

In [56]:
# fetch data 
test_data = pd.read_csv('test_Flight_weather_data.csv')
test_data.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,Dest,clouds_des,temp_des,max_temp_des,min_temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,snow_depth_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM


In [57]:
test_data

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,Dest,clouds_des,temp_des,max_temp_des,min_temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,snow_depth_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM
5,4/19/24,B6,56,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM
6,4/20/24,UA,538,ORD,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
7,4/20/24,MQ,3402,ORD,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
8,4/20/24,B6,116,JFK,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,1:25 PM,2:41 PM
9,4/20/24,9E,5340,JFK,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM


In [58]:
OP_CARRIER_unique_values

array(['9E', 'MQ', 'B6', 'WN', 'UA'], dtype=object)

In [59]:
test_data.columns

Index(['Date', 'CarrierCode', 'FlightNumber', 'Origin', 'Dest', 'clouds_des',
       'temp_des', 'max_temp_des', 'min_temp_des', 'wind_dir_des',
       'wind_spd_des', 'wind_gust_spd_des', 'snow_rate_des', 'snow_depth_des',
       'precip_rate_des', 'pres_des', 'uv_des', 'dewpt_des', 'rh_des',
       'slp_des', 'Scheduled departure time', 'Scheduled Arrival Time'],
      dtype='object')

In [60]:
test_data.drop(columns=['Dest', 'max_temp_des', 'min_temp_des', 'snow_depth_des'],inplace=True)
test_data.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM


In [61]:
test_data['Date'] = pd.to_datetime(test_data['Date'])


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [62]:
test_data.dtypes

Date                        datetime64[ns]
CarrierCode                         object
FlightNumber                         int64
Origin                              object
clouds_des                           int64
temp_des                           float64
wind_dir_des                         int64
wind_spd_des                       float64
wind_gust_spd_des                  float64
snow_rate_des                        int64
precip_rate_des                    float64
pres_des                           float64
uv_des                             float64
dewpt_des                          float64
rh_des                               int64
slp_des                            float64
Scheduled departure time            object
Scheduled Arrival Time              object
dtype: object

In [63]:
pred_ORD = test_data[test_data['Origin'] == 'ORD']
pred_JFK = test_data[test_data['Origin'] == 'JFK']
pred_MCO = test_data[test_data['Origin'] == 'MCO']

In [64]:
pred_ORD

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
6,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
7,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
11,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM
12,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM
17,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM
18,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM


In [65]:
combined_df = pd.concat([pred_ORD, pred_JFK, pred_MCO], ignore_index=True)
combined_df.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
3,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
4,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM


In [66]:
combined_df

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
3,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
4,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM
5,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM
6,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM
7,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM
8,2024-04-19,B6,116,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
9,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM


In [67]:
combined_df['Scheduled Arrival Hour']= pd.to_datetime(combined_df['Scheduled Arrival Time']).dt.hour
combined_df['Scheduled Arrival Minutes']= pd.to_datetime(combined_df['Scheduled Arrival Time']).dt.minute

combined_df['Scheduled departure Hour']= pd.to_datetime(combined_df['Scheduled departure time']).dt.hour
combined_df['Scheduled departure Minutes']= pd.to_datetime(combined_df['Scheduled departure time']).dt.minute

combined_df.head()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM,21,47,18,52
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM,22,52,19,59
2,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM,21,47,18,52
3,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM,22,52,19,59
4,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM,21,47,18,52


In [68]:
combined_df.rename(columns={'Origin': 'Origin Airport'}, inplace=True)
combined_df.rename(columns={'CarrierCode': 'Carrier Code'}, inplace=True)
combined_df.rename(columns={'FlightNumber': 'Flight Number'}, inplace=True)
combined_df.drop(columns=['Date', 'Scheduled departure time', 'Scheduled Arrival Time'], inplace=True)
combined_df.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,21,47,18,52
1,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,22,52,19,59
2,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,21,47,18,52
3,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,22,52,19,59
4,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,21,47,18,52


In [69]:
combined_df = pd.get_dummies(combined_df, drop_first = True)
combined_df

Unnamed: 0,Flight Number,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,538,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,21,47,18,52,False,False,True,False,False,True
1,3402,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,22,52,19,59,False,True,False,False,False,True
2,538,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,21,47,18,52,False,False,True,False,False,True
3,3402,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,22,52,19,59,False,True,False,False,False,True
4,538,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,21,47,18,52,False,False,True,False,False,True
5,3402,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,22,52,19,59,False,True,False,False,False,True
6,538,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,21,47,18,52,False,False,True,False,False,True
7,3402,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,22,52,19,59,False,True,False,False,False,True
8,116,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,14,51,13,34,True,False,False,False,False,False
9,5340,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,16,21,14,55,False,False,False,False,False,False


In [70]:
#Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train.columns) - set(combined_df.columns)
for c in missing_cols:
    combined_df[c] = 0
combined_df = combined_df[X_train.columns]

In [71]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    combined_df = pd.DataFrame(sc.fit_transform(combined_df), columns = combined_df.columns, index = combined_df.index)
combined_df

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,-0.57,0.61,-1.44,1.57,0.13,-1.62,0.13,0.0,0.6,-1.38,0.33,-0.5,0.33,1.16,0.69,1.12,0.6,-0.73,-0.46,2.18,-0.39,-0.66,1.37
1,0.83,0.61,-1.44,1.57,0.13,-1.62,0.13,0.0,0.6,-1.38,0.33,-0.5,0.33,1.47,1.06,1.48,1.11,-0.73,2.18,-0.46,-0.39,-0.66,1.37
2,-0.57,-1.71,-0.36,0.04,1.57,0.34,1.59,0.0,-1.89,-0.0,1.62,0.11,-0.27,1.16,0.69,1.12,0.6,-0.73,-0.46,2.18,-0.39,-0.66,1.37
3,0.83,-1.71,-0.36,0.04,1.57,0.34,1.59,0.0,-1.89,-0.0,1.62,0.11,-0.27,1.47,1.06,1.48,1.11,-0.73,2.18,-0.46,-0.39,-0.66,1.37
4,-0.57,1.0,1.12,-0.99,-0.12,1.0,-0.15,0.0,0.38,-0.0,-1.08,-1.11,-1.4,1.16,0.69,1.12,0.6,-0.73,-0.46,2.18,-0.39,-0.66,1.37
5,0.83,1.0,1.12,-0.99,-0.12,1.0,-0.15,0.0,0.38,-0.0,-1.08,-1.11,-1.4,1.47,1.06,1.48,1.11,-0.73,2.18,-0.46,-0.39,-0.66,1.37
6,-0.57,-0.19,0.63,-0.62,-1.32,0.34,-1.3,0.0,0.6,1.38,-0.6,1.53,1.3,1.16,0.69,1.12,0.6,-0.73,-0.46,2.18,-0.39,-0.66,1.37
7,0.83,-0.19,0.63,-0.62,-1.32,0.34,-1.3,0.0,0.6,1.38,-0.6,1.53,1.3,1.47,1.06,1.48,1.11,-0.73,2.18,-0.46,-0.39,-0.66,1.37
8,-0.78,0.61,-1.44,1.57,0.13,-1.62,0.13,0.0,0.6,-1.38,0.33,-0.5,0.33,-1.01,0.99,-0.65,-0.72,1.37,-0.46,-0.46,-0.39,-0.66,-0.73
9,1.79,0.61,-1.44,1.57,0.13,-1.62,0.13,0.0,0.6,-1.38,0.33,-0.5,0.33,-0.39,-1.22,-0.29,0.82,-0.73,-0.46,-0.46,-0.39,-0.66,-0.73


In [72]:
# Predict the labels for the test data
y_pred_test = rf_1.predict(combined_df) 
status_mapping = {0: 'Early', 1: 'Late', 2: 'On-time'}
y_pred_labels = [status_mapping[pred] for pred in y_pred_test]
# Print each label with its corresponding index
for index, label in enumerate(y_pred_labels):
    print(f"Index {index}: {label}")

Index 0: Early
Index 1: Early
Index 2: Early
Index 3: Early
Index 4: Early
Index 5: Early
Index 6: Early
Index 7: Early
Index 8: Early
Index 9: Early
Index 10: Late
Index 11: Early
Index 12: Early
Index 13: Early
Index 14: Early
Index 15: Early
Index 16: Early
Index 17: Early
Index 18: Late
Index 19: Early
Index 20: Late
Index 21: Early
Index 22: Early


In [73]:
def calculate_previous_flight_status(group):
    # Check if date is the same within the group, shift by 1 to compare with previous row
    group['Previous Flight Status'] = group['Date'].eq(group['Date'].shift(1))
    return group

In [74]:
pred_ORD

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
6,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
7,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
11,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM
12,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM
17,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM
18,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM


In [75]:
#filtering the data and setting previous flight status
pred_ORD.sort_values(by=['Date','Scheduled Arrival Time'])
pred_ORD = pred_ORD.groupby(["Date"]).apply(calculate_previous_flight_status)
# pred_ORD.dropna(subset=["Previous Flight Status"], inplace=True)


pred_JFK.sort_values(by=['Date','Scheduled Arrival Time'], inplace=True)
pred_JFK = pred_JFK.groupby(["Date"]).apply(calculate_previous_flight_status)
# pred_JFK.dropna(subset=["Previous Flight Status"], inplace=True)


pred_MCO.sort_values(by=['Date','Scheduled Arrival Time'], inplace=True)
pred_MCO = pred_MCO.groupby(["Date"]).apply(calculate_previous_flight_status)
# pred_MCO.dropna(subset=["Previous Flight Status"], inplace=True)

pred_ORD = pred_ORD[pred_ORD['Previous Flight Status']]
pred_JFK = pred_JFK[pred_JFK['Previous Flight Status']]
pred_MCO = pred_MCO[pred_MCO['Previous Flight Status']]

pred_MCO.isna().sum()
pred_JFK.isna().sum()
pred_ORD.isna().sum()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
7,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
6,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
12,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM
11,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM
18,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM
17,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

In [76]:
pred_ORD

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM,True
2024-04-20,7,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM,True
2024-04-21,12,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM,True
2024-04-22,18,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM,True


In [77]:
pred_JFK

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,3,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True
2024-04-20,9,2024-04-20,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM,True
2024-04-21,14,2024-04-21,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2:55 PM,4:21 PM,True
2024-04-22,20,2024-04-22,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2:55 PM,4:21 PM,True


In [78]:
pred_MCO

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,5,2024-04-19,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM,True
2024-04-21,16,2024-04-21,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1:35 PM,4:25 PM,True
2024-04-22,22,2024-04-22,B6,56,MCO,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,1:34 PM,4:25 PM,True


In [79]:
combined_df_prev = pd.concat([pred_ORD, pred_JFK, pred_MCO], ignore_index=True)

In [80]:
combined_df_prev

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
0,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM,True
1,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM,True
2,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM,True
3,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM,True
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True
5,2024-04-20,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM,True
6,2024-04-21,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2:55 PM,4:21 PM,True
7,2024-04-22,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2:55 PM,4:21 PM,True
8,2024-04-19,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM,True
9,2024-04-21,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1:35 PM,4:25 PM,True


In [81]:
combined_df_prev['Scheduled Arrival Hour']= pd.to_datetime(combined_df_prev['Scheduled Arrival Time']).dt.hour
combined_df_prev['Scheduled Arrival Minutes']= pd.to_datetime(combined_df_prev['Scheduled Arrival Time']).dt.minute

combined_df_prev['Scheduled departure Hour']= pd.to_datetime(combined_df_prev['Scheduled departure time']).dt.hour
combined_df_prev['Scheduled departure Minutes']= pd.to_datetime(combined_df_prev['Scheduled departure time']).dt.minute

combined_df_prev.head()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM,True,22,52,19,59
1,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM,True,22,52,19,59
2,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM,True,22,52,19,59
3,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM,True,22,52,19,59
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True,16,21,14,55


In [82]:
set(combined_df_prev['CarrierCode'])

{'9E', 'B6', 'MQ'}

In [83]:
combined_df_prev

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM,True,22,52,19,59
1,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM,True,22,52,19,59
2,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM,True,22,52,19,59
3,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM,True,22,52,19,59
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True,16,21,14,55
5,2024-04-20,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM,True,16,21,14,55
6,2024-04-21,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2:55 PM,4:21 PM,True,16,21,14,55
7,2024-04-22,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2:55 PM,4:21 PM,True,16,21,14,55
8,2024-04-19,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM,True,16,25,13,35
9,2024-04-21,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1:35 PM,4:25 PM,True,16,25,13,35


In [84]:
combined_df_prev.rename(columns={'Origin': 'Origin Airport'}, inplace=True)

In [85]:
combined_df_prev.rename(columns={'CarrierCode': 'Carrier Code'}, inplace=True)

In [86]:
combined_df_prev.rename(columns={'FlightNumber': 'Flight Number'}, inplace=True)

In [87]:
combined_df_prev.drop(columns=['Date', 'Scheduled departure time', 'Scheduled Arrival Time'], inplace=True)

In [88]:
subsequent_flights.columns

Index(['Flight Number', 'clouds_des', 'dewpt_des', 'precip_rate_des',
       'pres_des', 'rh_des', 'slp_des', 'snow_rate_des', 'temp_des', 'uv_des',
       'wind_dir_des', 'wind_gust_spd_des', 'wind_spd_des', 'Status',
       'Previous Flight Status', 'Scheduled Arrival Hour',
       'Scheduled Arrival Minutes', 'Scheduled departure Hour',
       'Scheduled departure Minutes', 'Date', 'Carrier Code_B6',
       'Carrier Code_MQ', 'Carrier Code_UA', 'Carrier Code_WN',
       'Origin Airport_MCO', 'Origin Airport_ORD'],
      dtype='object')

In [89]:
combined_df_prev.columns

Index(['Carrier Code', 'Flight Number', 'Origin Airport', 'clouds_des',
       'temp_des', 'wind_dir_des', 'wind_spd_des', 'wind_gust_spd_des',
       'snow_rate_des', 'precip_rate_des', 'pres_des', 'uv_des', 'dewpt_des',
       'rh_des', 'slp_des', 'Previous Flight Status', 'Scheduled Arrival Hour',
       'Scheduled Arrival Minutes', 'Scheduled departure Hour',
       'Scheduled departure Minutes'],
      dtype='object')

In [90]:
combined_df_prev.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,True,22,52,19,59
1,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,True,22,52,19,59
2,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,True,22,52,19,59
3,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,True,22,52,19,59
4,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,True,16,21,14,55


In [91]:
# L_flight_data_merged_pred = pd.get_dummies(L_flight_data_merged_pred, drop_first = True)

In [92]:
#here we assume Previous Flight Status to be one value at a time and make predictions
combined_df_prev['Previous Flight Status'] = 1
L_flight_data_merged_pred = combined_df_prev.copy()

combined_df_prev['Previous Flight Status'] = 0
E_flight_data_merged_pred = combined_df_prev.copy()

combined_df_prev['Previous Flight Status'] = 2
O_flight_data_merged_pred = combined_df_prev.copy()

In [93]:
L_flight_data_merged_pred

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1,22,52,19,59
1,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,1,22,52,19,59
2,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1,22,52,19,59
3,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,1,22,52,19,59
4,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1,16,21,14,55
5,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,1,16,21,14,55
6,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1,16,21,14,55
7,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,1,16,21,14,55
8,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1,16,25,13,35
9,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1,16,25,13,35


In [94]:
E_flight_data_merged_pred

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,0,22,52,19,59
1,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,0,22,52,19,59
2,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,0,22,52,19,59
3,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,0,22,52,19,59
4,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,0,16,21,14,55
5,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,0,16,21,14,55
6,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,0,16,21,14,55
7,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,0,16,21,14,55
8,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,0,16,25,13,35
9,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,0,16,25,13,35


In [95]:
O_flight_data_merged_pred

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2,22,52,19,59
1,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2,22,52,19,59
2,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2,22,52,19,59
3,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2,22,52,19,59
4,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2,16,21,14,55
5,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2,16,21,14,55
6,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2,16,21,14,55
7,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2,16,21,14,55
8,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2,16,25,13,35
9,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2,16,25,13,35


In [96]:
L_flight_data_merged_pred = pd.get_dummies(L_flight_data_merged_pred, drop_first = True)

In [97]:
#Ensuring the test dataset has the same dummy columns as the training dataset
missing_cols = set(X_train_sub.columns) - set(L_flight_data_merged_pred.columns)
for c in missing_cols:
    L_flight_data_merged_pred[c] = 0
L_flight_data_merged_pred = L_flight_data_merged_pred[X_train_sub.columns]

In [98]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    L_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(L_flight_data_merged_pred), columns = L_flight_data_merged_pred.columns, index = L_flight_data_merged_pred.index)
L_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,0.1,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
1,0.1,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
2,0.1,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
3,0.1,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
4,1.03,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
5,1.03,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
6,1.03,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
7,1.03,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
8,-1.5,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76
9,-1.5,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76


In [99]:
status_mapping = {0: 'Early', 1: 'Late', 2: 'On-time'}

In [100]:
# Predict the labels for the test data
y_pred_test_sub_L_flight = rf_2.predict(L_flight_data_merged_pred) 
y_pred_test_sub_L_flight_labels = [status_mapping[pred] for pred in y_pred_test_sub_L_flight]
# Print each label with its corresponding index
for index, label in enumerate(y_pred_test_sub_L_flight_labels):
    print(f"Index {index}: {label}")

Index 0: Late
Index 1: Late
Index 2: Late
Index 3: Late
Index 4: Early
Index 5: Early
Index 6: Early
Index 7: Early
Index 8: Early
Index 9: Late
Index 10: Late


In [101]:
E_flight_data_merged_pred = pd.get_dummies(E_flight_data_merged_pred, drop_first = True)

In [102]:
#Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train_sub.columns) - set(E_flight_data_merged_pred.columns)
for c in missing_cols:
    E_flight_data_merged_pred[c] = 0
E_flight_data_merged_pred = E_flight_data_merged_pred[X_train_sub.columns]


In [103]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    E_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(E_flight_data_merged_pred), columns = E_flight_data_merged_pred.columns, index = E_flight_data_merged_pred.index)
E_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,0.1,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
1,0.1,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
2,0.1,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
3,0.1,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
4,1.03,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
5,1.03,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
6,1.03,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
7,1.03,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
8,-1.5,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76
9,-1.5,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76


In [104]:
# Predict the labels for the test data
y_pred_test_sub_E_flight = rf_2.predict(E_flight_data_merged_pred)
y_pred_test_sub_E_flight_labels = [status_mapping[pred] for pred in y_pred_test_sub_E_flight]
# Print each label with its corresponding index
for index, label in enumerate(y_pred_test_sub_E_flight_labels):
    print(f"Index {index}: {label}")

Index 0: Late
Index 1: Late
Index 2: Late
Index 3: Late
Index 4: Early
Index 5: Early
Index 6: Early
Index 7: Early
Index 8: Early
Index 9: Late
Index 10: Late


In [105]:
O_flight_data_merged_pred = pd.get_dummies(O_flight_data_merged_pred, drop_first = True)

In [106]:
#Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train_sub.columns) - set(O_flight_data_merged_pred.columns)
for c in missing_cols:
    O_flight_data_merged_pred[c] = 0
O_flight_data_merged_pred = O_flight_data_merged_pred[X_train_sub.columns]


In [107]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    O_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(O_flight_data_merged_pred), columns = O_flight_data_merged_pred.columns, index = O_flight_data_merged_pred.index)
O_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,0.1,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
1,0.1,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
2,0.1,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
3,0.1,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.31,0.8,-0.61,1.32,0.0,0.0,-0.61,1.32
4,1.03,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
5,1.03,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
6,1.03,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
7,1.03,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.87,-0.59,0.41,-0.61,-0.76,0.0,0.0,-0.61,-0.76
8,-1.5,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76
9,-1.5,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.59,-0.96,-1.58,1.63,-0.76,0.0,0.0,1.63,-0.76


In [108]:
# Predict the labels for the test data
y_pred_test_sub_O_flight = rf_2.predict(O_flight_data_merged_pred) 
y_pred_test_sub_O_flight_labels = [status_mapping[pred] for pred in y_pred_test_sub_O_flight]
# Print each label with its corresponding index
for index, label in enumerate(y_pred_test_sub_O_flight_labels):
    print(f"Index {index}: {label}")

Index 0: Late
Index 1: Late
Index 2: Late
Index 3: Late
Index 4: Early
Index 5: Early
Index 6: Early
Index 7: Early
Index 8: Early
Index 9: Late
Index 10: Late
