In [68]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime,timedelta
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [69]:
flight_data = pd.read_csv('mega_data.csv')
flight_data.head()

Unnamed: 0,Date (MM/DD/YYYY),delay_class,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
0,2022-01-01 23:10:00,severely late,23:10,0,1,0,0,1,0,0,...,-3.67,33.33,600.67,1.33,979.43,3.0,100.0,-5.6,86.33,9.67
1,2023-01-01 14:58:00,on-time,14:58,0,0,0,0,0,1,0,...,8.5,243.33,804.0,0.0,983.93,6.33,100.0,7.13,91.0,4.93
2,2023-01-01 23:14:00,on-time,23:14,1,0,0,0,0,1,0,...,6.1,228.33,748.0,0.0,987.23,2.67,75.0,5.1,93.33,1.97
3,2023-01-01 23:57:00,on-time,23:57,0,0,1,0,0,1,0,...,6.1,228.33,748.0,0.0,987.23,2.67,75.0,5.1,93.33,1.97
4,2022-01-02 23:10:00,late,23:10,0,1,0,0,0,0,0,...,-9.43,276.67,803.67,0.0,994.25,16.0,83.33,-12.17,80.33,1.8


In [70]:
flight_data.isna().sum()
flight_data = flight_data.dropna()

Date (MM/DD/YYYY)     0
delay_class           0
Arrival Time          0
EWR                   0
IAD                   0
ORD                   0
Monday                0
Saturday              0
Sunday                0
Thursday              0
Tuesday               0
Wednesday             0
wind_spd             38
temp                 38
wind_dir             38
weather              38
precip               38
pres                 38
vis                  38
clouds               38
dewpt                38
rh                   38
wind_gust_spd        38
dtype: int64

In [71]:
for index, row in flight_data.iterrows():
    flight_data.at[index, 'Arrival Time'] = int(row['Arrival Time'].split(':')[0])

In [72]:
X_train, X_test, y_train, y_test = train_test_split(flight_data.drop(columns = ['delay_class', 'Date (MM/DD/YYYY)']), flight_data['delay_class'], test_size=0.1, stratify = flight_data['delay_class'], random_state=4)

sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), 
                       columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), 
                      columns = X_test.columns, index = X_test.index)

X_train
X_test
y_train
y_test

Unnamed: 0,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
214,0.94,-0.22,1.74,-0.81,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,-1.55,1.07,0.33,-0.26,-0.20,-0.85,-0.15,-1.75,-0.51,1.60
892,-0.85,-0.22,-0.58,-0.81,2.45,-0.41,-0.42,-0.41,-0.41,-0.40,...,0.94,0.11,0.30,-0.26,0.78,0.07,-1.11,-0.16,-2.25,-0.10
592,0.94,-0.22,1.74,-0.81,2.45,-0.41,-0.42,-0.41,-0.41,-0.40,...,0.73,0.03,0.30,-0.26,-0.60,0.42,-1.11,1.13,0.90,-1.06
934,-0.63,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,-0.04,0.31,0.36,-0.26,-1.18,0.42,1.00,-1.12,-2.27,0.52
376,-0.85,-0.22,-0.58,-0.81,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,-0.01,-1.19,-4.14,6.32,0.26,-1.54,1.20,0.25,0.61,-0.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,0.94,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,1.04,-0.09,0.30,-0.26,-0.12,0.42,-1.11,0.77,-0.88,-0.85
619,-0.63,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,-0.41,2.46,-0.40,...,1.14,-0.46,0.34,-0.26,-0.08,0.42,0.04,0.96,-0.70,-0.89
664,0.94,-0.22,1.74,-0.81,-0.41,-0.41,-0.42,-0.41,-0.41,2.50,...,1.19,-0.38,0.30,-0.26,0.41,0.42,-1.11,1.63,0.88,-0.95
311,0.49,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,-0.41,-0.41,2.50,...,0.42,-0.30,-2.65,0.79,-2.53,0.07,1.20,0.92,1.30,0.75


Unnamed: 0,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
2,0.94,4.50,-0.58,-0.81,-0.41,-0.41,2.41,-0.41,-0.41,-0.40,...,-0.55,0.25,-0.47,-0.26,0.14,-4.20,0.62,-0.08,1.54,-1.06
813,0.94,-0.22,-0.58,1.24,-0.41,2.45,-0.42,-0.41,-0.41,-0.40,...,0.81,-0.30,0.30,-0.26,0.04,0.42,-1.11,1.00,0.25,-0.44
877,0.49,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,-0.41,-0.41,-0.40,...,0.30,0.27,0.36,-0.26,0.23,0.42,1.20,0.41,0.13,1.60
610,-0.85,-0.22,-0.58,-0.81,-0.41,2.45,-0.42,-0.41,-0.41,-0.40,...,1.38,0.38,0.33,-0.15,-0.23,0.19,-0.31,1.22,-0.66,1.33
702,-0.63,-0.22,-0.58,1.24,-0.41,-0.41,-0.42,-0.41,2.46,-0.40,...,1.20,-1.84,0.30,-0.26,0.35,0.42,-1.11,0.83,-1.10,-0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,-0.85,-0.22,-0.58,-0.81,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,1.14,0.15,0.36,-0.26,-0.03,0.42,1.10,1.09,-0.37,0.51
221,0.27,-0.22,-0.58,1.24,2.45,-0.41,-0.42,-0.41,-0.41,-0.40,...,-0.06,0.15,0.33,-0.26,0.61,0.42,-0.54,-0.52,-1.22,-0.57
704,-0.85,-0.22,-0.58,-0.81,-0.41,-0.41,-0.42,-0.41,-0.41,2.50,...,1.24,-2.05,0.32,-0.26,0.49,0.42,-0.44,0.85,-1.12,-0.83
149,0.94,-0.22,1.74,-0.81,-0.41,-0.41,-0.42,2.46,-0.41,-0.40,...,-1.58,1.60,-1.66,0.16,-0.58,-3.05,1.14,-1.37,0.88,1.12


214          on-time
892            early
592    severely late
934            early
376            early
           ...      
522            early
619            early
664          on-time
311    severely late
940            early
Name: delay_class, Length: 979, dtype: object

2            on-time
813            early
877          on-time
610          on-time
702            early
           ...      
735             late
221            early
704          on-time
149    severely late
16           on-time
Name: delay_class, Length: 109, dtype: object

In [81]:
# model = LogisticRegression(fit_intercept = True, solver='lbfgs', multi_class = 'auto', penalty = 'l2')
model = GradientBoostingClassifier(max_features=3,max_depth=8,n_estimators=10)
# max_features=3,max_depth=8,n_estimators=10
# model = RandomForestClassifier(criterion='gini')
# model = DecisionTreeClassifier()
# model = KNeighborsClassifier(n_neighbors=7)

# fit the model to the training data
model.fit(X_train, y_train)

model.score(X_train, y_train)

# make predictions on the test data
y_pred = model.predict(X_test)

# evaluate the model performance using accuracy score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

0.8345250255362615

               precision    recall  f1-score   support

        early       0.31      0.16      0.21        31
         late       0.00      0.00      0.00        13
      on-time       0.47      0.86      0.61        50
severely late       0.00      0.00      0.00        15

     accuracy                           0.44       109
    macro avg       0.19      0.26      0.20       109
 weighted avg       0.30      0.44      0.34       109

Accuracy: 0.44036697247706424
Precision: 0.303275827682489
Recall: 0.44036697247706424
F1 score: 0.338325090107304


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
test_data = pd.read_csv('test_data_merged.csv')
test_data.head()

Unnamed: 0,Date,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,Tuesday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
0,2023-04-21 10:00:00,10:00 AM,0,0,1,0,0,0,0,0,...,10.17,261.0,804.0,0.0,983.33,26.79,100.0,4.3,67.33,3.82
1,2023-04-21 04:50:00,4:50 PM,0,0,1,0,0,0,0,0,...,10.8,283.0,804.0,0.19,982.5,15.96,100.0,8.5,85.67,10.31
2,2023-04-21 09:34:00,9:34 PM,0,0,1,0,0,0,0,0,...,9.5,272.67,804.0,0.0,983.33,21.53,100.0,5.43,76.0,5.15
3,2023-04-22 10:00:00,10:00 AM,0,0,1,0,1,0,0,0,...,6.93,271.33,803.0,0.06,977.0,24.13,56.67,1.23,67.33,7.12
4,2023-04-22 04:50:00,4:50 PM,0,0,1,0,1,0,0,0,...,7.4,307.33,803.33,0.06,975.67,24.06,71.67,5.17,85.67,7.73


In [98]:
for index, row in test_data.iterrows():
    test_data.at[index, 'Arrival Time'] = int(str(str(row['Arrival Time']).split(' ')[0]).split(':')[0])

In [99]:
final_test = test_data.drop(columns = ['Date'], axis = 1)

In [100]:
sc = StandardScaler()
final_test = pd.DataFrame(sc.fit_transform(final_test), columns = final_test.columns, index = final_test.index)
final_test.head()

Unnamed: 0,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
0,0.92,-0.58,-0.58,1.29,-0.58,-0.58,-0.58,0.0,0.0,0.0,...,1.1,-0.51,0.43,-0.4,0.07,0.93,0.78,0.59,-0.84,-1.21
1,-0.78,-0.58,-0.58,1.29,-0.58,-0.58,-0.58,0.0,0.0,0.0,...,1.26,0.6,0.43,0.66,-0.11,-0.9,0.78,1.6,0.86,1.28
2,0.64,-0.58,-0.58,1.29,-0.58,-0.58,-0.58,0.0,0.0,0.0,...,0.93,0.08,0.43,-0.38,0.07,0.04,0.78,0.86,-0.03,-0.7
3,0.92,-0.58,-0.58,1.29,-0.58,1.73,-0.58,0.0,0.0,0.0,...,0.28,0.01,0.42,-0.05,-1.29,0.48,-0.5,-0.14,-0.84,0.06
4,-0.78,-0.58,-0.58,1.29,-0.58,1.73,-0.58,0.0,0.0,0.0,...,0.4,1.82,0.42,-0.05,-1.57,0.47,-0.05,0.8,0.86,0.3


In [105]:
test_output = pd.DataFrame(model.predict(final_test), index = final_test.index, columns = ['pred_arrival_delay'])
test_output = test_output.merge(test_data, left_index = True, right_index = True)

In [106]:
test_output

Unnamed: 0,pred_arrival_delay,Date,Arrival Time,EWR,IAD,ORD,Monday,Saturday,Sunday,Thursday,...,temp,wind_dir,weather,precip,pres,vis,clouds,dewpt,rh,wind_gust_spd
0,early,2023-04-21 10:00:00,10,0,0,1,0,0,0,0,...,10.17,261.0,804.0,0.0,983.33,26.79,100.0,4.3,67.33,3.82
1,on-time,2023-04-21 04:50:00,4,0,0,1,0,0,0,0,...,10.8,283.0,804.0,0.19,982.5,15.96,100.0,8.5,85.67,10.31
2,early,2023-04-21 09:34:00,9,0,0,1,0,0,0,0,...,9.5,272.67,804.0,0.0,983.33,21.53,100.0,5.43,76.0,5.15
3,early,2023-04-22 10:00:00,10,0,0,1,0,1,0,0,...,6.93,271.33,803.0,0.06,977.0,24.13,56.67,1.23,67.33,7.12
4,on-time,2023-04-22 04:50:00,4,0,0,1,0,1,0,0,...,7.4,307.33,803.33,0.06,975.67,24.06,71.67,5.17,85.67,7.73
5,on-time,2023-04-22 09:34:00,9,0,0,1,0,1,0,0,...,6.23,271.67,802.33,0.04,976.67,24.13,37.33,2.27,76.0,7.58
6,on-time,2023-04-23 10:00:00,10,0,0,1,0,0,1,0,...,2.0,278.0,610.0,0.1,985.17,8.59,100.0,-0.17,85.67,7.8
7,on-time,2023-04-23 04:55:00,4,0,0,1,0,0,1,0,...,1.67,275.0,804.0,0.0,981.5,24.13,100.0,-1.0,82.67,9.9
8,early,2023-04-23 09:34:00,9,0,0,1,0,0,1,0,...,1.73,275.0,610.0,0.04,984.33,11.95,100.0,-0.23,86.67,7.67
9,early,2023-04-24 10:00:00,10,0,0,1,1,0,0,0,...,5.8,284.0,801.0,0.0,990.33,24.13,18.33,-1.87,58.33,5.99


In [107]:
test_output.to_csv('final_prediction.csv', index=False)

In [76]:
# encoder = LabelEncoder()

# # fit the encoder to the categorical variable
# encoder.fit(flight_data['delay_class'])

# # transform the categorical variable into numerical variable
# flight_data['delay_class'] = encoder.transform(flight_data['delay_class'])

In [77]:
# X_train, X_test, y_train, y_test = train_test_split(flight_data.drop(columns = ['delay_class', 'Date (MM/DD/YYYY)']), flight_data['delay_class'], test_size=0.2, stratify = flight_data['delay_class'], random_state=50)

# sc = StandardScaler()
# X_train = pd.DataFrame(sc.fit_transform(X_train), 
#                        columns = X_train.columns, index = X_train.index)
# X_test = pd.DataFrame(sc.transform(X_test), 
#                       columns = X_test.columns, index = X_test.index)

In [78]:
# model = GradientBoostingRegressor(max_features=20,max_depth=20,n_estimators=20)
# model = RandomForestRegressor()

# # fit the model to the training data
# model.fit(X_train, y_train)

# model.score(X_train, y_train)

# # make predictions on the test data
# model.score(X_test, y_test)