In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl

In [2]:
classifier = pkl.load(open('rfc.pkl', 'rb'))
regressor = pkl.load(open('xgb_reg.pkl', 'rb'))

In [3]:
type(classifier)

sklearn.ensemble._forest.RandomForestClassifier

In [4]:
type(regressor)

xgboost.sklearn.XGBRegressor

In [5]:
df = pd.read_csv('merged.csv', index_col = [0])

In [6]:
df.columns

Index(['FlightDate', 'Quarter', 'Year', 'Month', 'Origin', 'Dest',
       'DayofMonth', 'DepTime', 'DepDel15', 'CRSDepTime', 'DepDelayMinutes',
       'OriginAirportID', 'DestAirportID', 'ArrTime', 'CRSArrTime', 'ArrDel15',
       'ArrDelayMinutes', 'rounded_time', 'windspeedKmph', 'winddirDegree',
       'weatherCode', 'precipMM', 'visibility', 'pressure', 'cloudcover',
       'DewPointF', 'WindGustKmph', 'tempF', 'WindChillF', 'humidity', 'date',
       'time', 'airport_code'],
      dtype='object')

In [7]:
df = df.drop(columns = ['airport_code','rounded_time','date','FlightDate','OriginAirportID', 'DestAirportID','WindChillF', 'CRSArrTime', 'ArrTime', 'time', 'Quarter'])

In [8]:
df.columns

Index(['Year', 'Month', 'Origin', 'Dest', 'DayofMonth', 'DepTime', 'DepDel15',
       'CRSDepTime', 'DepDelayMinutes', 'ArrDel15', 'ArrDelayMinutes',
       'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'humidity'],
      dtype='object')

In [9]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
df["Origin"] = labelEncoder.fit_transform(df["Origin"])
df["Dest"] = labelEncoder.fit_transform(df["Dest"])

In [10]:
X = df.drop(columns = ['ArrDel15', 'ArrDelayMinutes'])

In [11]:
y = df[['ArrDel15', 'ArrDelayMinutes']]

In [12]:
y.head()

Unnamed: 0,ArrDel15,ArrDelayMinutes
0,0.0,8.0
1,1.0,24.0
2,0.0,0.0
3,0.0,10.0
4,0.0,0.0


In [13]:
from sklearn.metrics import accuracy_score,  mean_absolute_error, classification_report,  r2_score, mean_absolute_error, mean_squared_error

In [14]:
def run_models(X, y):
    #Classification
    y_pred = classifier.predict(X)
    print("Accuracy of Classification: ", accuracy_score(y_pred, y['ArrDel15']))
    print(classification_report(y_pred, y['ArrDel15']))
    df['y_pred'] = y_pred
    
    #Regression
    reg_df = df[df['y_pred'] == 1.0]
    #reg_df = reg_df.drop('y_pred', axis = 1)
    reg_df = reg_df[reg_df['ArrDelayMinutes'] > 15]
    X = reg_df.drop(columns = ['ArrDel15', 'ArrDelayMinutes', 'y_pred'])
    y = reg_df[['ArrDel15', 'ArrDelayMinutes']]
    ypred = regressor.predict(X)
    print("mae: ", mean_absolute_error(ypred, y['ArrDelayMinutes']))
    print("mse: ", mean_squared_error(ypred, y['ArrDelayMinutes']))
    mse = mean_squared_error(ypred, y['ArrDelayMinutes'])
    rmse = mse ** 0.5  
    print("RMSE: ", rmse)
    print("R2 score: ", r2_score(ypred, y['ArrDelayMinutes']))
    
    #reg_df.rename(columns = {'y_pred' : 'ArrDel15 (Predicted)'})
    reg_df['ArrDelayMinutes (Predicted)'] = ypred
    print(reg_df[['y_pred', 'ArrDelayMinutes (Predicted)']])
    #print(classification_report(ypred, y['ArrDelayMinutes']))
    return reg_df

In [15]:
result = run_models(X, y)

Accuracy of Classification:  0.9853657661848181
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99   1435992
         1.0       0.99      0.94      0.97    404282

    accuracy                           0.99   1840274
   macro avg       0.99      0.97      0.98   1840274
weighted avg       0.99      0.99      0.99   1840274

mae:  11.3249318141593
mse:  269.07079402311916
RMSE:  16.40337751876482
R2 score:  0.9461960603118135
         y_pred  ArrDelayMinutes (Predicted)
1           1.0                    33.247444
9           1.0                    33.721954
13          1.0                    32.271927
14          1.0                    26.371428
17          1.0                    27.499187
...         ...                          ...
1840262     1.0                    57.617500
1840264     1.0                    30.766113
1840265     1.0                    48.450542
1840271     1.0                    29.498631
1840272     1.0            

In [16]:
result[['y_pred', 'ArrDelayMinutes (Predicted)']]

Unnamed: 0,y_pred,ArrDelayMinutes (Predicted)
1,1.0,33.247444
9,1.0,33.721954
13,1.0,32.271927
14,1.0,26.371428
17,1.0,27.499187
...,...,...
1840262,1.0,57.617500
1840264,1.0,30.766113
1840265,1.0,48.450542
1840271,1.0,29.498631


In [17]:
result

Unnamed: 0,Year,Month,Origin,Dest,DayofMonth,DepTime,DepDel15,CRSDepTime,DepDelayMinutes,ArrDel15,...,precipMM,visibility,pressure,cloudcover,DewPointF,WindGustKmph,tempF,humidity,y_pred,ArrDelayMinutes (Predicted)
1,2016,1,13,14,1,759.0,1.0,724,35.0,1.0,...,0.0,10,1030,0,23,8,34,66,1.0,33.247444
9,2016,1,13,7,1,745.0,1.0,705,40.0,1.0,...,0.0,10,1030,0,23,8,34,66,1.0,33.721954
13,2016,1,13,8,2,720.0,1.0,655,25.0,1.0,...,0.0,10,1024,0,24,8,36,64,1.0,32.271927
14,2016,1,13,8,2,730.0,0.0,730,0.0,1.0,...,0.0,10,1024,0,24,8,36,64,1.0,26.371428
17,2016,1,13,0,2,745.0,0.0,745,0.0,1.0,...,0.0,10,1024,0,24,8,36,64,1.0,27.499187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1840262,2017,9,6,8,21,2358.0,1.0,2245,73.0,1.0,...,0.0,10,1015,96,64,26,72,75,1.0,57.617500
1840264,2017,9,9,6,22,2021.0,1.0,1954,27.0,1.0,...,0.6,9,1013,20,73,24,79,80,1.0,30.766113
1840265,2017,9,9,4,22,2240.0,1.0,2140,60.0,1.0,...,0.0,9,1013,19,73,23,78,84,1.0,48.450542
1840271,2017,9,9,6,24,1948.0,0.0,1954,0.0,1.0,...,0.0,10,1012,14,71,19,82,71,1.0,29.498631


##### Regression Results Analysis

In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
result['ArrDelayMinutes']

1          24.0
9          37.0
13         46.0
14         21.0
17         22.0
           ... 
1840262    51.0
1840264    18.0
1840265    57.0
1840271    38.0
1840272    44.0
Name: ArrDelayMinutes, Length: 368603, dtype: float64

In [19]:
#15 - 100
r1 = result[(result['ArrDelayMinutes'] >= 15) & (result['ArrDelayMinutes'] <= 100)]
mae = mean_absolute_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
mse = mean_squared_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
rmse = mse ** 0.5  
r2 = r2_score(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-Squared: ", r2)


r1.shape

MAE:  10.034320833911403
MSE:  175.99924482257902
RMSE:  13.26647069957112
R-Squared:  0.6412034566054876


(305532, 24)

In [20]:
#100 - 200
r1 = result[(result['ArrDelayMinutes'] >= 100) & (result['ArrDelayMinutes'] <= 200)]
mae = mean_absolute_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
mse = mean_squared_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
rmse = mse ** 0.5  
r2 = r2_score(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-Squared: ", r2)
r1.shape

MAE:  16.80788731630153
MSE:  638.7694904465857
RMSE:  25.27388949977003
R-Squared:  0.14188487583696208


(48859, 24)

In [21]:
#200 - 500
r1 = result[(result['ArrDelayMinutes'] >= 200) & (result['ArrDelayMinutes'] <= 500)]
mae = mean_absolute_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
mse = mean_squared_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
rmse = mse ** 0.5  
r2 = r2_score(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-Squared: ", r2)
r1.shape

MAE:  19.361672549383194
MSE:  885.2081790742596
RMSE:  29.752448287061345
R-Squared:  0.7975807899118941


(14187, 24)

In [22]:
#500 - 1000
r1 = result[(result['ArrDelayMinutes'] >= 500) & (result['ArrDelayMinutes'] <= 1000)]
mae = mean_absolute_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
mse = mean_squared_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
rmse = mse ** 0.5  
r2 = r2_score(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-Squared: ", r2)
r1.shape

MAE:  23.667469628423238
MSE:  1400.4657565670586
RMSE:  37.42279728410289
R-Squared:  0.9292506844622952


(1112, 24)

In [23]:
#1000 - 2000
r1 = result[(result['ArrDelayMinutes'] >= 1000) & (result['ArrDelayMinutes'] <= 2000)]
mae = mean_absolute_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
mse = mean_squared_error(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])
rmse = mse ** 0.5  
r2 = r2_score(r1['ArrDelayMinutes'], r1['ArrDelayMinutes (Predicted)'])

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-Squared: ", r2)
r1.shape

MAE:  39.87918672947525
MSE:  4550.512600875939
RMSE:  67.45748735963961
R-Squared:  0.8329290617942275


(173, 24)

In [24]:
new = pd.read_csv('merged.csv')

In [25]:
new.shape

(1840274, 34)