In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate

In [2]:
# Loads the data into a dataframe
df = pd.read_csv('./Jan_2019_ontime.csv')
df.shape

(583985, 22)

In [3]:
# Chooses a random sample of 1% of the data to learn on and drops unnessecary rows and columns
df = df.sample(frac=0.05)
df = df.drop(['Unnamed: 21', 'OP_UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'OP_CARRIER_AIRLINE_ID',
             'DEP_TIME_BLK', 'DEP_DEL15', 'ARR_TIME', 'CANCELLED', 'DIVERTED'], axis=1)
df = df.dropna()
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,ARR_DEL15,DISTANCE
67631,4,5,YV,N86347,6114,OKC,IAH,906.0,1.0,395.0
225266,12,6,MQ,N254NN,4075,XNA,DFW,1428.0,0.0,280.0
188813,10,4,G4,229NV,862,PIE,CID,1135.0,0.0,1090.0
396318,22,2,WN,N8509U,1568,GEG,OAK,1619.0,0.0,723.0
551788,30,3,WN,N8676A,1226,FLL,MCO,1517.0,0.0,177.0


In [4]:
# Loads air traffic data into a dataframe and converts date to day of month
traffic_df = pd.read_csv('January 2019 - airport traffic data.csv')
traffic_df.Date = pd.to_datetime(traffic_df.Date).dt.day
traffic_df = traffic_df.rename(columns={"Date": "DAY_OF_MONTH", "Facility": "ORIGIN", "Total Operations": "TRAFFIC"})
traffic_df.head()

Unnamed: 0,DAY_OF_MONTH,ORIGIN,TRAFFIC
0,1,ABE,45
1,1,ABI,24
2,1,ABQ,154
3,1,ABY,24
4,1,ACK,55


In [5]:
# Adds origin airport traffic information
df = pd.merge(df, traffic_df, how="left", on=['DAY_OF_MONTH', 'ORIGIN'])
df = df.rename(columns={"TRAFFIC":"ORIGIN_TRAFFIC"})
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,ARR_DEL15,DISTANCE,ORIGIN_TRAFFIC
0,4,5,YV,N86347,6114,OKC,IAH,906.0,1.0,395.0,297.0
1,12,6,MQ,N254NN,4075,XNA,DFW,1428.0,0.0,280.0,54.0
2,10,4,G4,229NV,862,PIE,CID,1135.0,0.0,1090.0,524.0
3,22,2,WN,N8509U,1568,GEG,OAK,1619.0,0.0,723.0,188.0
4,30,3,WN,N8676A,1226,FLL,MCO,1517.0,0.0,177.0,855.0


In [6]:
# Adds destination airport traffic information - probably an easier way to do this, but its been far too long since I've had CSSE371
traffic_df = traffic_df.rename(columns={"ORIGIN": "DEST"})
df = pd.merge(df, traffic_df, how="left", on=['DAY_OF_MONTH', 'DEST'])
df = df.rename(columns={"TRAFFIC":"DEST_TRAFFIC"})
df = df.dropna()
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,ARR_DEL15,DISTANCE,ORIGIN_TRAFFIC,DEST_TRAFFIC
0,4,5,YV,N86347,6114,OKC,IAH,906.0,1.0,395.0,297.0,1398.0
1,12,6,MQ,N254NN,4075,XNA,DFW,1428.0,0.0,280.0,54.0,1434.0
2,10,4,G4,229NV,862,PIE,CID,1135.0,0.0,1090.0,524.0,114.0
3,22,2,WN,N8509U,1568,GEG,OAK,1619.0,0.0,723.0,188.0,652.0
4,30,3,WN,N8676A,1226,FLL,MCO,1517.0,0.0,177.0,855.0,924.0


In [7]:
# One-hot-encodes categorical features
X = df.drop('ARR_DEL15', axis=1)
X = pd.get_dummies(X)
y = df.ARR_DEL15
X.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,DEP_TIME,DISTANCE,ORIGIN_TRAFFIC,DEST_TRAFFIC,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,...,DEST_TRI,DEST_TTN,DEST_TUL,DEST_TUS,DEST_TVC,DEST_TWF,DEST_TXK,DEST_TYR,DEST_TYS,DEST_XNA
0,4,5,6114,906.0,395.0,297.0,1398.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,12,6,4075,1428.0,280.0,54.0,1434.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,4,862,1135.0,1090.0,524.0,114.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22,2,1568,1619.0,723.0,188.0,652.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30,3,1226,1517.0,177.0,855.0,924.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Grid search for number of estimators and depth on a random forest classifier
B = [1, 10, 100, 1000]
C = [1, 10, 100, 1000]
grid = {'n_estimators' : B, 'max_depth' : C}
rf = RandomForestClassifier()
rfCV = GridSearchCV(rf, param_grid=grid, return_train_score=True, n_jobs=-1)
rfCV.fit(X, y)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 10, 100, 1000],
                         'n_estimators': [1, 10, 100, 1000]},
             return_train_score=True)

In [9]:
print('Best hyperparameters: ', rfCV.best_params_)
print('Best training score: ', rfCV.cv_results_['mean_train_score'][rfCV.cv_results_['params'].index(rfCV.best_params_)])
print('Best validation score: ', rfCV.best_score_)

Best hyperparameters:  {'max_depth': 1000, 'n_estimators': 100}
Best training score:  0.9999819176348266
Best validation score:  0.8188853619072247
