In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

In [2]:
# Loads the data into a dataframe
df = pd.read_csv('./Jan_2019_ontime.csv')
df2 = pd.read_csv('./Jan_2020_ontime.csv')
df.append(df2)
df.shape

(583985, 22)

In [3]:
# Creates a standard baseline binary classifier accuracy by using all of our avaliable data
df.ARR_DEL15.value_counts()

0.0    460741
1.0    105222
Name: ARR_DEL15, dtype: int64

In [4]:
base_accuracy = 460741 / (460741 + 105222)
base_error = 1 - base_accuracy
print('Baseline Accuracy: ', base_accuracy)
print('Baseline Error: ', base_error)

Baseline Accuracy:  0.8140832527921437
Baseline Error:  0.18591674720785634


In [5]:
# Chooses a random sample of 5% of the data to learn on
df = df.sample(frac=0.05)
df = df.drop(['Unnamed: 21', 'OP_UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'OP_CARRIER_AIRLINE_ID',
             'DEP_TIME_BLK', 'DEP_DEL15', 'ARR_TIME', 'CANCELLED', 'DIVERTED'], axis=1)
df = df.dropna()
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,ARR_DEL15,DISTANCE
427685,23,3,DL,N901DE,2381,BHM,ATL,1136.0,0.0,134.0
571453,31,4,DL,N858DZ,1403,ATL,TUS,2011.0,0.0,1541.0
281219,15,2,OO,N936SW,3521,ATL,FWA,1339.0,0.0,508.0
155154,9,3,YV,N518LR,6055,IAD,LGA,1250.0,0.0,229.0
288593,16,3,YV,N923FJ,5793,ELP,DFW,922.0,1.0,551.0


In [6]:
# Splits data into test and training sets
X = df.drop('ARR_DEL15', axis=1)
y = df.ARR_DEL15
X.OP_CARRIER_FL_NUM = X.OP_CARRIER_FL_NUM.astype('category')
X.DAY_OF_WEEK = X.DAY_OF_WEEK.astype('category')
X.DAY_OF_MONTH = X.DAY_OF_MONTH.astype('category')
X.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,DISTANCE
427685,23,3,DL,N901DE,2381,BHM,ATL,1136.0,134.0
571453,31,4,DL,N858DZ,1403,ATL,TUS,2011.0,1541.0
281219,15,2,OO,N936SW,3521,ATL,FWA,1339.0,508.0
155154,9,3,YV,N518LR,6055,IAD,LGA,1250.0,229.0
288593,16,3,YV,N923FJ,5793,ELP,DFW,922.0,551.0


In [7]:
# One-hot-encodes categorical features and standardizes numerical features
X_num = X[['DEP_TIME', 'DISTANCE']]
X_num = (X_num - X_num.mean())/X_num.std()
X_cat = X.drop(['DEP_TIME', 'DISTANCE'], axis=1)
X_cat = pd.get_dummies(X_cat)
X = X_num.join(X_cat)
X.head()

Unnamed: 0,DEP_TIME,DISTANCE,DAY_OF_MONTH_1,DAY_OF_MONTH_2,DAY_OF_MONTH_3,DAY_OF_MONTH_4,DAY_OF_MONTH_5,DAY_OF_MONTH_6,DAY_OF_MONTH_7,DAY_OF_MONTH_8,...,DEST_TYR,DEST_TYS,DEST_UIN,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YUM
427685,-0.392906,-1.127796,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
571453,1.377538,1.233325,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281219,0.017837,-0.500178,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
155154,-0.162242,-0.968374,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
288593,-0.825906,-0.428018,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Creates a logistic regression on the sample and cross-validates categorical features
lr = LogisticRegression()
results = cross_validate(lr, X, y, return_train_score=True, n_jobs=-1, error_score='raise')

In [10]:
logistic_training_accuracy = results['train_score'].mean()
logistic_validation_accuracy = results['test_score'].mean()
print("Logistic Regressor w/ Cross-Valdation:")
print("Training accuracy:", logistic_training_accuracy)
print("Validation accuracy:", logistic_validation_accuracy)

Logistic Regressor w/ Cross-Valdation:
Training accuracy: 0.8620500622027384
Validation accuracy: 0.8083744595286605


In [12]:
results['train_score']

array([0.8635519 , 0.86164941, 0.86257853, 0.86094766, 0.86152281])

In [13]:
results['test_score']

array([0.80640595, 0.80375155, 0.81136082, 0.80938053, 0.81097345])