In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

In [2]:
# Loads the data into a dataframe
df = pd.read_csv('./Jan_2019_ontime.csv')
df2 = pd.read_csv('./Jan_2020_ontime.csv')
df.append(df2)
df.shape

(583985, 22)

In [3]:
# Creates a standard baseline binary classifier accuracy by using all of our avaliable data
df.ARR_DEL15.value_counts()

0.0    460741
1.0    105222
Name: ARR_DEL15, dtype: int64

In [4]:
base_accuracy = 460741 / (460741 + 105222)
base_error = 1 - base_accuracy
print('Baseline Accuracy: ', base_accuracy)
print('Baseline Error: ', base_error)

Baseline Accuracy:  0.8140832527921437
Baseline Error:  0.18591674720785634


In [5]:
# Chooses a random sample of 5% of the data to learn on
df = df.sample(frac=0.05)
df = df.drop(['Unnamed: 21', 'OP_UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'OP_CARRIER_AIRLINE_ID',
             'DEP_TIME_BLK', 'DEP_DEL15', 'ARR_TIME', 'CANCELLED', 'DIVERTED'], axis=1)
df = df.dropna()
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,ARR_DEL15,DISTANCE
12606,1,2,UA,N14237,1100,ORD,SJC,1010.0,0.0,1829.0
140716,8,2,9E,N131EV,3440,LGA,MCI,1946.0,0.0,1107.0
32714,2,3,OO,N943SW,5630,DEN,MSO,748.0,0.0,679.0
542415,29,2,B6,N827JB,655,SYR,MCO,703.0,0.0,1053.0
541552,29,2,AA,N886NN,205,IAH,ORD,1308.0,0.0,925.0


In [6]:
# Splits data into test and training sets
X = df.drop('ARR_DEL15', axis=1)
y = df.ARR_DEL15
X.OP_CARRIER_FL_NUM = X.OP_CARRIER_FL_NUM.astype('category')
X.DAY_OF_WEEK = X.DAY_OF_WEEK.astype('category')
X.OP_CARRIER_FL_NUM = X.OP_CARRIER_FL_NUM.astype('category')
X.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,DISTANCE
12606,1,2,UA,N14237,1100,ORD,SJC,1010.0,1829.0
140716,8,2,9E,N131EV,3440,LGA,MCI,1946.0,1107.0
32714,2,3,OO,N943SW,5630,DEN,MSO,748.0,679.0
542415,29,2,B6,N827JB,655,SYR,MCO,703.0,1053.0
541552,29,2,AA,N886NN,205,IAH,ORD,1308.0,925.0


In [7]:
# One-hot-encodes categorical features and standardizes numerical features
X_num = X[['DEP_TIME', 'DISTANCE']]
X_num = (X_num - X_num.mean())/X_num.std()
X_cat = X.drop(['DEP_TIME', 'DISTANCE'], axis=1)
X_cat = pd.get_dummies(X_cat)
X = X_num.join(X_cat)
X.head()

Unnamed: 0,DEP_TIME,DISTANCE,DAY_OF_MONTH,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,...,DEST_TYS,DEST_UIN,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YAK,DEST_YUM
12606,-0.647864,1.716358,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
140716,1.23873,0.504776,8,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32714,-1.175949,-0.213447,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
542415,-1.266651,0.414159,29,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
541552,-0.047218,0.199363,29,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Creates a logistic regression on the sample and cross-validates categorical features
lr = LogisticRegression()
results = cross_validate(lr, X, y, return_train_score=True, n_jobs=-1, error_score='raise')

In [9]:
logistic_training_accuracy = results['train_score'].mean()
logistic_validation_accuracy = results['test_score'].mean()
print("Logistic Regressor w/ Cross-Valdation:")
print("Training accuracy:", logistic_training_accuracy)
print("Test accuracy:", logistic_validation_accuracy)

Logistic Regressor w/ Cross-Valdation:
Training accuracy: 0.8336755080007847
Test accuracy: 0.8053028290699114
