In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
# Loads the data into a dataframe
df = pd.read_csv('./Jan_2019_ontime.csv')
df.shape

(583985, 22)

In [3]:
# Drops column with no data and reduces the data to just a small sample for establishing a baseline
df = df.sample(frac=0.05)
df = df.drop('Unnamed: 21', axis=1)
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
577588,31,4,WN,19393,WN,N937WN,541,11697,1169706,FLL,...,1530402,TPA,2138.0,1.0,2000-2059,2232.0,1.0,0.0,0.0,197.0
331253,18,5,OO,20304,OO,N760EV,3206,11298,1129806,DFW,...,1244807,JAN,1625.0,0.0,1600-1659,1741.0,0.0,0.0,0.0,408.0
476012,26,6,WN,19393,WN,N965WN,4835,13796,1379608,OAK,...,1467903,SAN,1248.0,0.0,1200-1259,1405.0,0.0,0.0,0.0,446.0
77855,5,6,WN,19393,WN,N8687A,3119,11292,1129202,DEN,...,1323202,MDW,1517.0,0.0,1500-1559,1818.0,0.0,0.0,0.0,895.0
375909,21,1,AA,19805,AA,N193UW,1893,12953,1295304,LGA,...,1105703,CLT,1452.0,0.0,1400-1459,1711.0,0.0,0.0,0.0,544.0


In [4]:
# Since OP_CARRIER and OP_UNIQUE_CARRIER columns are identical, we can remove one of them
df = df.drop('OP_UNIQUE_CARRIER', axis=1)
df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
577588,31,4,19393,WN,N937WN,541,11697,1169706,FLL,15304,1530402,TPA,2138.0,1.0,2000-2059,2232.0,1.0,0.0,0.0,197.0
331253,18,5,20304,OO,N760EV,3206,11298,1129806,DFW,12448,1244807,JAN,1625.0,0.0,1600-1659,1741.0,0.0,0.0,0.0,408.0
476012,26,6,19393,WN,N965WN,4835,13796,1379608,OAK,14679,1467903,SAN,1248.0,0.0,1200-1259,1405.0,0.0,0.0,0.0,446.0
77855,5,6,19393,WN,N8687A,3119,11292,1129202,DEN,13232,1323202,MDW,1517.0,0.0,1500-1559,1818.0,0.0,0.0,0.0,895.0
375909,21,1,19805,AA,N193UW,1893,12953,1295304,LGA,11057,1105703,CLT,1452.0,0.0,1400-1459,1711.0,0.0,0.0,0.0,544.0


In [5]:
df = df.dropna()

In [6]:
# Calculates a baseline error for predicting whether a flight's arrival is delayed for more than 15 minutes
df.ARR_DEL15.value_counts()

0.0    23068
1.0     5203
Name: ARR_DEL15, dtype: int64

In [7]:
# Our baseline accuracy is ~0.816 and baseline error is ~0.184
baseline_accuracy = 23068 / (23068 + 5203)
baseline_accuracy

0.8159598174808107

In [8]:
baseline_error = 1 - baseline_accuracy
baseline_error

0.1840401825191893

In [9]:
# Removes some unnessecary features for creating a simple classifier
X = df.drop(['ORIGIN_AIRPORT_ID','ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'OP_CARRIER_AIRLINE_ID',
             'DEP_TIME_BLK', 'DEP_DEL15', 'ARR_DEL15', 'ARR_TIME', 'CANCELLED', 'DIVERTED'], axis=1)
y = df.ARR_DEL15
X.DAY_OF_MONTH = X.DAY_OF_MONTH.astype('category')
X.DAY_OF_WEEK = X.DAY_OF_WEEK.astype('category')
X.OP_CARRIER_FL_NUM = X.OP_CARRIER_FL_NUM.astype('category')
X.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,DEP_TIME,DISTANCE
577588,31,4,WN,N937WN,541,FLL,TPA,2138.0,197.0
331253,18,5,OO,N760EV,3206,DFW,JAN,1625.0,408.0
476012,26,6,WN,N965WN,4835,OAK,SAN,1248.0,446.0
77855,5,6,WN,N8687A,3119,DEN,MDW,1517.0,895.0
375909,21,1,AA,N193UW,1893,LGA,CLT,1452.0,544.0


In [10]:
# One-hot-encodes categorical features
X = pd.get_dummies(X)
X.head()

Unnamed: 0,DEP_TIME,DISTANCE,DAY_OF_MONTH_1,DAY_OF_MONTH_2,DAY_OF_MONTH_3,DAY_OF_MONTH_4,DAY_OF_MONTH_5,DAY_OF_MONTH_6,DAY_OF_MONTH_7,DAY_OF_MONTH_8,...,DEST_TYR,DEST_TYS,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_XNA,DEST_YAK,DEST_YUM
577588,2138.0,197.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
331253,1625.0,408.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
476012,1248.0,446.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77855,1517.0,895.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
375909,1452.0,544.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Logistic regressor training accuracy
X_train = X.iloc[0:20000,:]
X_test = X.iloc[20001:28271,:]
y_train = y[0:20000]
y_test = y[20001:28271]

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.81995

In [19]:
# Logistic regressor test accuracy
lr.score(X_test, y_test)

0.8207980652962515