# Setting Up Variable Codings and Data Partitions

In [8]:
#Import Needed Packages
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import statistics as stat
from sklearn import metrics
from sklearn.metrics import roc_auc_score

#read in the data
df = pd.read_csv(r'C:\Users\Home\Documents\Data Mining\Assignments\Assignment 4\HW4_FlightDelays.csv')

#drop "Weather" as it is not an ex-ante predictor
df.drop('Weather', axis=1, inplace=True)

#Group variables into a list based on type (there are no numeric variables in this data set)
cvar_list = ['Binned_CRS_DEP_TIME','CARRIER','DEST','ORIGIN','DAY_WEEK','Flight Status']

#Creating Dummies for Categorical Variables
df2 = df.copy()
df2[cvar_list] = df[cvar_list].astype('category')
df2 = pd.get_dummies(df2, prefix_sep = '_')

#Finding mode of each column so we know what redundant dummy to drop 
# I am skipping finding the mode for "flight status", as I know my event of interest is "Yes" and, as such, I will drop the "No" equivalents)
time_mode = stat.multimode(df['Binned_CRS_DEP_TIME'])
carrier_mode = stat.multimode(df['CARRIER'])
dest_mode = stat.multimode(df['DEST'])
origin_mode = stat.multimode(df['ORIGIN'])
day_mode = stat.multimode(df['DAY_WEEK'])

delay_rdummy = 'Flight Status_On-time'

#remove one "redundant dummy", per each set of dummies
rdummies = ['Binned_CRS_DEP_TIME_'+str(time_mode[0]), 'CARRIER_'+carrier_mode[0],'DEST_'+dest_mode[0],'ORIGIN_'+origin_mode[0],'DAY_WEEK_'+str(day_mode[0]),delay_rdummy]
df3 = df2.copy()
df3 = df2.drop(columns=rdummies)

#Data Partition:
#Splitting the data into our partitions will return two dataframes, so we must prep like so:
testpart_size = .2
df_partition = df3

df_nontestdata, df_testdata = train_test_split(df_partition, test_size = testpart_size, random_state = 1)

# Logistic Regression Over Validation Partition

In [10]:
#Logistic Regression Analysis:
DV = 'Flight Status_Delayed'
y = df_nontestdata[DV]
x = df_nontestdata.drop(columns = [DV])

def summary_coef(model_object):
    n_predictors = x.shape[1]
    model_coef = pd.DataFrame(model_object.coef_.reshape(1,n_predictors),columns = x.columns.values)
    model_coef['Intercept'] = model_object.intercept_
    return (model_coef.transpose())

#Setup Logistic Regression with k-folds = 5
kfolds = 5

#Establishing alpha range for optimal logistic regression
min_alpha = .001
max_alpha = 100

#Because there are infinite values between min_alpha and max_alpha, we must specify how many alphas Python should look for
#Python will then divide that interval into an even number of searches. We need numpy for this
n_candidates = 1000
alpha_list= list(np.linspace(min_alpha, max_alpha, num = n_candidates))
c_list= list(1/np.linspace(min_alpha, max_alpha, num = n_candidates))

#Plug in classifier_optimal to our previous Logistic model to find the optimal predictors
classifier_optimal = LogisticRegressionCV(Cs = c_list,cv=kfolds,scoring = 'roc_auc',penalty = 'l1',solver='saga',max_iter=2000, random_state=1, n_jobs = -1).fit(x,y)
print(summary_coef(classifier_optimal))

#Find the optimal selected alpha
print('\n',"The optimal alpha over the validation partition is",1/classifier_optimal.C_[0])

# Get the AUC of the best model
# y_nontest_actual is the actual values of the DV in the validation partition
y_nontest_actual = df_nontestdata[DV]
# X_nontest is the predictor values in the validation partition
X_nontest = df_nontestdata.drop(columns=[DV])

print('The AUC in the validation partition is',roc_auc_score(y_nontest_actual, classifier_optimal.predict_proba(X_nontest)[:,1]))

                              0
Binned_CRS_DEP_TIME_1 -0.443884
Binned_CRS_DEP_TIME_2 -0.541951
Binned_CRS_DEP_TIME_3 -0.644623
Binned_CRS_DEP_TIME_4 -0.528231
Binned_CRS_DEP_TIME_5  0.317313
Binned_CRS_DEP_TIME_7  0.119848
Binned_CRS_DEP_TIME_8  0.281964
CARRIER_CO             0.450728
CARRIER_DL            -0.478140
CARRIER_MQ             0.597032
CARRIER_OH            -1.028996
CARRIER_RU             0.000000
CARRIER_UA            -0.007900
CARRIER_US            -1.082095
DEST_EWR               0.037587
DEST_JFK              -0.176243
ORIGIN_BWI             0.406243
ORIGIN_IAD             0.282862
DAY_WEEK_1             0.843737
DAY_WEEK_2             0.523882
DAY_WEEK_3             0.168618
DAY_WEEK_4            -0.161375
DAY_WEEK_6            -0.798983
DAY_WEEK_7             0.654297
Intercept             -0.829062

 The optimal alpha over the validation partition is 0.4013963963963964
The AUC in the validation partition is 0.7211603145996496


# Logistic Regression Over Test Partition

In [12]:
#Logistic Regression Analysis:
DV = 'Flight Status_Delayed'
y2 = df_testdata[DV]
x2 = df_testdata.drop(columns = [DV])

def summary_coef(model_object):
    n_predictors = x.shape[1]
    model_coef = pd.DataFrame(model_object.coef_.reshape(1,n_predictors),columns = x.columns.values)
    model_coef['Intercept'] = model_object.intercept_
    return (model_coef.transpose())

#Setup Logistic Regression with k-folds = 5
kfolds = 5

#Establishing alpha range for optimal logistic regression
min_alpha = .001
max_alpha = 100

#Because there are infinite values between min_alpha and max_alpha, we must specify how many alphas Python should look for
#Python will then divide that interval into an even number of searches. We need numpy for this
n_candidates = 1000
alpha_list= list(np.linspace(min_alpha, max_alpha, num = n_candidates))
c_list= list(1/np.linspace(min_alpha, max_alpha, num = n_candidates))

#Plug in classifier_optimal to our previous Logistic model to find the optimal predictors
classifier_optimal2 = LogisticRegressionCV(Cs = c_list,cv=kfolds,scoring = 'roc_auc',penalty = 'l1',solver='saga',max_iter=2000, random_state=1, n_jobs = -1).fit(x2,y2)
print(summary_coef(classifier_optimal2))

#Find the optimal selected alpha
print('\n',"The optimal alpha over the test partition is",1/classifier_optimal2.C_[0])

# Get the AUC of the best model
# y_nontest_actual is the actual values of the DV in the test partition
y_test_actual = df_testdata[DV]
# X_nontest is the predictor values in the test partition
X_test = df_testdata.drop(columns=[DV])

print('The AUC in the test partition is',roc_auc_score(y_test_actual, classifier_optimal2.predict_proba(X_test)[:,1]))

                              0
Binned_CRS_DEP_TIME_1 -0.403519
Binned_CRS_DEP_TIME_2 -0.232250
Binned_CRS_DEP_TIME_3 -1.003762
Binned_CRS_DEP_TIME_4 -0.486730
Binned_CRS_DEP_TIME_5  0.000000
Binned_CRS_DEP_TIME_7  0.183454
Binned_CRS_DEP_TIME_8  0.000000
CARRIER_CO             0.000000
CARRIER_DL            -0.676930
CARRIER_MQ             0.335434
CARRIER_OH            -0.506590
CARRIER_RU             0.000000
CARRIER_UA             0.000000
CARRIER_US            -0.722948
DEST_EWR              -0.107451
DEST_JFK              -0.105360
ORIGIN_BWI             0.713569
ORIGIN_IAD             0.687136
DAY_WEEK_1             0.328891
DAY_WEEK_2             0.000000
DAY_WEEK_3            -0.188165
DAY_WEEK_4            -0.037408
DAY_WEEK_6            -0.829934
DAY_WEEK_7             0.796231
Intercept             -0.485924

 The optimal alpha over the test partition is 0.7016936936936937
The AUC in the test partition is 0.7265957446808511
