# Part 1: Importing Packages and Cleaning/Setting Up Data

In [4]:
#Import Needed Packages
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import statistics as stat
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

#read in the data
df = pd.read_csv(r'C:\Users\Home\Documents\Data Mining\Assignments\Assignment 4\HW4_FlightDelays.csv')

#drop "Weather" as it is not an ex-ante predictor
df.drop('Weather', axis=1, inplace=True)

#Group variables into a list based on type (there are no numeric variables in this data set)
cvar_list = ['Binned_CRS_DEP_TIME','CARRIER','DEST','ORIGIN','DAY_WEEK','Flight Status']

#Creating Dummies for Categorical Variables
df2 = df.copy()
df2[cvar_list] = df[cvar_list].astype('category')
df2 = pd.get_dummies(df2, prefix_sep = '_')

#remove redundant dummy for dependent variable only
delay_rdummy = 'Flight Status_On-time'
rdummies = [delay_rdummy]
df3 = df2.copy()
df3 = df2.drop(columns=rdummies)

#Data Partition:
#Splitting the data into our partitions will return two dataframes, so we must prep like so:
testpart_size = .2
df_partition = df3

df_nontestdata, df_testdata = train_test_split(df_partition, test_size = testpart_size, random_state = 1)

# Run KNN Process Over the Validation Partition

In [9]:
# Separate the predictor values and the DV values into X and y respectively
# Placeholder variable: DV
DV = 'Flight Status_Delayed'
y = df_nontestdata[DV]
X = df_nontestdata.drop(columns=[DV])

# Run Nearest Neighbors with k-fold cross validation with k=5
# Placeholder variable: kfolds
kfolds = 5

# Here we specify within which range of Ks we will search through
max_k = 200

param_grid = {'n_neighbors': list(range(1, max_k+1))}

# Set n_jobs to be -1 to run  on all CPU cores.
# The search criterion is to find the model that maximizes whatever the scoring function - for this case roc_auc - returns.
gridsearch = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), param_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
gridsearch.fit(X,y)
clf_bestKNN = gridsearch.best_estimator_

# Display optimal k
print('The optimal k in the validation partition is',clf_bestKNN.n_neighbors)

# y_nontest_actual is the actual values of the DV in the validation partition
y_nontest_actual = df_nontestdata[DV]

# X_test is the predictor values in the test partition
X_nontest = df_nontestdata.drop(columns=[DV])

# Get the AUC of the best model
print('The AUC in the validation partition is',roc_auc_score(y_nontest_actual, clf_bestKNN.predict_proba(X_nontest)[:,1]))

The optimal k in the validation partition is 28
The AUC in the validation partition is 0.7591232237328394


# Run KNN Process Over the Test Partition

In [10]:
# Separate the predictor values and the DV values into X and y respectively
# Placeholder variable: DV
DV = 'Flight Status_Delayed'
y2 = df_testdata[DV]
X2 = df_testdata.drop(columns=[DV])

# Run Nearest Neighbors with k-fold cross validation with k=5
# Placeholder variable: kfolds
kfolds = 5

# Here we specify within which range of Ks we will search through
max_k = 200

param_grid = {'n_neighbors': list(range(1, max_k+1))}

# Set n_jobs to be -1 to run  on all CPU cores.
# The search criterion is to find the model that maximizes whatever the scoring function - for this case roc_auc - returns.
gridsearch2 = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), param_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
gridsearch2.fit(X2,y2)
clf_bestKNN2 = gridsearch2.best_estimator_

# Display optimal k
print('The optimal k in the test partition is',clf_bestKNN2.n_neighbors)

# y_nontest_actual is the actual values of the DV in the validation partition
y_test_actual = df_testdata[DV]

# X_test is the predictor values in the test partition
X_test = df_testdata.drop(columns=[DV])

# Get the AUC of the best model
print('The AUC in the test partition is',roc_auc_score(y_test_actual, clf_bestKNN2.predict_proba(X_test)[:,1]))

The optimal k in the test partition is 34
The AUC in the test partition is 0.6995306633291615
