In [1]:
#Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from cross_validation_normalization import validation_norm_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Read the raw dataset from disk
df = pd.read_csv('Dataset.csv')

#Target column
labels = df['Is_Open']

#Model inputs: every column except the first one.
#NOTE(review): this only excludes the target if 'Is_Open' is the first column of
#Dataset.csv; otherwise the label remains among the features -- confirm.
features = df.iloc[:, 1:]

In [3]:
#Materialise the feature table and the label column as numpy arrays
X, y = np.array(features), np.array(labels)

#Hold out 20% of the rows as the test set (seeded so the split is repeatable)
#NOTE(review): the split is not stratified (no stratify=y); given that the training
#set is later upsampled with SMOTE, consider whether class ratios should be preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#Stratified 10-fold splitter, shuffled with a fixed seed; passed to the cross-validation call
cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

In [4]:
#KNN classifier: 450 neighbours, distance-weighted votes,
#Minkowski metric with p=2 (i.e. Euclidean distance)
knn = KNeighborsClassifier(
    n_neighbors=450,
    weights='distance',
    metric='minkowski',
    p=2,
)

In [8]:
#Run 10-fold cross validation on the training split using validation_norm_pipeline
#(imported from the cross_validation_normalization module)
validation_accuracy = validation_norm_pipeline(
    knn,
    cv,
    X_train,
    y_train,
)

Fold 1 accuracy: 0.7134264619334394
Fold 2 accuracy: 0.7203862738829334
Fold 3 accuracy: 0.7136763120233856
Fold 4 accuracy: 0.7148735953909844
Fold 5 accuracy: 0.7104429867068893
Fold 6 accuracy: 0.7059035860002232
Fold 7 accuracy: 0.7148703667280174
Fold 8 accuracy: 0.7161159646067496
Fold 9 accuracy: 0.7116796076331947
Fold 10 accuracy: 0.6989646141308147


In [7]:
#Report the value returned by the cross-validation call
print(f'KNN Validation Accuracy: {validation_accuracy}')

KNN Validation Accuracy: 0.7120339769036631


In [9]:
#SMOTE that will be used to upsample training set.
smote = SMOTENC(categorical_features=[2,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,
            23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44], random_state=42)

#Upsample training data, will be used for training model for prediction on test set
X_train_upsample, y_train_upsample = smote.fit_resample(X_train, y_train)

#Min-Max Normalization
scaler = MinMaxScaler()
        
#Fit on training set
scaler.fit(X_train_upsample)
        
#scale on training set
X_train_upsample = scaler.transform(X_train_upsample)
        
#scale the test dataset
X_test = scaler.transform(X_test)

In [10]:
#fit model
knn.fit(X_train_upsample, y_train_upsample)

#Predictions
pred = knn.predict(X_test)

#vanilla accuracy
test_accuracy_vanilla = accuracy_score(y_test, pred)

#balanced accuracy
test_accuracy_balanced = balanced_accuracy_score(y_test, pred)

In [11]:
#Report plain accuracy on the held-out test set
print(f'KNN Test Vanilla Accuracy: {test_accuracy_vanilla}')

KNN Test Vanilla Accuracy: 0.7745458090935682


In [12]:
#Report balanced accuracy on the held-out test set
print(f'KNN Test Balanced Accuracy: {test_accuracy_balanced}')

KNN Test Balanced Accuracy: 0.7082445729954864


In [13]:
#Open businesses: recall, precision and F1 with label 1 as the positive class
recall_open, precision_open, f1_open = (
    scorer(y_test, pred, pos_label=1)
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall_open}")
print(f"Precision: {precision_open}")
print(f"F1: {f1_open}")

Recall: 0.8126416219439475
Precision: 0.9018794670431484
F1: 0.8549382070638423


In [14]:
#Closed businesses: recall, precision and F1 with label 0 as the positive class
recall_closed, precision_closed, f1_closed = (
    scorer(y_test, pred, pos_label=0)
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall_closed}")
print(f"Precision: {precision_closed}")
print(f"F1: {f1_closed}")

Recall: 0.6038475240470252
Precision: 0.4183635690485005
F1: 0.4942771743092513
