In [1]:
#Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from cross_validation import validation_pipeline
from mixed_naive_bayes import MixedNB

In [2]:
#Load Dataset

df = pd.read_csv('Dataset.csv')

# Every column after the first is used as a feature. Take an explicit copy so
# that later column assignments on `features` modify an independent frame
# instead of a slice of `df` (avoids pandas' SettingWithCopyWarning).
# NOTE(review): this assumes 'Is_Open' is the first column of Dataset.csv;
# if it is not, the label would also be present in `features` -- confirm.
features = df.iloc[:, 1:].copy()

# Target labels (positive class 1 is scored as "open" further below).
labels = df['Is_Open']

In [3]:
# The ordinal categorical columns must be encoded starting at 0, so shift each
# of them down by one.
# NOTE: this cell is not idempotent -- re-running it subtracts 1 again, so run
# it exactly once after (re)loading the dataset.
for ordinal_column in ('Income_Category', 'Noise_Level', 'Price_Range'):
    features[ordinal_column] = features[ordinal_column] - 1

In [4]:
# Materialise the feature frame and the label series as standalone numpy arrays
# (explicit copies, detached from the pandas objects).
X = features.to_numpy(copy=True)
y = labels.to_numpy(copy=True)

In [5]:
# Hold out 20% of the samples as the final test set; the seed is fixed so the
# split is reproducible. The split itself is not stratified (no `stratify=`).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Shuffled, stratified 10-fold splitter used for cross validation.
cv = StratifiedKFold(shuffle=True, random_state=1, n_splits=10)

In [6]:
# Mixed Naive Bayes model. Columns 2, 4 and 7 through 44 (inclusive) of X are
# declared categorical; the remaining columns are not.
nb = MixedNB(categorical_features=[2, 4, *range(7, 45)])

In [8]:
#Perform 10-fold cross validation on the training split by calling validation_pipeline,
#which is imported from cross_validation.py. The recorded output shows one accuracy
#line per fold; the returned value is reported below as the validation accuracy.
#NOTE(review): whether upsampling is applied inside each fold is not visible from
#this notebook -- confirm in cross_validation.py.
validation_accuracy = validation_pipeline(nb, cv, X_train, y_train)

[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 1 accuracy: 0.6861873753245881
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 2 accuracy: 0.6899504519703469
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 3 accuracy: 0.6839300416274996
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 4 accuracy: 0.6915547891074217
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 5 accuracy: 0.6790184472541161
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 6 accuracy: 0.6881267880395163
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 7 accuracy: 0.6866351793694812
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
Fold 8 accuracy: 0.6902295159978655
[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2

In [8]:
# Report the cross-validation accuracy returned by validation_pipeline.
print(f'Naive-Bayes Validation Accuracy: {validation_accuracy}')

Naive-Bayes Validation Accuracy: 0.6864975827583985


In [9]:
# SMOTE-NC oversampler for the training set. It is given the same categorical
# column indices as the model: 2, 4 and 7 through 44 (inclusive).
smote = SMOTENC(random_state=42, categorical_features=[2, 4, *range(7, 45)])

# Only the training split is resampled; the test split is left untouched.
X_train_upsample, y_train_upsample = smote.fit_resample(X_train, y_train)

# Fit the model on the upsampled training data.
nb.fit(X_train_upsample, y_train_upsample)

# Predict on the held-out test split.
pred = nb.predict(X_test)

# Plain accuracy followed by balanced accuracy on the test split.
test_accuracy_vanilla, test_accuracy_balanced = (
    accuracy_score(y_test, pred),
    balanced_accuracy_score(y_test, pred),
)

[5 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]


In [10]:
# Report plain accuracy on the held-out test split.
print(f"Naive-Bayes Vanilla Test Accuracy: {test_accuracy_vanilla}")

Naive-Bayes Vanilla Test Accuracy: 0.6780525853943905


In [11]:
# Report balanced accuracy on the held-out test split.
print(f"Naive-Bayes Balanced-Test Accuracy: {test_accuracy_balanced}")

Naive-Bayes Balanced-Test Accuracy: 0.6830625575652542


In [12]:
#Open Businesses: score the test predictions with label 1 as the positive class
recall_open, precision_open, f1_open = (
    scorer(y_test, pred, pos_label=1)
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall_open}")
print(f"Precision: {precision_open}")
print(f"F1: {f1_open}")

Recall: 0.6751739216855496
Precision: 0.9073134248624393
F1: 0.7742170761726762


In [13]:
#Closed Businesses: score the test predictions with label 0 as the positive class
recall_closed, precision_closed, f1_closed = (
    scorer(y_test, pred, pos_label=0)
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall_closed}")
print(f"Precision: {precision_closed}")
print(f"F1: {f1_closed}")

Recall: 0.690951193444959
Precision: 0.32190871369294605
F1: 0.4391983695652174
