In [1]:
#Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from cross_validation import validation_pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Load Dataset

df = pd.read_csv('Dataset.csv')

features = df.iloc[:, 1:]
labels = df['Is_Open']

In [3]:
#Convert features and labels to numpy arrays
X = np.array(features)
y = np.array(labels)

#10-Fold Cross Validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

#Apply Training/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
#Random Forest model
random_forest = RandomForestClassifier(criterion='entropy', bootstrap=True, max_samples = 0.8, 
                                       random_state=1, min_samples_split = 500)

In [5]:
#Perform 10-Fold cross validation by calling validation_pipeline defined cross_validation.py
validation_accuracy = validation_pipeline(random_forest, cv, X_train, y_train)

Fold 1 accuracy: 0.7086447957463575
Fold 2 accuracy: 0.7081910714561706
Fold 3 accuracy: 0.7080338243622029
Fold 4 accuracy: 0.7030688190039929
Fold 5 accuracy: 0.7030543082234102
Fold 6 accuracy: 0.7019452046348676
Fold 7 accuracy: 0.7031157965760821
Fold 8 accuracy: 0.7066794795478021
Fold 9 accuracy: 0.7052685863844794
Fold 10 accuracy: 0.6933782240070234


In [6]:
print('Random-Forest Validation Accuracy: ' + str(validation_accuracy))

Random-Forest Validation Accuracy: 0.7041380109942388


In [7]:
#SMOTE that will be used to upsample training set.
smote = SMOTENC(categorical_features=[2,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,
            23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44], random_state=42)

#Upsample training data, will be used for training model for prediction on test set
X_train_upsample, y_train_upsample = smote.fit_resample(X_train, y_train)

#fit model
random_forest.fit(X_train_upsample, y_train_upsample)

#predictions
pred = random_forest.predict(X_test)

#vanilla accuracy
test_accuracy_vanilla = accuracy_score(y_test, pred)

#balanced accuracy
test_accuracy_balanced = balanced_accuracy_score(y_test, pred)

In [8]:
print("Random-Forest Vanilla Test Accuracy: " + str(test_accuracy_vanilla))

Random-Forest Vanilla Test Accuracy: 0.8264811986089896


In [9]:
print("Random-Forest Balanced Test Accuracy: " + str(test_accuracy_balanced))

Random-Forest Balanced Test Accuracy: 0.7055528682623735


In [10]:
#Open Businesses
recall_open = recall_score(y_test, pred, pos_label=1)
precision_open = precision_score(y_test, pred, pos_label=1)
f1_open = f1_score(y_test, pred, pos_label=1)

print("Recall: " + str(recall_open))
print("Precision: " + str(precision_open))
print("F1: " + str(f1_open))

Recall: 0.8959650168952494
Precision: 0.8922406967537608
F1: 0.8940989784786274


In [11]:
#Closed Businesses
recall_closed = recall_score(y_test, pred, pos_label=0)
precision_closed = precision_score(y_test, pred, pos_label=0)
f1_closed = f1_score(y_test, pred, pos_label=0)

print("Recall: " + str(recall_closed))
print("Precision: " + str(precision_closed))
print("F1: " + str(f1_closed))

Recall: 0.5151407196294977
Precision: 0.524959157741877
F1: 0.5200035961521171
