In [3]:
#Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
from cross_validation_normalization import validation_norm_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [4]:
#Load Dataset

df = pd.read_csv('Dataset.csv')

features = df.iloc[:, 1:]
labels = df['Is_Open']

In [5]:
#Convert features and labels to numpy arrays
X = np.array(features)
y = np.array(labels)

#10-Fold Cross Validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

#Apply Training/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
#Logistic Regression Model
log = LogisticRegression(solver = 'saga', max_iter = 1000000000, multi_class='ovr')

In [9]:
validation_accuracy = validation_norm_pipeline(log, cv, X_train, y_train)

Fold 1 accuracy: 0.7021492577954531
Fold 2 accuracy: 0.7082555853108465
Fold 3 accuracy: 0.7093842304521718
Fold 4 accuracy: 0.6995834645172931
Fold 5 accuracy: 0.6961324943408574
Fold 6 accuracy: 0.7047387904657576
Fold 7 accuracy: 0.7090155617239164
Fold 8 accuracy: 0.7091905176864217
Fold 9 accuracy: 0.7012884956305514
Fold 10 accuracy: 0.6942688082544306


In [8]:
print('Logistic-Regression Validation Accuracy: ' + str(validation_accuracy))

Logistic-Regression Validation Accuracy: 0.7034007206177699


In [10]:
#SMOTE that will be used to upsample training set.
smote = SMOTENC(categorical_features=[2,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,
            23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44], random_state=42)


#Upsample training data, will be used for training model for prediction on test set
X_train_upsample, y_train_upsample = smote.fit_resample(X_train, y_train)


#Min-Max Normalization
scaler = MinMaxScaler()
        
#Fit on training set
scaler.fit(X_train_upsample)
        
#scale on training set
X_train_upsample = scaler.transform(X_train_upsample)
        
#scale the test dataset
X_test = scaler.transform(X_test)

In [11]:
#fit model
log.fit(X_train_upsample, y_train_upsample)

#predictions
pred = log.predict(X_test)

#vanilla accuracy
test_accuracy_vanilla = accuracy_score(y_test, pred)

#balanced accuracy
test_accuracy_balanced = balanced_accuracy_score(y_test, pred)

In [12]:
print('Logistic-Regression Test Vanilla Accuracy: ' + str(test_accuracy_vanilla))

Logistic-Regression Test Vanilla Accuracy: 0.7687282654619909


In [13]:
print('Logistic-Regression Test Balanced Accuracy: ' + str(test_accuracy_balanced))

Logistic-Regression Test Balanced Accuracy: 0.7035796515469426


In [14]:
#Open Businesses
recall_open = recall_score(y_test, pred, pos_label=1)
precision_open = precision_score(y_test, pred, pos_label=1)
f1_open = f1_score(y_test, pred, pos_label=1)

print("Recall: " + str(recall_open))
print("Precision: " + str(precision_open))
print("F1: " + str(f1_open))

Recall: 0.8061617968594713
Precision: 0.9005284426484302
F1: 0.8507362503670763


In [15]:
#Closed Businesses
recall_closed = recall_score(y_test, pred, pos_label=0)
precision_closed = precision_score(y_test, pred, pos_label=0)
f1_closed = f1_score(y_test, pred, pos_label=0)

print("Recall: " + str(recall_closed))
print("Precision: " + str(precision_closed))
print("F1: " + str(f1_closed))

Recall: 0.600997506234414
Precision: 0.408969696969697
F1: 0.48672821696480095
