In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import our dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Import our input dataset

# Load the file
file_path = Path('FINAL_NYC_restaurants_full_database.csv')
# Read into a dataFrame
df = pd.read_csv(file_path)
# Show dataFrame
df.head()

Unnamed: 0,DBA,STREET,INCOME_LEVEL,BOROUGH,ZIPCODE,CUISINE_DESCRIPTION,SCORE,GRADE
0,PATHOS,1 AVENUE,high income,Manhattan,10022,Mediterranean,9,A
1,THE LITTLE BEET,PARK AVENUE,high income,Manhattan,10017,Salads,13,A
2,AMAZE FUSION & LOUNGE,3 AVENUE,high income,Manhattan,10017,Asian/Asian Fusion,27,B
3,NOURISH THAI,VANDERBILT AVENUE,medium income,Brooklyn,11238,Thai,9,A
4,ESSEN,MADISON AVENUE,high income,Manhattan,10017,Sandwiches,13,A


In [4]:
df = df.drop(['DBA', 'STREET', 'INCOME_LEVEL', 'BOROUGH', 'ZIPCODE', 'CUISINE_DESCRIPTION'], axis=1)

In [5]:
print(df.dtypes)

SCORE     int64
GRADE    object
dtype: object


In [6]:
df.head()

Unnamed: 0,SCORE,GRADE
0,9,A
1,13,A
2,27,B
3,9,A
4,13,A


In [7]:
# Determine different grades under 'GRADE' column
df['GRADE'].value_counts()

A    7033
B     691
C     222
P     138
Z      90
N      50
Name: GRADE, dtype: int64

In [8]:
# Drop the grades 'N', 'P', and 'Z'
df.drop(df[(df['GRADE'] == 'N') | (df['GRADE'] == 'P') | (df['GRADE'] == 'Z')].index, inplace = True)

In [9]:
# Determine different grades under 'GRADE' column
df['GRADE'].value_counts()

A    7033
B     691
C     222
Name: GRADE, dtype: int64

# Preprocessing

## Normalizing the Categorical Variables

In [10]:
# # Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
df[df_cat].nunique()

GRADE    3
dtype: int64

## Encoding the Variables

### Encoding the variable 'GRADE'

In [11]:
# Creating a 'high' and 'low' grade

# Creating an instance of label encoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["Grade"] = label_encoder.fit_transform(df["GRADE"])
df.head()

Unnamed: 0,SCORE,GRADE,Grade
0,9,A,0
1,13,A,0
2,27,B,1
3,9,A,0
4,13,A,0


In [12]:
# Grade dictionary
## The dictionary creates two categories for grades: "high" and "low".
## "high" grade has been made to replace grades A and B, 
## whereas "low" grade has been made to replace all grades lower than A and B.

GRADE_num = {
    "A": "high",
    "B": "high",
    "C": "low",
}

In [13]:
# Grades encoded using the dictionary values
df["GRADE_num"] = df["GRADE"].apply(lambda x: GRADE_num[x])
df.head()

Unnamed: 0,SCORE,GRADE,Grade,GRADE_num
0,9,A,0,high
1,13,A,0,high
2,27,B,1,high
3,9,A,0,high
4,13,A,0,high


In [14]:
# Drop the GRADE and Grade columns
df = df.drop(["GRADE", "Grade"], axis=1)
df.head()

Unnamed: 0,SCORE,GRADE_num
0,9,high
1,13,high
2,27,high
3,9,high
4,13,high


In [15]:
# # Generate our categorical variable list again
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column again
df[df_cat].nunique()

GRADE_num    2
dtype: int64

In [16]:
GRADE_num_counts = df.GRADE_num.value_counts()
print(GRADE_num_counts)

high    7724
low      222
Name: GRADE_num, dtype: int64


### Encoding the Categorical Variables

In [17]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()

Unnamed: 0,GRADE_num_high,GRADE_num_low
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [18]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(df_cat,1)
df.head()

Unnamed: 0,SCORE,GRADE_num_high,GRADE_num_low
0,9,1.0,0.0
1,13,1.0,0.0
2,27,1.0,0.0
3,9,1.0,0.0
4,13,1.0,0.0


## Splitting the Data, and Standardizing the Numerical Variables

In [19]:
# Remove GRADE_num target from features data
y = df.GRADE_num_high
X = df.drop(columns=["GRADE_num_high","GRADE_num_low"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resampling Models

## Oversampling: Naive Random Oversampling 

In [20]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)

Counter({1.0: 5598, 0.0: 5598})

In [21]:
y_resampled.shape

(11196,)

In [22]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [23]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
y_pred

array([1., 1., 0., ..., 0., 0., 0.])

In [24]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.517


In [25]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.517


In [26]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 30,  23],
       [904, 963]])

In [27]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.03      0.57      0.52      0.06      0.54      0.29        53
        1.0       0.98      0.52      0.57      0.68      0.54      0.29      1867

avg / total       0.95      0.52      0.56      0.66      0.54      0.29      1920



## Oversampling: SMOTE Oversampling

In [28]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train_scaled, y_train
)

Counter(y_resampled)

Counter({1.0: 5598, 0.0: 5598})

In [29]:
y_resampled.shape

(11196,)

In [30]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [31]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
y_pred

array([1., 1., 0., ..., 0., 0., 0.])

In [32]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.517


In [33]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.517


In [34]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 30,  23],
       [904, 963]])

In [35]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.03      0.57      0.52      0.06      0.54      0.29        53
        1.0       0.98      0.52      0.57      0.68      0.54      0.29      1867

avg / total       0.95      0.52      0.56      0.66      0.54      0.29      1920



## Undersampling: Cluster Centroids

In [36]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0.0: 159, 1.0: 159})

In [37]:
y_resampled.shape

(318,)

In [38]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [39]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
y_pred

array([0., 0., 1., ..., 0., 0., 1.])

In [40]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.128


In [41]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.128


In [42]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  41,   12],
       [1663,  204]])

In [43]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.02      0.77      0.11      0.05      0.29      0.09        53
        1.0       0.94      0.11      0.77      0.20      0.29      0.08      1867

avg / total       0.92      0.13      0.76      0.19      0.29      0.08      1920



## Combination (Over and Under) Sampling: SMOTEENN

In [44]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0.0: 805, 1.0: 630})

In [45]:
y_resampled.shape

(1435,)

In [46]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [47]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [48]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.028


In [49]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.028


In [50]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  53,    0],
       [1867,    0]])

In [51]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.03      1.00      0.00      0.05      0.00      0.00        53
        1.0       0.00      0.00      1.00      0.00      0.00      0.00      1867

avg / total       0.00      0.03      0.97      0.00      0.00      0.00      1920



# Ensemble Learning Models

## Random Forest Classifier

In [52]:
# Create a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [53]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

Counter(y_train)

Counter({1.0: 5598, 0.0: 159})

In [54]:
y_train.shape

(5757,)

In [55]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
y_pred

array([1., 1., 1., ..., 1., 1., 1.])

In [56]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.972


In [57]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.972


In [58]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,53
Actual 1,0,1867


In [59]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.00      0.00      1.00      0.00      0.00      0.00        53
        1.0       0.97      1.00      0.00      0.99      0.00      0.00      1867

avg / total       0.95      0.97      0.03      0.96      0.00      0.00      1920



## Balanced Random Forest Classifier

In [60]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# Create a balanced random forest classifier:
brf_model = BalancedRandomForestClassifier(n_estimators=128, random_state=1)

In [61]:
# Fitting the model:
brf_model = brf_model.fit(X_train_scaled, y_train)
Counter(y_train)

Counter({1.0: 5598, 0.0: 159})

In [62]:
y_train.shape

(5757,)

In [63]:
# Making predictions using the testing data:
y_pred = brf_model.predict(X_test_scaled)
y_pred

array([0., 1., 0., ..., 1., 1., 1.])

In [64]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.640


In [65]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.640


In [66]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,20,33
Actual 1,658,1209


In [67]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.03      0.38      0.65      0.05      0.49      0.24        53
        1.0       0.97      0.65      0.38      0.78      0.49      0.25      1867

avg / total       0.95      0.64      0.38      0.76      0.49      0.25      1920



In [68]:
# List the features sorted in descending order by feature importance
# Calculate feature importance in the Balanced Random Forest model.
importances = brf_model.feature_importances_
importances

# We can sort the features by their importance.
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'SCORE')]

## Easy Ensemble AdaBoost Classifier

In [69]:
# Train the EasyEnsembleClassifier
# Create a balanced random forest classifier:
from imblearn.ensemble import EasyEnsembleClassifier

ee_model = EasyEnsembleClassifier(n_estimators=128, random_state=1)

In [70]:
# Fitting the model:
ee_model = ee_model.fit(X_train_scaled, y_train)
Counter(y_train)

Counter({1.0: 5598, 0.0: 159})

In [71]:
y_train.shape

(5757,)

In [72]:
# Making predictions using the testing data:
y_pred = ee_model.predict(X_test_scaled)
y_pred

array([0., 1., 0., ..., 1., 1., 1.])

In [73]:
# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f" This model's predictive accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive accuracy is: 0.629


In [74]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f" This model's predictive balanced accuracy is: {accuracy_score(y_test,y_pred):.3f}")

 This model's predictive balanced accuracy is: 0.629


In [75]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,23,30
Actual 1,682,1185


In [76]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.03      0.43      0.63      0.06      0.52      0.27        53
        1.0       0.98      0.63      0.43      0.77      0.52      0.28      1867

avg / total       0.95      0.63      0.44      0.75      0.52      0.28      1920

