In [77]:
import warnings
warnings.filterwarnings('ignore')

In [78]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Read the CSV and Perform Basic Data Cleaning

In [79]:
# Load the file
file_path = Path('NYC_restaurants_full_dataset.csv')

In [80]:
# Read into a dataFrame
df = pd.read_csv(file_path)

In [81]:
# Show dataFrame
df

Unnamed: 0,DBA,STREET,INCOME_LEVEL,CUISINE_DESCRIPTION,SCORE,GRADE
0,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,2,A
1,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,9,A
2,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A
3,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,12,A
4,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A
...,...,...,...,...,...,...
33688,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,11,A
33689,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,11,A
33690,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,12,A
33691,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,12,A


In [82]:
# Show df's first 5 rows only
df.head()

Unnamed: 0,DBA,STREET,INCOME_LEVEL,CUISINE_DESCRIPTION,SCORE,GRADE
0,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,2,A
1,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,9,A
2,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A
3,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,12,A
4,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A


In [83]:
# Show df's last 5 rows only
df.tail()

Unnamed: 0,DBA,STREET,INCOME_LEVEL,CUISINE_DESCRIPTION,SCORE,GRADE
33688,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,11,A
33689,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,11,A
33690,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,12,A
33691,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,12,A
33692,ZUCKER'S BAGELS AND SMOKED FISH,CHAMBERS STREET,high income,Bagels/Pretzels,12,A


In [84]:
# Determine missing values: Count values in each column
df.count()

DBA                    33693
STREET                 33693
INCOME_LEVEL           33693
CUISINE_DESCRIPTION    33693
SCORE                  33693
GRADE                  33693
dtype: int64

In [85]:
# Determine missing values: isnull().sum()
df.isnull().sum()

DBA                    0
STREET                 0
INCOME_LEVEL           0
CUISINE_DESCRIPTION    0
SCORE                  0
GRADE                  0
dtype: int64

In [86]:
# Get the names of all columns
df.columns

Index(['DBA', 'STREET', 'INCOME_LEVEL', 'CUISINE_DESCRIPTION', 'SCORE',
       'GRADE'],
      dtype='object')

In [87]:
# Set the variables 'columns' and 'target', holding the names of the feature columns and of the target column respectively
columns = ['DBA', 'STREET', 'INCOME_LEVEL', 'CUISINE_DESCRIPTION', 'SCORE']

target = ['GRADE']

In [88]:
# Check data types
datatypes = df.dtypes
print(datatypes)

DBA                    object
STREET                 object
INCOME_LEVEL           object
CUISINE_DESCRIPTION    object
SCORE                   int64
GRADE                  object
dtype: object


In [89]:
# Determine unique values in the column "INCOME_LEVEL"

## Get a variable holding a list made out of values in "INCOME_LEVEL"
income_types = df["INCOME_LEVEL"].tolist()
## Get the unique items in the "income_types" list
set(income_types)

{'high income', 'low income', 'medium income'}

In [90]:
# Determine unique values in the column "GRADE"

## Get a variable holding a list made out of values in "GRADE"
grade_types = df["GRADE"].tolist()
## Get the unique items in the "grade_types" list
set(grade_types)

{'A', 'B', 'C', 'P', 'Z'}

## Split the Data into Training and Testing

### (a) Encoding Features Variables

In [91]:
from sklearn.preprocessing import LabelEncoder

In [92]:
# Creating an instance of label encoder
# NOTE: the integer codes produced here do not follow the income ranking
# (the preview shows 'high income' -> 0 and 'medium income' -> 2), and the
# "Income_levels" column is dropped again a few cells later in favour of the
# dictionary-based "INCOME_LEVEL_num" encoding.
label_encoder = LabelEncoder()
df["Income_levels"] = label_encoder.fit_transform(df["INCOME_LEVEL"])
df.head()

Unnamed: 0,DBA,STREET,INCOME_LEVEL,CUISINE_DESCRIPTION,SCORE,GRADE,Income_levels
0,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,2,A,2
1,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,9,A,2
2,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A,0
3,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,12,A,0
4,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A,0


In [93]:
# Income Level dictionary
# Rank the income tiers from 1 (high) to 3 (low), keyed by the exact labels
# found in the "INCOME_LEVEL" column.
INCOME_LEVEL_num = {
    f"{tier} income": rank
    for rank, tier in enumerate(("high", "medium", "low"), start=1)
}

In [94]:
# Income Levels encoded using the dictionary values
# A direct key lookup keeps the strict behaviour: an unexpected label raises
# KeyError instead of silently becoming a missing value.
df["INCOME_LEVEL_num"] = [INCOME_LEVEL_num[level] for level in df["INCOME_LEVEL"]]
df.head()

Unnamed: 0,DBA,STREET,INCOME_LEVEL,CUISINE_DESCRIPTION,SCORE,GRADE,Income_levels,INCOME_LEVEL_num
0,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,2,A,2,2
1,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,medium income,Italian,9,A,2,2
2,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A,0,1
3,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,12,A,0,1
4,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,high income,American,11,A,0,1


In [95]:
# Drop the INCOME_LEVEL and Income_levels columns
# (only the dictionary-encoded INCOME_LEVEL_num is kept)
superseded_income_columns = ["INCOME_LEVEL", "Income_levels"]
df = df.drop(columns=superseded_income_columns)
df.head()

Unnamed: 0,DBA,STREET,CUISINE_DESCRIPTION,SCORE,GRADE,INCOME_LEVEL_num
0,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,Italian,2,A,2
1,(LEWIS DRUG STORE) LOCANDA VINI E OLII,GATES AVENUE,Italian,9,A,2
2,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,American,11,A,1
3,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,American,12,A,1
4,(PUBLIC FARE) 81st street and central park wes...,CENTRAL PARK WEST,American,11,A,1


In [96]:
# One-hot encoding using Pandas (multiple columns): every distinct value of
# "DBA", "STREET" and "CUISINE_DESCRIPTION" becomes its own indicator column,
# and the three original text columns are replaced by those indicators.
df = pd.get_dummies(df, columns=["DBA", "STREET", "CUISINE_DESCRIPTION"])
df.head()

Unnamed: 0,SCORE,GRADE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,...,CUISINE_DESCRIPTION_Soul Food,CUISINE_DESCRIPTION_Soups & Sandwiches,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia
0,2,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### (b) Encoding Target Variables

In [97]:
# Creating an instance of label encoder
# NOTE: this integer-coded "Grade" column is only a preview; it is dropped
# again a few cells later, and the high/low "GRADE_num" column built from the
# grade dictionary is what the models use as the target.
label_encoder = LabelEncoder()
df["Grade"] = label_encoder.fit_transform(df["GRADE"])
df.head()

Unnamed: 0,SCORE,GRADE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,...,CUISINE_DESCRIPTION_Soups & Sandwiches,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia,Grade
0,2,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Grade dictionary
## Collapses the five letter grades into two categories: "high" and "low".
## Grades A and B map to "high";
## every other grade (C, P, Z) maps to "low".

GRADE_num = {
    **dict.fromkeys(("A", "B"), "high"),
    **dict.fromkeys(("C", "P", "Z"), "low"),
}

In [99]:
# Grades encoded using the dictionary values
# A direct key lookup keeps the strict behaviour: an unexpected grade raises
# KeyError instead of silently becoming a missing value.
df["GRADE_num"] = [GRADE_num[grade] for grade in df["GRADE"]]
df.head()

Unnamed: 0,SCORE,GRADE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,...,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia,Grade,GRADE_num
0,2,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
1,9,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
2,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
3,12,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
4,11,A,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high


In [100]:
# Drop the GRADE and Grade columns
# (only the high/low GRADE_num target is kept)
superseded_grade_columns = ["GRADE", "Grade"]
df = df.drop(columns=superseded_grade_columns)
df.head()

Unnamed: 0,SCORE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,DBA_27 SPORTS BAR & CAFE,...,CUISINE_DESCRIPTION_Soups & Sandwiches,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia,GRADE_num
0,2,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
1,9,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
2,11,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
3,12,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high
4,11,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,high


### (c) Features and Target Variables

In [101]:
# Create our features:
# everything except the target column; drop() returns a new frame, so df
# itself is left untouched.
X = df.drop(columns=["GRADE_num"])
X.head()

Unnamed: 0,SCORE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,DBA_27 SPORTS BAR & CAFE,...,CUISINE_DESCRIPTION_Soul Food,CUISINE_DESCRIPTION_Soups & Sandwiches,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia
0,2,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
# Create our target
y = df["GRADE_num"].values

In [103]:
X.describe()

Unnamed: 0,SCORE,INCOME_LEVEL_num,DBA_(LEWIS DRUG STORE) LOCANDA VINI E OLII,DBA_(PUBLIC FARE) 81st street and central park west (Delacorte Theatre),DBA_1 DARBAR,DBA_1 EAST 66TH STREET KITCHEN,DBA_101 DELI,DBA_16 HANDLES,DBA_18 RESTAURANT,DBA_27 SPORTS BAR & CAFE,...,CUISINE_DESCRIPTION_Soul Food,CUISINE_DESCRIPTION_Soups & Sandwiches,CUISINE_DESCRIPTION_Spanish,CUISINE_DESCRIPTION_Steak,CUISINE_DESCRIPTION_Tapas,CUISINE_DESCRIPTION_Tex-Mex,CUISINE_DESCRIPTION_Thai,CUISINE_DESCRIPTION_Turkish,CUISINE_DESCRIPTION_Vegetarian,CUISINE_DESCRIPTION_Vietnamese/Cambodian/Malaysia
count,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,...,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0,33693.0
mean,11.898317,1.553854,0.000148,0.000148,0.000208,8.9e-05,0.00089,0.001454,0.000386,0.000208,...,0.000297,0.005372,0.019974,0.007212,0.000356,0.009824,0.006084,0.002671,0.002879,0.00552
std,7.474663,0.751695,0.012181,0.012181,0.014413,0.009436,0.029827,0.038108,0.019639,0.014413,...,0.017226,0.073098,0.139915,0.084619,0.018869,0.098629,0.077766,0.051615,0.053579,0.074095
min,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,13.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,83.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [104]:
# Check the balance of our target values
df["GRADE_num"].value_counts(normalize=True)

high    0.953343
low     0.046657
Name: GRADE_num, dtype: float64

In [105]:
y.shape

(33693,)

In [106]:
Counter(y)

Counter({'high': 32121, 'low': 1572})

### (d) Split the Data

In [107]:
# Hold out a test set. No test_size is passed, so the library default applies
# (the counts shown for y_train add up to 25,269 of the 33,693 rows, i.e. ~75%).
# NOTE(review): the split is not stratified; with only ~4.7% 'low' labels the
# class ratio in each split is left to chance -- consider passing stratify=y.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'high': 24105, 'low': 1164})

## Resampling

### (a) Oversampling: Naive Random Oversampling

In [108]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'high': 24105, 'low': 24105})

In [109]:
y_resampled.shape

(48210,)

In [110]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [111]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9215392743923917

In [112]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[7544,  472],
       [  40,  368]])

In [113]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.99      0.94      0.90      0.97      0.92      0.85      8016
        low       0.44      0.90      0.94      0.59      0.92      0.85       408

avg / total       0.97      0.94      0.90      0.95      0.92      0.85      8424



### (b) Oversampling: SMOTE Oversampling

In [114]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)

Counter(y_resampled)

Counter({'high': 24105, 'low': 24105})

In [115]:
y_resampled.shape

(48210,)

In [116]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [117]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9151733298109663

In [118]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[7717,  299],
       [  54,  354]])

In [119]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.99      0.96      0.87      0.98      0.91      0.84      8016
        low       0.54      0.87      0.96      0.67      0.91      0.83       408

avg / total       0.97      0.96      0.87      0.96      0.91      0.84      8424



### (c) Undersampling: Cluster Centroids

In [120]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high': 1164, 'low': 1164})

In [121]:
y_resampled.shape

(2328,)

In [122]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [123]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8324233885170835

In [124]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6076, 1940],
       [  38,  370]])

In [125]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       high       0.99      0.76      0.91      0.86      0.83      0.68      8016
        low       0.16      0.91      0.76      0.27      0.83      0.70       408

avg / total       0.95      0.77      0.90      0.83      0.83      0.68      8424



### (d) Combination (Over and Under) Sampling: SMOTEENN

In [None]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
# Fit the resampler on the training split only, like the other resampling
# sections do. Resampling the full (X, y) would feed test rows -- and
# synthetic points interpolated from them -- into the training data, which
# inflates every score computed on X_test afterwards.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
y_resampled.shape

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

## Ensemble Learners

### (a) Balanced Random Forest Classifier

In [None]:
# Build the BalancedRandomForestClassifier (it is fitted in the next cell)

# Create a balanced random forest classifier with 100 trees and a fixed seed:
from imblearn.ensemble import BalancedRandomForestClassifier

brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [None]:
# Fitting the model:
brf_model = brf_model.fit(X_train, y_train)
Counter(y_train)

In [None]:
y_train.shape

In [None]:
# y_train is a NumPy array (the target was extracted with `.values`), and
# arrays have no value_counts(); wrap it in a Series to get class proportions.
pd.Series(y_train).value_counts(normalize=True)

In [57]:
# Making predictions using the testing data:
predictions = brf_model.predict(X_test)

predictions

array(['high', 'high', 'high', ..., 'high', 'high', 'high'], dtype=object)

In [58]:
# Calculate the balanced accuracy score
# (mean of per-class recall -- not plain accuracy, so label it accordingly)
acc_score = balanced_accuracy_score(y_test, predictions)

print(f"Balanced Accuracy Score : {acc_score}")

Accuracy Score : 0.9732777092873077


In [59]:
# Display the confusion matrix
# The classes are the strings 'high' and 'low', not 0 and 1. Derive the label
# order (sorted union of true and predicted labels) and pass it explicitly so
# the row/column headers are guaranteed to match the counts.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7843,173
Actual 1,13,395


In [60]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

       high       1.00      0.98      0.97      0.99      0.97      0.95      8016
        low       0.70      0.97      0.98      0.81      0.97      0.95       408

avg / total       0.98      0.98      0.97      0.98      0.97      0.95      8424



In [61]:
# List the features sorted in descending order by feature importance
# Take the importances learned by the Balanced Random Forest model, pair each
# one with its feature name, and order the pairs from most to least important.
importances = brf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.4667783136724509, 'SCORE'),
 (0.026386081038895347, 'DBA_STARBUCKS'),
 (0.009614876176344841, 'CUISINE_DESCRIPTION_CafÃ©/Coffee/Tea'),
 (0.009510991612352792, 'STREET_BROADWAY'),
 (0.008004872348511733, 'INCOME_LEVEL_num'),
 (0.0077898701537397206, 'STREET_MOTT STREET'),
 (0.007127893064676105, 'CUISINE_DESCRIPTION_American'),
 (0.005699626303956569, 'STREET_COLUMBUS AVENUE'),
 (0.005576977323056296, 'DBA_JG MELON RESTAURANT'),
 (0.004987406618165989, 'DBA_ESSEN'),
 (0.004759316966356674, 'STREET_EAST   45 STREET'),
 (0.004651625768219958, 'DBA_GOOD ENOUGH TO EAT (A.G. BISTRO)'),
 (0.004617157520453671, 'CUISINE_DESCRIPTION_Mexican'),
 (0.0044608105180524065, 'STREET_3 AVENUE'),
 (0.004460557043189487, 'CUISINE_DESCRIPTION_Indian'),
 (0.00445545792840127, 'STREET_20 AVENUE'),
 (0.004422126694994087, 'STREET_GREENWICH STREET'),
 (0.00430466899704203, 'DBA_KAFFE 1668'),
 (0.004231103376043434, 'CUISINE_DESCRIPTION_Japanese'),
 (0.004123371963624976, 'DBA_GREAT N.Y. NOODLETOWN'),
 (0.

### (b) Easy Ensemble AdaBoost Classifier

In [1]:
# Build the EasyEnsembleClassifier (it is fitted in the next cell)

# Create an easy ensemble classifier with 100 estimators and a fixed seed:
from imblearn.ensemble import EasyEnsembleClassifier

ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [2]:
# Fitting the model:
ee_model = ee_model.fit(X_train, y_train)
Counter(y_train)

NameError: name 'X_train' is not defined

In [None]:
y_train.shape

In [None]:
# Making predictions using the testing data:
predictions = ee_model.predict(X_test)

predictions

In [None]:
# Calculate the balanced accuracy score
# (mean of per-class recall -- not plain accuracy, so label it accordingly)
acc_score = balanced_accuracy_score(y_test, predictions)

print(f"Balanced Accuracy Score : {acc_score}")

In [None]:
# Display the confusion matrix
# The classes are the strings 'high' and 'low', not 0 and 1. Derive the label
# order (sorted union of true and predicted labels) and pass it explicitly so
# the row/column headers are guaranteed to match the counts.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

print("Confusion Matrix")
display(cm_df)

In [None]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))