In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any library warnings raised by later cells
# (e.g. solver or deprecation notices) — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
from collections import Counter
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd

In [3]:
#pip install flask_sqlalchemy


In [4]:
#Import modules for SQL database connection
from sqlalchemy import create_engine

from flask import Flask
from flask_sqlalchemy import SQLAlchemy


## Connect to the PostgreSQL database

In [5]:
# Template of the PostgreSQL connection URL (reference only: the bare string is
# not assigned to anything, it is simply echoed as this cell's output)
"postgresql://[user]:[password]@[location]:[port]/[database]"

'postgresql://[user]:[password]@[location]:[port]/[database]'

In [6]:
#Import password from config.py file
from config import db_password

In [7]:
# Connection string for the local PostgreSQL server.
# The password is percent-encoded before it is placed in the URL so that
# characters such as "@", ":", "/" or "%" cannot be mistaken for URL delimiters.
# quote(..., safe='') leaves letters, digits and "_.-~" untouched, so a plain
# password produces exactly the same string as before.
db_string = f"postgresql://postgres:{quote(str(db_password), safe='')}@127.0.0.1:5432/NYC_restaurants"

In [8]:
#pip install Psycopg2

In [9]:
# Create the SQLAlchemy engine from the connection string db_string
engine = create_engine(db_string)

In [11]:
# Load every row of the "newtable5" table from PostgreSQL into a pandas DataFrame
final_database_df = pd.read_sql_query(
    sql='select * from "newtable5"',
    con=engine,
)

In [12]:
# Display the DataFrame retrieved from the database
# NOTE(review): the cells visible below work from the CSV-based `df`, not from this frame
final_database_df

Unnamed: 0,DBA,STREET,INCOME_LEVEL,BOROUGH,ZIPCODE,CUISINE_DESCRIPTION,SCORE,GRADE
0,PATHOS,1 AVENUE,high income,Manhattan,10022,Mediterranean,9.0,A
1,THE LITTLE BEET,PARK AVENUE,high income,Manhattan,10017,Salads,13.0,A
2,AMAZE FUSION & LOUNGE,3 AVENUE,high income,Manhattan,10017,Asian/Asian Fusion,27.0,B
3,NOURISH THAI,VANDERBILT AVENUE,medium income,Brooklyn,11238,Thai,9.0,A
4,ESSEN,MADISON AVENUE,high income,Manhattan,10017,Sandwiches,13.0,A
...,...,...,...,...,...,...,...,...
8219,SOUTHSIDE CAFE (Produce Market),HUNTS POINT TERMINAL MARKET,low income,Bronx,10474,Latin American,9.0,A
8220,HOP KEE RESTAURANT,MOTT STREET,high income,Manhattan,10013,Chinese,12.0,A
8221,HALE & HEARTY SOUP,LEXINGTON AVENUE,high income,Manhattan,10017,Soups/Salads/Sandwiches,10.0,A
8222,DYNASTY TASTE LAM,EAST TREMONT AVENUE,low income,Bronx,10457,Chinese,7.0,A


## Read the CSV and Perform Basic Data Cleaning

In [None]:
# Relative path of the CSV dataset (the file is read in the next cell)
file_path = Path('NYC_restaurants_full_dataset.csv')

In [None]:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [None]:
# Display the DataFrame
df

In [None]:
# Preview the first rows of df (head() with its default row count)
df.head()

In [None]:
# Preview the last rows of df only (tail() with its default row count)
df.tail()

In [None]:
# Determine missing values: count the non-null entries in each column
df.count()

In [None]:
# Determine missing values: number of null entries in each column
df.isnull().sum()

In [None]:
# List the names of all columns
df.columns

In [None]:
# Names of the feature columns ('columns') and of the target column ('target')
columns = [
    'DBA',
    'STREET',
    'INCOME_LEVEL',
    'CUISINE_DESCRIPTION',
    'SCORE',
]
target = ['GRADE']

In [None]:
# Check the data type of each column and print the result as plain text
datatypes = df.dtypes
print(datatypes)

In [None]:
# Determine the unique values in the column "INCOME_LEVEL"

## Collect the values of "INCOME_LEVEL" into a list
income_types = df["INCOME_LEVEL"].tolist()
## Reduce the "income_types" list to its unique items
set(income_types)

In [None]:
# Determine the unique values in the column "GRADE"

## Collect the values of "GRADE" into a list
grade_types = df["GRADE"].tolist()
## Reduce the "grade_types" list to its unique items
set(grade_types)

## Split the Data into Training and Testing

### (a) Encoding Feature Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Create a LabelEncoder and store the integer-encoded "INCOME_LEVEL" values
# in a new "Income_levels" column
# NOTE(review): "Income_levels" is dropped again a few cells below without being used
label_encoder = LabelEncoder()
df["Income_levels"] = label_encoder.fit_transform(df["INCOME_LEVEL"])
df.head()

In [None]:
# Income level dictionary: ordinal codes 1 (high), 2 (medium), 3 (low)
INCOME_LEVEL_num = {
    level: code
    for code, level in enumerate(
        ("high income", "medium income", "low income"), start=1
    )
}

In [None]:
# Encode each income level with its code from the INCOME_LEVEL_num dictionary
# (a value that is not a key of the dictionary raises KeyError)
df["INCOME_LEVEL_num"] = df["INCOME_LEVEL"].apply(lambda x: INCOME_LEVEL_num[x])
df.head()

In [None]:
# Remove the original text column and the LabelEncoder column;
# INCOME_LEVEL_num remains as the encoded income feature
df = df.drop(columns=["INCOME_LEVEL", "Income_levels"])
df.head()

In [None]:
# One-hot encode the columns "DBA", "STREET" and "CUISINE_DESCRIPTION" with pandas
# NOTE(review): "DBA" looks like the restaurant name, so this may add a very
# large number of dummy columns — confirm this is intended
df = pd.get_dummies(df, columns=["DBA", "STREET", "CUISINE_DESCRIPTION"])
df.head()

### (b) Encoding Target Variables

In [None]:
# Create a LabelEncoder and store the integer-encoded "GRADE" values in a new "Grade" column
# NOTE(review): "Grade" is dropped again a few cells below without being used
label_encoder = LabelEncoder()
df["Grade"] = label_encoder.fit_transform(df["GRADE"])
df.head()

In [None]:
# Grade dictionary
## Collapses the letter grades into two categories: "high" and "low".
## Grades A and B map to "high";
## every other listed grade (C, P, Z) maps to "low".

GRADE_num = {
    **dict.fromkeys(("A", "B"), "high"),
    **dict.fromkeys(("C", "P", "Z"), "low"),
}

In [None]:
# Encode each grade with its category from the GRADE_num dictionary
# (a value that is not a key of the dictionary raises KeyError)
df["GRADE_num"] = df["GRADE"].apply(lambda x: GRADE_num[x])
df.head()

In [None]:
# Remove the original letter-grade column and the LabelEncoder column;
# GRADE_num remains as the target column
df = df.drop(columns=["GRADE", "Grade"])
df.head()

### (c) Features and Target Variables

In [None]:
# Create our features: every column of df except the target "GRADE_num"
# (drop returns a new DataFrame, so df itself is left unchanged)
# NOTE(review): presumably SCORE stays in X; if GRADE is derived from SCORE
# this leaks the target into the features — confirm
X = df.drop(columns=["GRADE_num"])
X.head()

In [None]:
# Create our target: the values of the "GRADE_num" column
y = df["GRADE_num"].values

In [None]:
# Summary statistics of the feature columns
X.describe()

In [None]:
# Check the balance of our target values (relative frequency of each class)
df["GRADE_num"].value_counts(normalize=True)

In [None]:
# Shape of the target array
y.shape

In [None]:
# Number of samples in each target class
Counter(y)

### (d) Split the Data

In [None]:
# Split features and target into training and testing sets (seeded with
# random_state=1; no test_size is passed, so scikit-learn's default proportion applies),
# then count the classes in the training split
# NOTE(review): no `stratify` argument is passed, so the class ratio of the two
# splits is not forced to match — consider stratify=y if the target is imbalanced
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

## Resampling

### (a) Oversampling: Naive Random Oversampling

In [None]:
# Resample the training data with the RandomOverSampler (seeded with random_state=1)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Class counts after resampling
Counter(y_resampled)

In [None]:
# Shape of the resampled target
y_resampled.shape

In [None]:
# Train the Logistic Regression model using the randomly oversampled training data
# NOTE(review): warnings are silenced at the top of the notebook, so any warning
# raised while fitting would not be visible here
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the test set
from sklearn.metrics import balanced_accuracy_score

# Predictions are stored in y_pred and reused by the following cells
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test-set predictions in y_pred
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions in y_pred
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

### (b) Oversampling: SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE (seeded with random_state=1)
# X_resampled / y_resampled from the previous section are overwritten here
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)

# Class counts after resampling
Counter(y_resampled)

In [None]:
# Shape of the SMOTE-resampled target
y_resampled.shape

In [None]:
# Train a fresh Logistic Regression model using the SMOTE-resampled training data
# (replaces the model fitted in the previous section)
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the test set
from sklearn.metrics import balanced_accuracy_score

# Predictions are stored in y_pred and reused by the following cells
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test-set predictions in y_pred
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions in y_pred
print(classification_report_imbalanced(y_test, y_pred))

### (c) Undersampling: Cluster Centroids

In [None]:
# Resample the training data using the ClusterCentroids resampler (seeded with random_state=1)
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
# Class counts after resampling
Counter(y_resampled)

In [None]:
# Shape of the undersampled target
y_resampled.shape

In [None]:
# Train a fresh Logistic Regression model using the ClusterCentroids-resampled training data
# (replaces the model fitted in the previous section)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the test set
from sklearn.metrics import balanced_accuracy_score

# Predictions are stored in y_pred and reused by the following cells
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test-set predictions
from sklearn.metrics import confusion_matrix
# The predictions are recomputed here; the previous cell already assigns the
# result of the same call to y_pred
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions in y_pred
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

### (d) Combination (Over and Under) Sampling: SMOTEENN

In [None]:
# Resample the training data with SMOTEENN (seeded with random_state=1)
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
# Fit on the training split only, like the other resamplers in this notebook.
# Resampling the full X, y would place the test rows (and synthetic samples
# derived from them) in the training data, so the scores computed on X_test
# below would be inflated by data leakage.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# Class counts after resampling
Counter(y_resampled)

In [None]:
# Shape of the SMOTEENN-resampled target
y_resampled.shape

In [None]:
# Train a fresh Logistic Regression model using the SMOTEENN-resampled data
# (replaces the model fitted in the previous section)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the test set
from sklearn.metrics import balanced_accuracy_score

# Predictions are stored in y_pred and reused by the following cells
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test-set predictions in y_pred
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions in y_pred
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))