In [1]:
# Initial imports.
import sqlalchemy
import numpy as np
from pathlib import Path
from collections import Counter
import pandas as pd
from path import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine

In [2]:
# Loading data
file_path = Path("../Data_cleaning_ML/final_table_ml_stage2.csv")
risk_df = pd.read_csv(file_path)
risk_df.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,violation_description,risk_category,Neighborhoods,Current Police Districts
0,94127,20624,"$95,313.00",38798_20170928,September,94.0,Improper or defective plumbing,Low Risk,95.0,9.0
1,94127,20624,"$95,313.00",5796_20170825,August,98.0,Improper food storage,Low Risk,63.0,10.0
2,94127,20624,"$95,313.00",2808_20170621,June,81.0,Other low risk violation,Low Risk,67.0,10.0
3,94127,20624,"$95,313.00",4630_20170304,March,96.0,Food safety certificate or food handler card n...,Low Risk,67.0,10.0
4,94127,20624,"$95,313.00",2808_20190103,January,74.0,Unclean or degraded floors walls or ceilings,Low Risk,67.0,10.0


In [3]:
# Drop inspection_id
risk_df1 = risk_df.drop(columns='violation_description')

In [4]:
# Create our features
X = risk_df1.drop(columns='risk_category')

# Create our target
y = risk_df1['risk_category']
y.value_counts()

Low Risk         10129
Moderate Risk     7886
High Risk         2954
Name: risk_category, dtype: int64

In [5]:
X.describe()

Unnamed: 0,business_postal_code,population,inspection_score,Neighborhoods,Current Police Districts
count,20969.0,20969.0,20969.0,20969.0,20969.0
mean,94113.76327,36628.411751,85.335066,57.718251,5.303496
std,9.368833,22194.064691,8.094191,35.207225,2.717798
min,94101.0,0.0,46.0,1.0,1.0
25%,94107.0,23016.0,81.0,31.0,3.0
50%,94110.0,30574.0,87.0,53.0,6.0
75%,94121.0,55492.0,92.0,97.0,8.0
max,94158.0,74633.0,100.0,117.0,10.0


In [7]:
months_num = {
   "January": 1,
   "February": 2,
   "March": 3,
   "April": 4,
   "May": 5,
   "June": 6,
   "July": 7,
   "August": 8,
   "September": 9,
   "October": 10,
   "November": 11,
   "December": 12,
}

In [8]:
X["months_num"] = X["inspection_date"].apply(lambda x: months_num[x])

In [9]:
months_num["June"]

6

In [10]:
X

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,Neighborhoods,Current Police Districts,months_num
0,94127,20624,"$95,313.00",38798_20170928,September,94.0,95.0,9.0,9
1,94127,20624,"$95,313.00",5796_20170825,August,98.0,63.0,10.0,8
2,94127,20624,"$95,313.00",2808_20170621,June,81.0,67.0,10.0,6
3,94127,20624,"$95,313.00",4630_20170304,March,96.0,67.0,10.0,3
4,94127,20624,"$95,313.00",2808_20190103,January,74.0,67.0,10.0,1
...,...,...,...,...,...,...,...,...,...
20964,94101,0,$0.00,65856_20190117,January,86.0,32.0,5.0,1
20965,94101,0,$0.00,65856_20190828,August,84.0,32.0,5.0,8
20966,94101,0,$0.00,65856_20170821,August,73.0,32.0,5.0,8
20967,94101,0,$0.00,65856_20170821,August,73.0,32.0,5.0,8


In [11]:
X = X.drop(columns='inspection_date')
X.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_score,Neighborhoods,Current Police Districts,months_num
0,94127,20624,"$95,313.00",38798_20170928,94.0,95.0,9.0,9
1,94127,20624,"$95,313.00",5796_20170825,98.0,63.0,10.0,8
2,94127,20624,"$95,313.00",2808_20170621,81.0,67.0,10.0,6
3,94127,20624,"$95,313.00",4630_20170304,96.0,67.0,10.0,3
4,94127,20624,"$95,313.00",2808_20190103,74.0,67.0,10.0,1


In [12]:
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,business_postal_code,population,inspection_score,Neighborhoods,Current Police Districts,months_num,avg_income_$0.00,"avg_income_$14,609.00","avg_income_$163,949.00","avg_income_$22,351.00",...,inspection_id_9948_20180504,inspection_id_9948_20190204,inspection_id_9948_20190830,inspection_id_994_20171010,inspection_id_994_20190111,inspection_id_999_20170714,inspection_id_999_20180123,inspection_id_999_20190909,inspection_id_99_20171207,inspection_id_99_20180808
0,94127,20624,94.0,95.0,9.0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,94127,20624,98.0,63.0,10.0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,94127,20624,81.0,67.0,10.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,94127,20624,96.0,67.0,10.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,94127,20624,74.0,67.0,10.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20964,94101,0,86.0,32.0,5.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20965,94101,0,84.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20966,94101,0,73.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20967,94101,0,73.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
data_scaler = StandardScaler()

In [14]:
X_encoded_scaled = data_scaler.fit_transform(X_encoded)
X_encoded_scaled[:1]

array([[ 1.41288099, -0.72112944,  1.07053825, ..., -0.016918  ,
        -0.016918  , -0.016918  ]])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y, random_state = 1)

In [16]:
y_train.value_counts()

Low Risk         7624
Moderate Risk    5896
High Risk        2206
Name: risk_category, dtype: int64

In [17]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [19]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [21]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Actual 2"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,109,348,291
Actual 1,242,1424,839
Actual 2,219,1128,643


In [22]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [23]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,109,348,291
Actual 1,242,1424,839
Actual 2,219,1128,643


Accuracy Score : 0.41502956322716
Classification Report
               precision    recall  f1-score   support

    High Risk       0.19      0.15      0.17       748
     Low Risk       0.49      0.57      0.53      2505
Moderate Risk       0.36      0.32      0.34      1990

     accuracy                           0.42      5243
    macro avg       0.35      0.35      0.34      5243
 weighted avg       0.40      0.42      0.41      5243



In [24]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([7.63895915e-03, 8.81402929e-03, 1.20874024e-01, ...,
       2.09539533e-04, 2.22811508e-04, 6.64727717e-05])

In [25]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_encoded.columns), reverse=True)

[(0.12087402393710041, 'inspection_score'),
 (0.055895648399773616, 'months_num'),
 (0.03319106644734966, 'Neighborhoods'),
 (0.012529314533599057, 'Current Police Districts'),
 (0.008814029286903089, 'population '),
 (0.007638959153090931, 'business_postal_code'),
 (0.0019760202581833395, 'avg_income_$163,949.00'),
 (0.001948974182523447, 'avg_income_$88,976.00'),
 (0.0014024003306613844, 'avg_income_$31,131.00'),
 (0.0013379591602556491, 'avg_income_$75,727.00'),
 (0.0013311982213109112, 'avg_income_$40,990.00'),
 (0.0009760273149721139, 'avg_income_$63,983.00'),
 (0.0009441733658725026, 'avg_income_$54,879.00'),
 (0.000925109611374672, 'avg_income_$43,444.00'),
 (0.000883095236712519, 'avg_income_$53,795.00'),
 (0.0008249888069272283, 'avg_income_$61,609.00'),
 (0.000788749779679758, 'inspection_id_1673_20170608'),
 (0.0007673440264459278, 'avg_income_$22,351.00'),
 (0.0007649466798706839, 'avg_income_$31,542.00'),
 (0.0007603509976186133, 'avg_income_$61,362.00'),
 (0.0007459429320