In [1]:
# Initial imports.
import sqlalchemy
import numpy as np
from pathlib import Path
from collections import Counter
import pandas as pd
from path import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine

In [25]:
# Loading data
file_path = Path("../Data_cleaning_ML/final_table_ml_stage2.csv")
risk_df = pd.read_csv(file_path)
risk_df.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,violation_description,risk_category,Neighborhoods,Current Police Districts
0,94127,20624,"$95,313.00",38798_20170928,September,94.0,Improper or defective plumbing,Low Risk,95.0,9.0
1,94127,20624,"$95,313.00",5796_20170825,August,98.0,Improper food storage,Low Risk,63.0,10.0
2,94127,20624,"$95,313.00",2808_20170621,June,81.0,Other low risk violation,Low Risk,67.0,10.0
3,94127,20624,"$95,313.00",4630_20170304,March,96.0,Food safety certificate or food handler card n...,Low Risk,67.0,10.0
4,94127,20624,"$95,313.00",2808_20190103,January,74.0,Unclean or degraded floors walls or ceilings,Low Risk,67.0,10.0


In [3]:
# Create our features
X = risk_df.drop(columns='risk_category')

# Create our target
y = risk_df['risk_category']
y.value_counts()

Low Risk         10129
Moderate Risk     7886
High Risk         2954
Name: risk_category, dtype: int64

In [4]:
X.describe()

Unnamed: 0,business_postal_code,population,inspection_score,Neighborhoods,Current Police Districts
count,20969.0,20969.0,20969.0,20969.0,20969.0
mean,94113.76327,36628.411751,85.335066,57.718251,5.303496
std,9.368833,22194.064691,8.094191,35.207225,2.717798
min,94101.0,0.0,46.0,1.0,1.0
25%,94107.0,23016.0,81.0,31.0,3.0
50%,94110.0,30574.0,87.0,53.0,6.0
75%,94121.0,55492.0,92.0,97.0,8.0
max,94158.0,74633.0,100.0,117.0,10.0


In [5]:
X

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,violation_description,Neighborhoods,Current Police Districts
0,94127,20624,"$95,313.00",38798_20170928,September,94.0,Improper or defective plumbing,95.0,9.0
1,94127,20624,"$95,313.00",5796_20170825,August,98.0,Improper food storage,63.0,10.0
2,94127,20624,"$95,313.00",2808_20170621,June,81.0,Other low risk violation,67.0,10.0
3,94127,20624,"$95,313.00",4630_20170304,March,96.0,Food safety certificate or food handler card n...,67.0,10.0
4,94127,20624,"$95,313.00",2808_20190103,January,74.0,Unclean or degraded floors walls or ceilings,67.0,10.0
...,...,...,...,...,...,...,...,...,...
20964,94101,0,$0.00,65856_20190117,January,86.0,Wiping cloths not clean or properly stored or ...,32.0,5.0
20965,94101,0,$0.00,65856_20190828,August,84.0,Inadequate food safety knowledge or lack of ce...,32.0,5.0
20966,94101,0,$0.00,65856_20170821,August,73.0,High risk food holding temperature,32.0,5.0
20967,94101,0,$0.00,65856_20170821,August,73.0,Improper storage use or identification of toxi...,32.0,5.0


In [6]:
months_num = {
   "January": 1,
   "February": 2,
   "March": 3,
   "April": 4,
   "May": 5,
   "June": 6,
   "July": 7,
   "August": 8,
   "September": 9,
   "October": 10,
   "November": 11,
   "December": 12,
}

In [7]:
X["months_num"] = X["inspection_date"].apply(lambda x: months_num[x])

In [8]:
months_num["June"]

6

In [9]:
X

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,violation_description,Neighborhoods,Current Police Districts,months_num
0,94127,20624,"$95,313.00",38798_20170928,September,94.0,Improper or defective plumbing,95.0,9.0,9
1,94127,20624,"$95,313.00",5796_20170825,August,98.0,Improper food storage,63.0,10.0,8
2,94127,20624,"$95,313.00",2808_20170621,June,81.0,Other low risk violation,67.0,10.0,6
3,94127,20624,"$95,313.00",4630_20170304,March,96.0,Food safety certificate or food handler card n...,67.0,10.0,3
4,94127,20624,"$95,313.00",2808_20190103,January,74.0,Unclean or degraded floors walls or ceilings,67.0,10.0,1
...,...,...,...,...,...,...,...,...,...,...
20964,94101,0,$0.00,65856_20190117,January,86.0,Wiping cloths not clean or properly stored or ...,32.0,5.0,1
20965,94101,0,$0.00,65856_20190828,August,84.0,Inadequate food safety knowledge or lack of ce...,32.0,5.0,8
20966,94101,0,$0.00,65856_20170821,August,73.0,High risk food holding temperature,32.0,5.0,8
20967,94101,0,$0.00,65856_20170821,August,73.0,Improper storage use or identification of toxi...,32.0,5.0,8


In [10]:
X = X.drop(columns='inspection_date')
X.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_score,violation_description,Neighborhoods,Current Police Districts,months_num
0,94127,20624,"$95,313.00",38798_20170928,94.0,Improper or defective plumbing,95.0,9.0,9
1,94127,20624,"$95,313.00",5796_20170825,98.0,Improper food storage,63.0,10.0,8
2,94127,20624,"$95,313.00",2808_20170621,81.0,Other low risk violation,67.0,10.0,6
3,94127,20624,"$95,313.00",4630_20170304,96.0,Food safety certificate or food handler card n...,67.0,10.0,3
4,94127,20624,"$95,313.00",2808_20190103,74.0,Unclean or degraded floors walls or ceilings,67.0,10.0,1


In [11]:
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,business_postal_code,population,inspection_score,Neighborhoods,Current Police Districts,months_num,avg_income_$0.00,"avg_income_$14,609.00","avg_income_$163,949.00","avg_income_$22,351.00",...,violation_description_Unauthorized or unsafe use of time as a public health control measure,violation_description_Unclean hands or improper use of gloves,violation_description_Unclean nonfood contact surfaces,violation_description_Unclean or degraded floors walls or ceilings,violation_description_Unclean or unsanitary food contact surfaces,violation_description_Unclean unmaintained or improperly constructed toilet facilities,violation_description_Unpermitted food facility,violation_description_Unsanitary employee garments hair or nails,violation_description_Wiping cloths not clean or properly stored or inadequate sanitizer,violation_description_Worker safety hazards
0,94127,20624,94.0,95.0,9.0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,94127,20624,98.0,63.0,10.0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,94127,20624,81.0,67.0,10.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,94127,20624,96.0,67.0,10.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,94127,20624,74.0,67.0,10.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20964,94101,0,86.0,32.0,5.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20965,94101,0,84.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20966,94101,0,73.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20967,94101,0,73.0,32.0,5.0,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data_scaler = StandardScaler()

In [13]:
X_encoded_scaled = data_scaler.fit_transform(X_encoded)
X_encoded_scaled[:1]

array([[ 1.41288099, -0.72112944,  1.07053825, ..., -0.02675544,
        -0.21262608, -0.02763356]])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y, random_state = 1)

In [15]:
y_train.value_counts()

Low Risk         7624
Moderate Risk    5896
High Risk        2206
Name: risk_category, dtype: int64

In [16]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [18]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [20]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Actual 2"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,747,0,1
Actual 1,0,2505,0
Actual 2,0,4,1986


In [21]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,747,0,1
Actual 1,0,2505,0
Actual 2,0,4,1986


Accuracy Score : 0.999046347510967
Classification Report
               precision    recall  f1-score   support

    High Risk       1.00      1.00      1.00       748
     Low Risk       1.00      1.00      1.00      2505
Moderate Risk       1.00      1.00      1.00      1990

     accuracy                           1.00      5243
    macro avg       1.00      1.00      1.00      5243
 weighted avg       1.00      1.00      1.00      5243



In [23]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00266003, 0.00262838, 0.02724856, ..., 0.00023229, 0.02540204,
       0.00046052])

In [24]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_encoded.columns), reverse=True)

[(0.06290572671588596,
  'violation_description_Inadequately cleaned or sanitized food contact surfaces'),
 (0.06178719040785824,
  'violation_description_Moderate risk food holding temperature'),
 (0.05682108375726322,
  'violation_description_Unclean or degraded floors walls or ceilings'),
 (0.056254446853441165,
  'violation_description_High risk food holding temperature'),
 (0.05367104006614869,
  'violation_description_Inadequate and inaccessible handwashing facilities'),
 (0.046979034963780184,
  'violation_description_Foods not protected from contamination'),
 (0.04417591721358414,
  'violation_description_Moderate risk vermin infestation'),
 (0.044004187726672916,
  'violation_description_Unclean or unsanitary food contact surfaces'),
 (0.04264160387579721,
  'violation_description_Unapproved or unmaintained equipment or utensils'),
 (0.031072982177111096, 'violation_description_High risk vermin infestation'),
 (0.029173833550235027, 'violation_description_Improper cooling meth