In [19]:
# Initial imports.
import sqlalchemy
import numpy as np
from pathlib import Path
from collections import Counter
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine


In [22]:
file_path = Path("../Final_Project/Notebooks/risk_df1.csv")
risk_df1 = pd.read_csv(file_path)
risk_df1.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,risk_category,Neighborhoods,Current Police Districts
0,94127,20624,95313,38798_20170928,September,94.0,Low Risk,95.0,9.0
1,94127,20624,95313,5796_20170825,August,98.0,Low Risk,63.0,10.0
2,94127,20624,95313,2808_20170621,June,81.0,Low Risk,67.0,10.0
3,94127,20624,95313,4630_20170304,March,96.0,Low Risk,67.0,10.0
4,94127,20624,95313,2808_20190103,January,74.0,Low Risk,67.0,10.0


In [23]:
risk_df1

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,risk_category,Neighborhoods,Current Police Districts
0,94127,20624,95313,38798_20170928,September,94.0,Low Risk,95.0,9.0
1,94127,20624,95313,5796_20170825,August,98.0,Low Risk,63.0,10.0
2,94127,20624,95313,2808_20170621,June,81.0,Low Risk,67.0,10.0
3,94127,20624,95313,4630_20170304,March,96.0,Low Risk,67.0,10.0
4,94127,20624,95313,2808_20190103,January,74.0,Low Risk,67.0,10.0
...,...,...,...,...,...,...,...,...,...
20964,94101,0,0,65856_20190117,January,86.0,Low Risk,32.0,5.0
20965,94101,0,0,65856_20190828,August,84.0,Moderate Risk,32.0,5.0
20966,94101,0,0,65856_20170821,August,73.0,High Risk,32.0,5.0
20967,94101,0,0,65856_20170821,August,73.0,Low Risk,32.0,5.0


In [24]:
#change risk_category to binary values. Label 'Moderate Risk' and 'Low Risk' == 0

y_binary = []

for i in risk_df1['risk_category']:
  if i == 'High Risk':
    y_binary.append(1)

  elif i == 'Moderate Risk':
    y_binary.append(0)

  else:
    y_binary.append(0) 

risk_df1['risk_category'] = y_binary

In [25]:
risk_df1['risk_category'].value_counts()

0    18015
1     2954
Name: risk_category, dtype: int64

In [26]:
risk_df1

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,risk_category,Neighborhoods,Current Police Districts
0,94127,20624,95313,38798_20170928,September,94.0,0,95.0,9.0
1,94127,20624,95313,5796_20170825,August,98.0,0,63.0,10.0
2,94127,20624,95313,2808_20170621,June,81.0,0,67.0,10.0
3,94127,20624,95313,4630_20170304,March,96.0,0,67.0,10.0
4,94127,20624,95313,2808_20190103,January,74.0,0,67.0,10.0
...,...,...,...,...,...,...,...,...,...
20964,94101,0,0,65856_20190117,January,86.0,0,32.0,5.0
20965,94101,0,0,65856_20190828,August,84.0,0,32.0,5.0
20966,94101,0,0,65856_20170821,August,73.0,1,32.0,5.0
20967,94101,0,0,65856_20170821,August,73.0,0,32.0,5.0


In [27]:
# Create our features. 
X = risk_df1.drop(columns='risk_category')

# Create our target
y = risk_df1['risk_category']
y.value_counts()

0    18015
1     2954
Name: risk_category, dtype: int64

In [28]:
X.describe()

Unnamed: 0,business_postal_code,population,avg_income,inspection_score,Neighborhoods,Current Police Districts
count,20969.0,20969.0,20969.0,20969.0,20969.0,20969.0
mean,94113.76327,36628.411751,55053.795317,85.335066,57.718251,5.303496
std,9.368833,22194.064691,26574.28013,8.094191,35.207225,2.717798
min,94101.0,0.0,0.0,46.0,1.0,1.0
25%,94107.0,23016.0,40990.0,81.0,31.0,3.0
50%,94110.0,30574.0,54342.0,87.0,53.0,6.0
75%,94121.0,55492.0,61609.0,92.0,97.0,8.0
max,94158.0,74633.0,163949.0,100.0,117.0,10.0


In [29]:
#view X
X

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,Neighborhoods,Current Police Districts
0,94127,20624,95313,38798_20170928,September,94.0,95.0,9.0
1,94127,20624,95313,5796_20170825,August,98.0,63.0,10.0
2,94127,20624,95313,2808_20170621,June,81.0,67.0,10.0
3,94127,20624,95313,4630_20170304,March,96.0,67.0,10.0
4,94127,20624,95313,2808_20190103,January,74.0,67.0,10.0
...,...,...,...,...,...,...,...,...
20964,94101,0,0,65856_20190117,January,86.0,32.0,5.0
20965,94101,0,0,65856_20190828,August,84.0,32.0,5.0
20966,94101,0,0,65856_20170821,August,73.0,32.0,5.0
20967,94101,0,0,65856_20170821,August,73.0,32.0,5.0


In [30]:
# encode Month as a number
months_num = {
   "January": 1,
   "February": 2,
   "March": 3,
   "April": 4,
   "May": 5,
   "June": 6,
   "July": 7,
   "August": 8,
   "September": 9,
   "October": 10,
   "November": 11,
   "December": 12,
}

In [31]:
X["months_num"] = X["inspection_date"].apply(lambda x: months_num[x])

In [32]:
months_num["June"]

6

In [33]:
X

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_date,inspection_score,Neighborhoods,Current Police Districts,months_num
0,94127,20624,95313,38798_20170928,September,94.0,95.0,9.0,9
1,94127,20624,95313,5796_20170825,August,98.0,63.0,10.0,8
2,94127,20624,95313,2808_20170621,June,81.0,67.0,10.0,6
3,94127,20624,95313,4630_20170304,March,96.0,67.0,10.0,3
4,94127,20624,95313,2808_20190103,January,74.0,67.0,10.0,1
...,...,...,...,...,...,...,...,...,...
20964,94101,0,0,65856_20190117,January,86.0,32.0,5.0,1
20965,94101,0,0,65856_20190828,August,84.0,32.0,5.0,8
20966,94101,0,0,65856_20170821,August,73.0,32.0,5.0,8
20967,94101,0,0,65856_20170821,August,73.0,32.0,5.0,8


In [34]:
# drop inspection_date
X = X.drop(columns=['inspection_date'])
X.head()

Unnamed: 0,business_postal_code,population,avg_income,inspection_id,inspection_score,Neighborhoods,Current Police Districts,months_num
0,94127,20624,95313,38798_20170928,94.0,95.0,9.0,9
1,94127,20624,95313,5796_20170825,98.0,63.0,10.0,8
2,94127,20624,95313,2808_20170621,81.0,67.0,10.0,6
3,94127,20624,95313,4630_20170304,96.0,67.0,10.0,3
4,94127,20624,95313,2808_20190103,74.0,67.0,10.0,1


In [35]:
X.shape

(20969, 8)

In [36]:
# Split data to test and train 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, train_size=0.80)


In [37]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16775, 8)
(4194, 8)
(16775,)
(4194,)


In [38]:
from collections import Counter
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [39]:
Counter(y_resampled)

Counter({0: 12002, 1: 8821})

In [40]:
#Counter(y_resampled)

In [41]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=250) 

In [42]:
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

In [43]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_resampled)


In [44]:
predictions

array([0, 0, 0, ..., 1, 1, 1])

In [45]:
from sklearn.metrics import confusion_matrix
predictions = rf_model.predict(X_test)
confusion_matrix(y_test, predictions)

array([[3080,  544],
       [ 267,  303]])

In [46]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8066285169289461

In [47]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [48]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3080,544
Actual 1,267,303


Accuracy Score : 0.8066285169289461
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.85      0.88      3624
           1       0.36      0.53      0.43       570

    accuracy                           0.81      4194
   macro avg       0.64      0.69      0.66      4194
weighted avg       0.84      0.81      0.82      4194



In [49]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, predictions)

0.6907342860462414

In [50]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.85      0.53      0.88      0.67      0.47      3624
          1       0.36      0.53      0.85      0.43      0.67      0.44       570

avg / total       0.84      0.81      0.57      0.82      0.67      0.46      4194



In [51]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.05148948, 0.06345439, 0.06179532, 0.12206796, 0.48911463,
       0.07086021, 0.05554257, 0.08567543])

In [52]:
# sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_resampled.columns), reverse=True)

[(0.48911462503626524, 'inspection_score'),
 (0.12206795829371764, 'inspection_id'),
 (0.08567543286659947, 'months_num'),
 (0.07086021249371112, 'Neighborhoods'),
 (0.06345439442884783, 'population '),
 (0.061795323734392615, 'avg_income'),
 (0.05554257303411686, 'Current Police Districts'),
 (0.051489480112349335, 'business_postal_code')]