In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import requests
import json
from pathlib import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
data = Path('./Resources/DataProcessingExtractFile-RawData.csv')
df = pd.read_csv(data)
df.head(10)

Unnamed: 0,Diabetes_Status,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,1,1.0,1.0,1.0,4,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,1,0.0,0.0,0.0,3,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,1,1.0,1.0,1.0,3,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,1,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,1,1.0,1.0,1.0,2,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0
5,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,1.0,10.0,6.0,8.0
6,1,1.0,0.0,1.0,4,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,1,3,0.0,0.0,9.0,6.0,7.0
7,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,1,1,1.0,0.0,11.0,4.0,4.0
8,2,1.0,1.0,1.0,4,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,6,6,1.0,0.0,9.0,5.0,1.0
9,1,0.0,0.0,1.0,2,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,1,1,0.0,1.0,8.0,4.0,3.0


In [3]:
# Random Forest Model

X = df.copy()
X = df.drop(df.columns[[0,1]], axis=1)
X.head()

Unnamed: 0,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,3,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0


In [4]:
y = df["Diabetes_Status"].ravel()
y[:5]

array([1, 1, 1, 1, 1])

In [5]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [6]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [8]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [9]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 1, ..., 1, 2, 1])

In [10]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,50635,2713
Actual 1,7894,2178


In [11]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8327499211605172

In [12]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,50635,2713
Actual 1,7894,2178


Accuracy Score : 0.8327499211605172
Classification Report
              precision    recall  f1-score   support

           1       0.87      0.95      0.91     53348
           2       0.45      0.22      0.29     10072

    accuracy                           0.83     63420
   macro avg       0.66      0.58      0.60     63420
weighted avg       0.80      0.83      0.81     63420



In [13]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.03620573, 0.0054599 , 0.05812235, 0.0419158 , 0.01584019,
       0.02223732, 0.03366242, 0.04181514, 0.03373854, 0.00994884,
       0.01154454, 0.02087445, 0.09205389, 0.04694685, 0.05631653,
       0.03312359, 0.03468449, 0.16846254, 0.09516603, 0.14188088])

In [14]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1684625357867824, 'Age'),
 (0.14188087913675332, 'Income'),
 (0.09516602583939433, 'Education'),
 (0.09205389463070586, 'GenHlth'),
 (0.05812234583140896, 'BMI_Range'),
 (0.056316533607825084, 'Physical_Health_Range'),
 (0.04694684782412168, 'Mental_Health_Range'),
 (0.04191580072867783, 'Smoker'),
 (0.04181514000991582, 'Fruits'),
 (0.036205725682980076, 'HighChol'),
 (0.03468448723553452, 'Sex'),
 (0.0337385365951434, 'Veggies'),
 (0.0336624196578226, 'PhysActivity'),
 (0.03312358737797923, 'DiffWalk'),
 (0.022237315289232198, 'HeartDiseaseorAttack'),
 (0.020874450329278642, 'NoDocbcCost'),
 (0.015840193729732627, 'Stroke'),
 (0.011544540394619541, 'AnyHealthcare'),
 (0.009948836624424194, 'HvyAlcoholConsump'),
 (0.005459903687667746, 'CholCheck')]

In [18]:
pickle.dump(rf_model, open('model.pkl','wb'))