In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import requests
import json
from pathlib import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Location of the pre-processed survey extract, relative to the notebook.
data = Path('./Resources/DataProcessingExtractFile-SampleOne.csv')

# Read the extract into a DataFrame and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,Diabetes_Status,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,0,1,1.0,1.0,1.0,4,1.0,0.0,0.0,0.0,...,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,1,1,0.0,0.0,0.0,3,1.0,0.0,0.0,1.0,...,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,2,1,1.0,1.0,1.0,3,0.0,0.0,0.0,0.0,...,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,3,1,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,4,1,1.0,1.0,1.0,2,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0
5,5,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,...,1.0,0.0,2.0,1,1,0.0,1.0,10.0,6.0,8.0
6,6,1,1.0,0.0,1.0,4,1.0,0.0,0.0,0.0,...,1.0,0.0,3.0,1,3,0.0,0.0,9.0,6.0,7.0
7,7,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,1,1,1.0,0.0,11.0,4.0,4.0
8,8,2,1.0,1.0,1.0,4,1.0,0.0,1.0,0.0,...,1.0,0.0,5.0,6,6,1.0,0.0,9.0,5.0,1.0
9,9,1,0.0,0.0,1.0,2,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,1,1,0.0,1.0,8.0,4.0,3.0


In [3]:
# Random Forest Model

# Build the feature matrix. `DataFrame.drop` already returns a new frame, so
# no prior copy of `df` is needed.
# Column 0 is the leftover CSV index column (dropped by position, as before);
# the target is dropped by name so a change in column order can never leak
# `Diabetes_Status` into the features.
X = df.drop(columns=[df.columns[0], "Diabetes_Status"])
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,4,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,3,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,1.0,3,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,1.0,2,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0


In [4]:
# Target vector as a 1-D NumPy array.
# `Series.ravel()` is deprecated in recent pandas releases; a Series is already
# one-dimensional, so `to_numpy()` yields the same array without the warning.
y = df["Diabetes_Status"].to_numpy()
y[:5]

array([1, 1, 1, 1, 1])

In [5]:
# Hold out a test set; stratifying on y keeps the class proportions the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [6]:
# Standardise the features. The scaler learns its statistics from the
# training partition only, so nothing about the test rows influences it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [7]:
# Random forest of 128 trees; the fixed seed makes training repeatable.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [8]:
# Train the forest on the standardised training features and their labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict a class label for every row of the standardised test features.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([1, 1, 1, ..., 1, 2, 1])

In [10]:
# Calculating the confusion matrix.
# The row/column captions are derived from the class values that actually
# occur (the target is coded 1/2 here, not 0/1), so the table cannot be
# mislabeled and still works if another class appears in the data.
class_labels = np.union1d(y_test, predictions)
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,44743,2771
Actual 1,7727,2205


In [11]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.8172544650628416

In [12]:
# Summarise the evaluation: confusion matrix, accuracy, then per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Heading and report are emitted as two newline-separated lines.
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,44743,2771
Actual 1,7727,2205


Accuracy Score : 0.8172544650628416
Classification Report
              precision    recall  f1-score   support

           1       0.85      0.94      0.90     47514
           2       0.44      0.22      0.30      9932

    accuracy                           0.82     57446
   macro avg       0.65      0.58      0.60     57446
weighted avg       0.78      0.82      0.79     57446



In [13]:
# Calculate feature importance in the Random Forest model.
# The array holds one weight per feature the model was fitted on; a later cell
# pairs these values with `X.columns` to attach the feature names.
importances = rf_model.feature_importances_
importances

array([0.04628988, 0.0300354 , 0.004928  , 0.05292581, 0.0412091 ,
       0.01483166, 0.02058573, 0.03443553, 0.04167014, 0.03372169,
       0.00934239, 0.01096677, 0.01953467, 0.08121689, 0.04391416,
       0.05192752, 0.02731703, 0.03365938, 0.16722384, 0.09662487,
       0.13763952])

In [14]:
# Pair every importance value with its feature name, then list the pairs
# from most to least important.
importance_name_pairs = zip(rf_model.feature_importances_, X.columns)
sorted(importance_name_pairs, reverse=True)

[(0.16722384265211326, 'Age'),
 (0.13763952206063224, 'Income'),
 (0.09662487024110564, 'Education'),
 (0.08121689126336264, 'GenHlth'),
 (0.052925813734272076, 'BMI_Range'),
 (0.05192751612644789, 'Physical_Health_Range'),
 (0.04628987680905071, 'HighBP'),
 (0.04391416059258605, 'Mental_Health_Range'),
 (0.041670143917955475, 'Fruits'),
 (0.041209097179659655, 'Smoker'),
 (0.03443553312415648, 'PhysActivity'),
 (0.033721691502435786, 'Veggies'),
 (0.03365937953046361, 'Sex'),
 (0.03003540061862769, 'HighChol'),
 (0.027317031847651167, 'DiffWalk'),
 (0.020585728756999332, 'HeartDiseaseorAttack'),
 (0.019534672194636886, 'NoDocbcCost'),
 (0.014831660518579173, 'Stroke'),
 (0.01096677388066286, 'AnyHealthcare'),
 (0.009342392306696982, 'HvyAlcoholConsump'),
 (0.00492800114190434, 'CholCheck')]

In [15]:
# Persist the trained model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare `open(...)` never closed it).
# NOTE(review): the model was fitted on StandardScaler output, and the scaler
# itself is not saved here - whoever loads this pickle must apply the same
# scaling to inputs before calling `predict`.
with open('model_2.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [16]:
# Hand-made sample row: one value (all 1s) for each of the 21 model features.
array = [1] * 21

In [17]:
# Predict the class of the hand-made sample row.
# The forest was trained on standardised features, so the raw sample has to go
# through the same fitted scaler first; feeding unscaled values puts the row in
# a different feature space than the one the trees were split on.
# The row is wrapped in a DataFrame carrying the training column names so it
# matches the layout the scaler was fitted on.
sample_row = pd.DataFrame([array], columns=X.columns)
y_pred = rf_model.predict(X_scaler.transform(sample_row))
y_pred

array([1])