## Part 1: Prepare the Data

In [1]:
# Import Dependencies
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

Load the data into a Pandas DataFrame and fetch the top 10 rows.

In [2]:
# Read in CSV
# The path is relative to the notebook's working directory.
file_path = Path("../Resources/diabetes_data.csv")
df = pd.read_csv(file_path)
# Last expression of the cell: renders the first 10 rows as a table
df.head(10)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0.0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1.0,1.0,1.0,1.0,30.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.

In [3]:
# List dataframe data types
# Shows the dtype pandas inferred for each column so mismatches with the stored data stand out.
df.dtypes

Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

Remove all rows with `null` values if any.

In [4]:
# Count missing values once per column, then report each column on its own line
null_counts = df.isnull().sum()
for column, n_null in null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column Diabetes_binary has 0 null values
Column HighBP has 0 null values
Column HighChol has 0 null values
Column CholCheck has 0 null values
Column BMI has 0 null values
Column Smoker has 0 null values
Column Stroke has 0 null values
Column HeartDiseaseorAttack has 0 null values
Column PhysActivity has 0 null values
Column Fruits has 0 null values
Column Veggies has 0 null values
Column HvyAlcoholConsump has 0 null values
Column AnyHealthcare has 0 null values
Column NoDocbcCost has 0 null values
Column GenHlth has 0 null values
Column MentHlth has 0 null values
Column PhysHlth has 0 null values
Column DiffWalk has 0 null values
Column Sex has 0 null values
Column Age has 0 null values
Column Education has 0 null values
Column Income has 0 null values


In [5]:
# Report the value range of every column to spot out-of-range entries
for column in df.columns:
    col_values = df[column]
    lowest = col_values.min()
    highest = col_values.max()
    print(f"Column {column} has {lowest} as minimum value")
    print(f"Column {column} has {highest} as maximum value")

Column Diabetes_binary has 0.0 as minimum value
Column Diabetes_binary has 1.0 as maximum value
Column HighBP has 0.0 as minimum value
Column HighBP has 1.0 as maximum value
Column HighChol has 0.0 as minimum value
Column HighChol has 1.0 as maximum value
Column CholCheck has 0.0 as minimum value
Column CholCheck has 1.0 as maximum value
Column BMI has 12.0 as minimum value
Column BMI has 98.0 as maximum value
Column Smoker has 0.0 as minimum value
Column Smoker has 1.0 as maximum value
Column Stroke has 0.0 as minimum value
Column Stroke has 1.0 as maximum value
Column HeartDiseaseorAttack has 0.0 as minimum value
Column HeartDiseaseorAttack has 1.0 as maximum value
Column PhysActivity has 0.0 as minimum value
Column PhysActivity has 1.0 as maximum value
Column Fruits has 0.0 as minimum value
Column Fruits has 1.0 as maximum value
Column Veggies has 0.0 as minimum value
Column Veggies has 1.0 as maximum value
Column HvyAlcoholConsump has 0.0 as minimum value
Column HvyAlcoholConsump h

No null values were found. Check for duplicates.

In [6]:
# Count rows that repeat an earlier row across all columns
duplicate_count = df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 24206


In [7]:
# Drop duplicate entries
# Rows are compared on all columns; the first occurrence of each group is kept
# (the drop_duplicates default).
df = df.drop_duplicates()

In [8]:
# Re-run the duplicate count; it should now be zero
remaining_duplicates = df.duplicated().sum()
print(f"Duplicate entries: {remaining_duplicates}")

Duplicate entries: 0


Check for duplicates but ignore the target variable.

In [9]:
# Find duplicate entries ignoring target column
# Build the feature list from the frame itself (every column except the target)
# rather than retyping all the column names, so it cannot drift out of sync
# with the loaded data.
features = [column for column in df.columns if column != 'Diabetes_binary']
print(f"Duplicate entries excluding the target variable: {df.duplicated(subset=features).sum()}")

Duplicate entries excluding the target variable: 1566


In [10]:
# Drop duplicate entries
# keep=False removes every row in a group that shares all feature values, not
# just the later copies. Fully identical rows were already dropped earlier, so
# rows that still match on every feature must differ in the target, i.e. they
# carry conflicting labels. Because whole groups are removed, more rows are
# dropped than the duplicate count printed by the previous cell.
df = df.drop_duplicates(subset=features, keep=False)

In [11]:
# Re-count feature-only duplicates; it should now be zero
remaining_feature_duplicates = df.duplicated(subset=features).sum()
print(f"Duplicate entries excluding the target variable: {remaining_feature_duplicates}")

Duplicate entries excluding the target variable: 0


In [12]:
# Evaluate shape after dropping duplicates: (rows, columns)
df.shape

(226342, 22)

Export CSV for Tableau Visualizations

In [13]:
# export csv. commented out to avoid overwriting file once exported
#df.to_csv('../Resources/viz_data.csv', index_label='index')

In [14]:
# function by Boern found in the following link 
# https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows containing NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is left untouched; the cleaning is done on a new
        frame so re-running the cell (or reusing ``df`` afterwards) is safe.

    Returns
    -------
    pd.DataFrame
        The surviving rows, original index labels preserved, with every
        column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True returns a new frame instead of mutating
    # the caller's object.
    without_nan = df.dropna()
    # axis is passed by keyword: the positional form `.any(1)` is rejected by
    # pandas 2.x, while the keyword form is accepted by older versions too.
    indices_to_keep = ~without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[indices_to_keep].astype(np.float64)

In [15]:
# Remove rows containing NaN or +/-inf and cast every column to float64
df_cleaned = clean_dataset(df)

In [16]:
# Separate the label column (y) from the predictor columns (X)
target_column = 'Diabetes_binary'
y = df_cleaned[target_column]
X = df_cleaned.drop(columns=target_column)

Look at X and y to make sure everything looks as expected.

In [17]:
# Preview X (first five rows of the feature columns)
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [18]:
# Preview y (first five target values)
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [19]:
# Split the data into X_train, X_test, y_train, y_test
# No test_size is given, so scikit-learn's default split proportion applies;
# random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
def model_tester(model, X, y, random_state=42):
    """Fit ``model`` on a train split of ``X``/``y`` and print how it performs.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in
        place by this call.
    X : features to split and learn from.
    y : target values aligned with ``X``.
    random_state : int, default 42
        Seed handed to ``train_test_split``. The default reproduces the same
        split the notebook creates at top level with ``random_state=42``.

    Returns
    -------
    None
        Results are printed: a header naming the model, the classification
        report for the test split, then the train and test scores.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    # Header line so that several calls stacked in one cell can be told apart
    # in the combined output.
    print(f'Model: {model}')
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    # A large gap between these two scores points to overfitting.
    print(f'Training Score: {clf.score(X_train, y_train)}')
    print(f'Testing Score: {clf.score(X_test, y_test)}')

In [21]:
# Compare LogisticRegression under several iteration caps to find the best performer for further tuning.
# The empty dict leaves max_iter at the estimator's own default.
logreg_settings = [
    {},
    {'max_iter': 500},
    {'max_iter': 1000},
    {'max_iter': 10000},
]
for settings in logreg_settings:
    model_tester(LogisticRegression(random_state=42, **settings), X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.52      0.16      0.25      8334

    accuracy                           0.85     56586
   macro avg       0.70      0.57      0.58     56586
weighted avg       0.82      0.85      0.82     56586

Training Score: 0.8536546572727916
Testing Score: 0.8547343865973915
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     48252
         1.0       0.54      0.16      0.25      8334

    accuracy                           0.86     56586
   macro avg       0.70      0.57      0.59     56586
weighted avg       0.82      0.86      0.82     56586

Training Score: 0.8550095431089328
Testing Score: 0.8562011805040116
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     48252
         1.0       0.54      0.16      0.25      8334

    accuracy                           0.86 

The second model with max_iter=500 is the model selected as the 'best' LogisticRegression model. The accuracy scores leveled off for 1000 and 10000 suggesting there were diminishing returns for the additional iterations.

In [22]:
# Compare RandomForest variants (forest size x bootstrap on/off) to find the best performer for further tuning.
# Empty / missing keys leave the corresponding argument at the estimator's default.
forest_settings = [
    {},
    {'bootstrap': False},
    {'n_estimators': 200},
    {'n_estimators': 200, 'bootstrap': False},
    {'n_estimators': 500},
    {'n_estimators': 500, 'bootstrap': False},
]
for settings in forest_settings:
    model_tester(RandomForestClassifier(random_state=42, **settings), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.53      0.18      0.26      8334

    accuracy                           0.86     56586
   macro avg       0.70      0.57      0.59     56586
weighted avg       0.82      0.86      0.82     56586

Training Score: 0.9999882183840335
Testing Score: 0.8556003251687697
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.92     48252
         1.0       0.49      0.20      0.29      8334

    accuracy                           0.85     56586
   macro avg       0.68      0.58      0.60     56586
weighted avg       0.82      0.85      0.82     56586

Training Score: 1.0
Testing Score: 0.8515357155480154
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.53      0.18      0.26      8334

    accuracy                           0.86     56586
   ma

The fifth model in the cell above, with n_estimators=500 and the default bootstrap setting, is selected as the 'best' Random Forest model. This is due to it having the highest testing accuracy score.

In [23]:
# Compare AdaBoost variants (number of estimators, with and without a reduced learning rate)
# to find the best performer for further tuning.
adaboost_settings = [
    {'n_estimators': 100},
    {'n_estimators': 200},
    {'n_estimators': 200, 'learning_rate': 0.1},
    {'n_estimators': 500, 'learning_rate': 0.1},
    {'n_estimators': 1000, 'learning_rate': 0.1},
]
for settings in adaboost_settings:
    model_tester(AdaBoostClassifier(random_state=42, **settings), X, y)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     48252
         1.0       0.55      0.20      0.29      8334

    accuracy                           0.86     56586
   macro avg       0.71      0.59      0.61     56586
weighted avg       0.83      0.86      0.83     56586

Training Score: 0.8568356935837319
Testing Score: 0.8583571908245856
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     48252
         1.0       0.55      0.20      0.29      8334

    accuracy                           0.86     56586
   macro avg       0.71      0.58      0.61     56586
weighted avg       0.83      0.86      0.83     56586

Training Score: 0.8568062395438159
Testing Score: 0.8583571908245856
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     48252
         1.0       0.57      0.15      0.24      8334

    accuracy                           0.86 

The models with n_estimators=1000 are performing the best. Next, we will try to further tune by changing the learning_rate and keeping the n_estimators the same.

In [24]:
# Further tuning based on the previous cell: hold n_estimators at 1000 and vary the learning rate.
# The empty dict leaves learning_rate at the estimator's own default.
learning_rate_settings = [
    {},
    {'learning_rate': 0.1},
    {'learning_rate': 0.05},
    {'learning_rate': 0.01},
]
for settings in learning_rate_settings:
    model_tester(AdaBoostClassifier(random_state=42, n_estimators=1000, **settings), X, y)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     48252
         1.0       0.55      0.20      0.29      8334

    accuracy                           0.86     56586
   macro avg       0.71      0.59      0.61     56586
weighted avg       0.83      0.86      0.83     56586

Training Score: 0.8570300902471784
Testing Score: 0.858374863040328
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.56      0.19      0.29      8334

    accuracy                           0.86     56586
   macro avg       0.72      0.58      0.60     56586
weighted avg       0.83      0.86      0.83     56586

Training Score: 0.8569063832795306
Testing Score: 0.8584808963347824
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.56      0.19      0.28      8334

    accuracy                           0.86  

The model with n_estimators=1000 and learning_rate=0.1 is selected as the 'best' Ada Boost model. (Note that 0.1 is not the default learning rate; the first model in the cell above uses the default and scores slightly lower.) This is due to it having the highest testing accuracy score. An argument could be made for learning_rate=0.01, if a subject matter expert preferred the slightly higher recall over the slightly higher accuracy score.

In [25]:
# Head-to-head run of the best configuration found for each model family.
# Each estimator is built inside the loop so a fitted model is not kept alive
# after its report has been printed.
best_of_each = [
    (LogisticRegression, {'max_iter': 500}),
    (RandomForestClassifier, {'n_estimators': 500}),
    (AdaBoostClassifier, {'n_estimators': 1000, 'learning_rate': 0.1}),
]
for estimator_cls, settings in best_of_each:
    model_tester(estimator_cls(random_state=42, **settings), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     48252
         1.0       0.54      0.16      0.25      8334

    accuracy                           0.86     56586
   macro avg       0.70      0.57      0.59     56586
weighted avg       0.82      0.86      0.82     56586

Training Score: 0.8550095431089328
Testing Score: 0.8562011805040116
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.54      0.18      0.27      8334

    accuracy                           0.86     56586
   macro avg       0.71      0.58      0.59     56586
weighted avg       0.82      0.86      0.82     56586

Training Score: 1.0
Testing Score: 0.8567843636235111
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     48252
         1.0       0.56      0.19      0.29      8334

    accuracy                           0.86     56586
   ma

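The best performing model of all models tested is the Ada Boost Classifier. With a testing accuracy score of 0.85848 it edged out the Logistic Regression (0.85620) and the Random Forest Classifier (0.85678). It also had the best recall for the positive class (1.0) at 0.19, compared with 0.16 for Logistic Regression and 0.18 for Random Forest; its recall for the negative class (0.0) was 0.97. Recall on the positive class is low for all three models, so accuracy alone overstates how well they identify positive cases.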

The models with unscaled data performed better than the models with scaled data.

In [28]:
# import pickle
import pickle

# save best performing model
# Refit the selected AdaBoost configuration on the notebook-level training split.
model = AdaBoostClassifier(random_state=42, n_estimators=1000, learning_rate=0.1).fit(X_train, y_train)

# Write through a context manager so the file handle is flushed and closed
# even if pickling raises; a bare open() left the handle open.
# NOTE(review): the file holds a binary pickle despite the .json extension.
# The path is left unchanged so anything that loads it keeps working.
with open('../Model/model.json', 'wb') as model_file:
    pickle.dump(model, model_file)

In [32]:
# Preview the training features; row labels are the original DataFrame index values
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
85161,0.0,1.0,1.0,27.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,13.0,6.0,6.0
246686,1.0,1.0,0.0,31.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,5.0,5.0
228308,1.0,1.0,1.0,28.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,30.0,1.0,0.0,13.0,4.0,5.0
70000,0.0,1.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,9.0,5.0,7.0
153929,0.0,0.0,1.0,38.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,1.0,5.0,5.0,8.0


In [33]:
# List the feature columns, in order, that the saved model was fitted on
X_train.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')