## Part 1: Prepare the Data

In [1]:
# Import Dependencies
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

Load the data into a Pandas DataFrame and fetch the top 10 rows.

In [2]:
# Read in CSV
file_path = Path("../Resources/diabetes_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0.0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1.0,1.0,1.0,1.0,30.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.

In [3]:
# List dataframe data types
df.dtypes

Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

Remove all rows with `null` values if any.

In [4]:
# Find null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column Diabetes_binary has 0 null values
Column HighBP has 0 null values
Column HighChol has 0 null values
Column CholCheck has 0 null values
Column BMI has 0 null values
Column Smoker has 0 null values
Column Stroke has 0 null values
Column HeartDiseaseorAttack has 0 null values
Column PhysActivity has 0 null values
Column Fruits has 0 null values
Column Veggies has 0 null values
Column HvyAlcoholConsump has 0 null values
Column AnyHealthcare has 0 null values
Column NoDocbcCost has 0 null values
Column GenHlth has 0 null values
Column MentHlth has 0 null values
Column PhysHlth has 0 null values
Column DiffWalk has 0 null values
Column Sex has 0 null values
Column Age has 0 null values
Column Education has 0 null values
Column Income has 0 null values


In [5]:
# Look at min and max values
for column in df.columns:
    print(f"Column {column} has {df[column].min()} as minimum value")
    print(f"Column {column} has {df[column].max()} as maximum value")

Column Diabetes_binary has 0.0 as minimum value
Column Diabetes_binary has 1.0 as maximum value
Column HighBP has 0.0 as minimum value
Column HighBP has 1.0 as maximum value
Column HighChol has 0.0 as minimum value
Column HighChol has 1.0 as maximum value
Column CholCheck has 0.0 as minimum value
Column CholCheck has 1.0 as maximum value
Column BMI has 12.0 as minimum value
Column BMI has 98.0 as maximum value
Column Smoker has 0.0 as minimum value
Column Smoker has 1.0 as maximum value
Column Stroke has 0.0 as minimum value
Column Stroke has 1.0 as maximum value
Column HeartDiseaseorAttack has 0.0 as minimum value
Column HeartDiseaseorAttack has 1.0 as maximum value
Column PhysActivity has 0.0 as minimum value
Column PhysActivity has 1.0 as maximum value
Column Fruits has 0.0 as minimum value
Column Fruits has 1.0 as maximum value
Column Veggies has 0.0 as minimum value
Column Veggies has 1.0 as maximum value
Column HvyAlcoholConsump has 0.0 as minimum value
Column HvyAlcoholConsump h

No null values were found. Check for duplicates.

In [6]:
# Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 24206


In [7]:
# Drop duplicate entries
df = df.drop_duplicates()

In [8]:
# Confirm that duplicate entries have been dropped
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


Check for duplicates but ignore the target variable.

In [9]:
# Find duplicate entries ignoring target column
features = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income']
print(f"Duplicate entries excluding the target variable: {df.duplicated(subset=features).sum()}")

Duplicate entries excluding the target variable: 1566


In [10]:
# Drop duplicate entries
df = df.drop_duplicates(subset=features, keep=False)

In [11]:
# Confirm that duplicate entries have been dropped
print(f"Duplicate entries excluding the target variable: {df.duplicated(subset=features).sum()}")

Duplicate entries excluding the target variable: 0


In [12]:
# Evaluate shape after dropping duplicates
df.shape

(226342, 22)

Export CSV for Tableau Visualizations

In [13]:
# export csv. commented out to avoid overwriting file once exported
#df.to_csv('../Resources/viz_data.csv', index_label='index')

Standardize the dataset so that columns that contain larger values do not influence the outcome more than columns with smaller values. Exclude columns that already have binary or binned data from scaling.

In [14]:
# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['BMI', 'MentHlth', 'PhysHlth']])

In [15]:
# Get list of the columns from the original DataFrame so we know what to add to scaled data
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [16]:
# Create a DataFrame with the transformed data
scaled_data = pd.DataFrame(scaled_data, columns=['BMI', 'MentHlth', 'PhysHlth'])
scaled_data['Diabetes_binary'] = df['Diabetes_binary']
scaled_data['HighBP'] = df['HighBP']
scaled_data['HighChol'] = df['HighChol']
scaled_data['CholCheck'] = df['CholCheck']
scaled_data['Smoker'] = df['Smoker']
scaled_data['Stroke'] = df['Stroke']
scaled_data['HeartDiseaseorAttack'] = df['HeartDiseaseorAttack']
scaled_data['PhysActivity'] = df['PhysActivity']
scaled_data['Fruits'] = df['Fruits']
scaled_data['Veggies'] = df['Veggies']
scaled_data['HvyAlcoholConsump'] = df['HvyAlcoholConsump']
scaled_data['AnyHealthcare'] = df['AnyHealthcare']
scaled_data['NoDocbcCost'] = df['NoDocbcCost']
scaled_data['GenHlth'] = df['GenHlth']
scaled_data['DiffWalk'] = df['DiffWalk']
scaled_data['Sex'] = df['Sex']
scaled_data['Age'] = df['Age']
scaled_data['Education'] = df['Education']
scaled_data['Income'] = df['Income']
scaled_data.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,1.657061,1.861123,1.12771,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0
1,-0.541693,-0.458522,-0.521575,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0
2,-0.101943,3.407553,2.776996,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0
3,-0.248526,-0.458522,-0.521575,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0
4,-0.688277,-0.071915,-0.521575,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0


In [17]:
# function by Boern found in the following link 
# https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [18]:
df_cleaned = clean_dataset(scaled_data)

In [19]:
# Assign feature (X) and target (y) variables
X = df_cleaned.drop('Diabetes_binary', axis=1)
y = df_cleaned['Diabetes_binary']

Look at X and y to make sure everything looks as expected.

In [20]:
# Preview X
X.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,1.657061,1.861123,1.12771,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0
1,-0.541693,-0.458522,-0.521575,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0
2,-0.101943,3.407553,2.776996,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0
3,-0.248526,-0.458522,-0.521575,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0
4,-0.688277,-0.071915,-0.521575,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0


In [21]:
# Prevew y
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [22]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [23]:
def model_tester(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f'Training Score: {clf.score(X_train, y_train)}')
    print(f'Testing Score: {clf.score(X_test, y_test)}')

In [24]:
# Look at different Logistic Regression models and find bester performing for further tuning
model_tester(LogisticRegression(random_state=42), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=500), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=1000), X, y)
model_tester(LogisticRegression(random_state=42, max_iter=10000), X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.49      0.12      0.20      7473

    accuracy                           0.85     50659
   macro avg       0.68      0.55      0.56     50659
weighted avg       0.81      0.85      0.81     50659

Training Score: 0.8524230959039316
Testing Score: 0.8518525829566316
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.49      0.12      0.20      7473

    accuracy                           0.85     50659
   macro avg       0.68      0.55      0.56     50659
weighted avg       0.81      0.85      0.81     50659

Training Score: 0.8524296759335417
Testing Score: 0.8518723227856847
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.49      0.12      0.20      7473

    accuracy                           0.85 

The second model with max_iter=500 is the model selected as the 'best' LogisticRegression model. The accuracy scores leveled off for 1000 and 10000 suggesting there were diminishing returns for the additional iterations.

In [25]:
# Look at different RandomForest models and find bester performing for further tuning
model_tester(RandomForestClassifier(random_state=42), X, y)
model_tester(RandomForestClassifier(random_state=42, bootstrap=False), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200, bootstrap=False), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=500), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=500, bootstrap=False), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     43186
         1.0       0.44      0.12      0.19      7473

    accuracy                           0.85     50659
   macro avg       0.65      0.55      0.55     50659
weighted avg       0.80      0.85      0.81     50659

Training Score: 0.9981049514722816
Testing Score: 0.8471347638129454
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91     43186
         1.0       0.40      0.15      0.22      7473

    accuracy                           0.84     50659
   macro avg       0.63      0.56      0.57     50659
weighted avg       0.80      0.84      0.81     50659

Training Score: 0.9981181115315019
Testing Score: 0.8407390591997473
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     43186
         1.0       0.45      0.13      0.20      7473

    accuracy                           0.85 

The third model with n_estimators=200 and the default bootstrap setting is selected as the 'best' Random Forest model. This is due to it having the highest testing accuracy score.

In [26]:
# Look at different AdaBoost models and find bester performing for further tuning
model_tester(AdaBoostClassifier(random_state=42, n_estimators=100), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=500, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=1000, learning_rate=0.1), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.51      0.14      0.22      7473

    accuracy                           0.85     50659
   macro avg       0.69      0.56      0.57     50659
weighted avg       0.82      0.85      0.82     50659

Training Score: 0.8533706201677907
Testing Score: 0.853431769280878
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.51      0.14      0.22      7473

    accuracy                           0.85     50659
   macro avg       0.69      0.56      0.57     50659
weighted avg       0.82      0.85      0.82     50659

Training Score: 0.8533772001974009
Testing Score: 0.8535502082551966
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     43186
         1.0       0.51      0.10      0.16      7473

    accuracy                           0.85  

The models with n_estimators=200 are performing the best. Next, we will try to further tune by changing the learning_rate and keeping the n_estimators the same.

In [27]:
# further tune based on results of previous cell
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.05), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.01), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.51      0.14      0.22      7473

    accuracy                           0.85     50659
   macro avg       0.69      0.56      0.57     50659
weighted avg       0.82      0.85      0.82     50659

Training Score: 0.8533772001974009
Testing Score: 0.8535502082551966
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     43186
         1.0       0.51      0.10      0.16      7473

    accuracy                           0.85     50659
   macro avg       0.69      0.54      0.54     50659
weighted avg       0.81      0.85      0.81     50659

Training Score: 0.8538312222405001
Testing Score: 0.8532738506484534
              precision    recall  f1-score   support

         0.0       0.86      0.99      0.92     43186
         1.0       0.51      0.04      0.07      7473

    accuracy                           0.85 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92     43186
         1.0       0.00      0.00      0.00      7473

    accuracy                           0.85     50659
   macro avg       0.43      0.50      0.46     50659
weighted avg       0.73      0.85      0.78     50659

Training Score: 0.8531666392498767
Testing Score: 0.8524842574863302


The model with n_estimaors=200 and default learning_rate=1 is selected as the 'best' Ada Boost model. This is due to it having the highest testing accuracy score. An argument could be made for learning_rate=0.01, if a subject matter expert preferred the slightly higher recall over the slightly higher accuracy score.

In [29]:
# The best of each type of model
model_tester(LogisticRegression(random_state=42, max_iter=500), X, y)
model_tester(RandomForestClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.49      0.12      0.20      7473

    accuracy                           0.85     50659
   macro avg       0.68      0.55      0.56     50659
weighted avg       0.81      0.85      0.81     50659

Training Score: 0.8524296759335417
Testing Score: 0.8518723227856847
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     43186
         1.0       0.45      0.13      0.20      7473

    accuracy                           0.85     50659
   macro avg       0.66      0.55      0.56     50659
weighted avg       0.80      0.85      0.81     50659

Training Score: 0.9981181115315019
Testing Score: 0.8480033162912809
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     43186
         1.0       0.51      0.14      0.22      7473

    accuracy                           0.85 

The best performing model of all models tested is the Ada Boost Classifier. With a testing accuracy score of 0.85355 it edged out the Logistic Regression (0.85187) and the Random Forest Classifier (0.84800). It also had tied for the best recall with 0.98.