In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Split the Data into Training and Testing Sets

### Read the data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Load the diabetes health-indicators CSV from the Resources folder
file_path = Path("./Resources") / "diabetes_012_health_indicators_BRFSS2015.csv"
df_health = pd.read_csv(file_path)

# Preview the first and the last five rows of the DataFrame
display(df_health.head(), df_health.tail())

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0
253679,2.0,1.0,1.0,1.0,25.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,2.0


### Create the target labels set (`y`) from the `Diabetes_012` column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Split the DataFrame into the target labels and the feature columns

# y: the labels, taken from the "Diabetes_012" column
y = df_health.loc[:, 'Diabetes_012']

# X: the features, i.e. every column except the label column
X = df_health.drop('Diabetes_012', axis=1)

In [4]:
# Peek at the first five labels in y
y.head(5)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_012, dtype: float64

In [5]:
# Peek at the first five rows of the feature DataFrame
X.head(5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


### Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Count the rows in each class (most frequent first) to gauge class imbalance
y.value_counts(sort=True)

0.0    213703
2.0     35346
1.0      4631
Name: Diabetes_012, dtype: int64

### Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import train_test_split from scikit-learn
from sklearn.model_selection import train_test_split

# Split the features and labels into training and testing sets.
# random_state=1 makes the shuffle repeatable; stratify=y keeps the class
# proportions of y in both the training and the testing split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

### Use StandardScaler to scale the feature data (`X_train` and `X_test`).

In [8]:
from sklearn.preprocessing import StandardScaler

# Build the scaler that will standardize the feature columns
scaler = StandardScaler()

In [9]:
# Learn the scaling parameters from the training features only, so that
# nothing about the testing set influences the scaling
X_scaler = scaler.fit(X=X_train)

In [10]:
# Apply the fitted scaler to both the testing and the training features
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled = X_scaler.transform(X_train)

## Create a Random Forest Model with the Scaled Data

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest of 500 trees with a fixed seed for repeatable results
rf_model = RandomForestClassifier(random_state=78, n_estimators=500)

In [12]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [13]:
# Predict a class for every row of the scaled testing features
rf_predictions = rf_model.predict(X=X_test_scaled)

In [14]:
# Accuracy: fraction of test rows whose predicted class matches the true class
rf_acc_score = accuracy_score(y_test, rf_predictions)

# Confusion matrix wrapped in a labelled DataFrame (rows = actual class, columns = predicted class)
rf_cm = confusion_matrix(y_test, rf_predictions)
rf_cm_df = pd.DataFrame(
    data=rf_cm,
    columns=["Predicted 0", "Predicted 1", "Predicted 2"],
    index=["No Diabetes 0", "Prediabetes 1", "Diabetes 2"],
)

In [15]:
# Show the confusion matrix, the accuracy score and the per-class metrics
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report", classification_report(y_test, rf_predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
No Diabetes 0,51696,44,1686
Prediabetes 1,1043,0,115
Diabetes 2,7122,9,1705


Accuracy Score : 0.8420214443393251
Classification Report
              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     53426
         1.0       0.00      0.00      0.00      1158
         2.0       0.49      0.19      0.28      8836

    accuracy                           0.84     63420
   macro avg       0.45      0.39      0.40     63420
weighted avg       0.80      0.84      0.81     63420



## Create a Logistic Regression Model with the Scaled Data

In [16]:
# Import LogisticRegression from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the logistic regression classifier (lbfgs solver, seed fixed at 1)
lr_model = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the scaled training features; as the last expression of the cell,
# the fitted estimator is echoed as the cell output
lr_model.fit(X=X_train_scaled, y=y_train)

LogisticRegression(random_state=1)

In [17]:
# Predict a class for every row of the scaled testing features
lr_predictions = lr_model.predict(X=X_test_scaled)

In [18]:
# Accuracy: fraction of test rows whose predicted class matches the true class
lr_acc_score = accuracy_score(y_test, lr_predictions)

# Confusion matrix wrapped in a labelled DataFrame (rows = actual class, columns = predicted class)
lr_cm = confusion_matrix(y_test, lr_predictions)
lr_cm_df = pd.DataFrame(
    data=lr_cm,
    columns=["Predicted 0", "Predicted 1", "Predicted 2"],
    index=["No Diabetes 0", "Prediabetes 1", "Diabetes 2"],
)

In [19]:
# Show the confusion matrix, the accuracy score and the per-class metrics
print("Confusion Matrix")
display(lr_cm_df)
print(f"Accuracy Score : {lr_acc_score}")
print("Classification Report")
# This model never predicts class 1.0 (the "Predicted 1" column of the confusion
# matrix is all zeros), so that class's precision is 0/0. zero_division=0 reports
# the undefined metric as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning for each metric computation; the reported numbers are unchanged.
print(classification_report(y_test, lr_predictions, zero_division=0))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
No Diabetes 0,52144,0,1282
Prediabetes 1,1058,0,100
Diabetes 2,7304,0,1532


Accuracy Score : 0.8463576158940397
Classification Report
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     53426
         1.0       0.00      0.00      0.00      1158
         2.0       0.53      0.17      0.26      8836

    accuracy                           0.85     63420
   macro avg       0.46      0.38      0.39     63420
weighted avg       0.80      0.85      0.81     63420



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Fit and Evaluate a Random Forest Model with Resampled Training Data

### Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [20]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler with a random_state of 1
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html
# https://www.youtube.com/watch?v=P9328wVm6ao
ros = RandomOverSampler(random_state=1)

# Resample the scaled training data; only the training split is oversampled,
# the testing split is left untouched
X_resampled, y_resampled = ros.fit_resample(X=X_train_scaled, y=y_train)

In [21]:
# Confirm that every class now has the same number of rows after oversampling
y_resampled.value_counts(sort=True)

0.0    160277
2.0    160277
1.0    160277
Name: Diabetes_012, dtype: int64

In [22]:
# Instantiate the Random Forest model
# (500 trees and random_state=78, the same settings as the forest trained on the original data)
ros_model_rf = RandomForestClassifier(random_state=78, n_estimators=500)

# Train the forest on the oversampled training data
ros_model_rf.fit(X=X_resampled, y=y_resampled)

# Predict on the scaled testing features, which were not resampled
ros_predictions_rf = ros_model_rf.predict(X=X_test_scaled)

In [23]:
# Accuracy: fraction of test rows whose predicted class matches the true class
ros_acc_score_rf = accuracy_score(y_test, ros_predictions_rf)

# Confusion matrix wrapped in a labelled DataFrame (rows = actual class, columns = predicted class)
ros_cm_rf = confusion_matrix(y_test, ros_predictions_rf)
ros_cm_rf_df = pd.DataFrame(
    data=ros_cm_rf,
    columns=["Predicted 0", "Predicted 1", "Predicted 2"],
    index=["No Diabetes 0", "Prediabetes 1", "Diabetes 2"],
)

In [24]:
# Show the confusion matrix, the accuracy score and the per-class metrics
print("Confusion Matrix")
display(ros_cm_rf_df)
print(f"Accuracy Score : {ros_acc_score_rf}")
print("Classification Report", classification_report(y_test, ros_predictions_rf), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
No Diabetes 0,49409,216,3801
Prediabetes 1,909,2,247
Diabetes 2,5940,37,2859


Accuracy Score : 0.8241879533270262
Classification Report
              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     53426
         1.0       0.01      0.00      0.00      1158
         2.0       0.41      0.32      0.36      8836

    accuracy                           0.82     63420
   macro avg       0.43      0.42      0.42     63420
weighted avg       0.80      0.82      0.81     63420



## Fit and Evaluate a Logistic Regression Model with Resampled Training Data

### Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [25]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler with a random_state of 1.
# This uses the same seed and inputs as the resampling done for the Random Forest
# section; it is repeated here, presumably so this section can be run on its own.
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html
# https://www.youtube.com/watch?v=P9328wVm6ao
ros = RandomOverSampler(random_state=1)

# Resample the scaled training data; only the training split is oversampled,
# the testing split is left untouched
X_resampled, y_resampled = ros.fit_resample(X=X_train_scaled, y=y_train)

In [26]:
# Confirm that every class now has the same number of rows after oversampling
y_resampled.value_counts(sort=True)

0.0    160277
2.0    160277
1.0    160277
Name: Diabetes_012, dtype: int64

In [27]:
# Build the logistic regression classifier (lbfgs solver, seed fixed at 1)
ros_model_lr = LogisticRegression(random_state=1, solver='lbfgs')

# Train the model on the oversampled training data
ros_model_lr.fit(X=X_resampled, y=y_resampled)

# Predict on the scaled testing features, which were not resampled
ros_predictions_lr = ros_model_lr.predict(X=X_test_scaled)

In [28]:
# Accuracy: fraction of test rows whose predicted class matches the true class
ros_acc_score_lr = accuracy_score(y_test, ros_predictions_lr)

# Confusion matrix wrapped in a labelled DataFrame (rows = actual class, columns = predicted class)
ros_cm_lr = confusion_matrix(y_test, ros_predictions_lr)
ros_cm_lr_df = pd.DataFrame(
    data=ros_cm_lr,
    columns=["Predicted 0", "Predicted 1", "Predicted 2"],
    index=["No Diabetes 0", "Prediabetes 1", "Diabetes 2"],
)

In [29]:
# Show the confusion matrix, the accuracy score and the per-class metrics
print("Confusion Matrix")
display(ros_cm_lr_df)
print(f"Accuracy Score : {ros_acc_score_lr}")
print("Classification Report", classification_report(y_test, ros_predictions_lr), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
No Diabetes 0,35360,9074,8992
Prediabetes 1,305,357,496
Diabetes 2,1518,2140,5178


Accuracy Score : 0.6448281299274676
Classification Report
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     53426
         1.0       0.03      0.31      0.06      1158
         2.0       0.35      0.59      0.44      8836

    accuracy                           0.64     63420
   macro avg       0.44      0.52      0.43     63420
weighted avg       0.85      0.64      0.72     63420

