## Part 1: Prepare the Data

In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Load the data into a Pandas DataFrame and fetch the top 10 rows.

In [2]:
# Read in CSV
file_path = Path("../Resources/diabetes_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0.0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1.0,1.0,1.0,1.0,30.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.

In [3]:
# List dataframe data types
df.dtypes

Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

Remove all rows with `null` values if any.

In [4]:
# Find null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column Diabetes_binary has 0 null values
Column HighBP has 0 null values
Column HighChol has 0 null values
Column CholCheck has 0 null values
Column BMI has 0 null values
Column Smoker has 0 null values
Column Stroke has 0 null values
Column HeartDiseaseorAttack has 0 null values
Column PhysActivity has 0 null values
Column Fruits has 0 null values
Column Veggies has 0 null values
Column HvyAlcoholConsump has 0 null values
Column AnyHealthcare has 0 null values
Column NoDocbcCost has 0 null values
Column GenHlth has 0 null values
Column MentHlth has 0 null values
Column PhysHlth has 0 null values
Column DiffWalk has 0 null values
Column Sex has 0 null values
Column Age has 0 null values
Column Education has 0 null values
Column Income has 0 null values


No null values were found.

Standardize the dataset so that columns that contain larger values do not influence the outcome more than columns with smaller values. Exclude columns that already have binary or binned data from scaling.

In [5]:
# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['BMI', 'MentHlth', 'PhysHlth']])

In [6]:
# Get list of the columns from the original DataFrame so we know what to add to scaled data
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [7]:
# Create a DataFrame with the transformed data
scaled_data = pd.DataFrame(scaled_data, columns=['BMI', 'MentHlth', 'PhysHlth'])
scaled_data['Diabetes_binary'] = df['Diabetes_binary']
scaled_data['HighBP'] = df['HighBP']
scaled_data['HighChol'] = df['HighChol']
scaled_data['CholCheck'] = df['CholCheck']
scaled_data['Smoker'] = df['Smoker']
scaled_data['Stroke'] = df['Stroke']
scaled_data['HeartDiseaseorAttack'] = df['HeartDiseaseorAttack']
scaled_data['PhysActivity'] = df['PhysActivity']
scaled_data['Fruits'] = df['Fruits']
scaled_data['Veggies'] = df['Veggies']
scaled_data['HvyAlcoholConsump'] = df['HvyAlcoholConsump']
scaled_data['AnyHealthcare'] = df['AnyHealthcare']
scaled_data['NoDocbcCost'] = df['NoDocbcCost']
scaled_data['GenHlth'] = df['GenHlth']
scaled_data['DiffWalk'] = df['DiffWalk']
scaled_data['Sex'] = df['Sex']
scaled_data['Age'] = df['Age']
scaled_data['Education'] = df['Education']
scaled_data['Income'] = df['Income']
scaled_data.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,1.757936,1.998592,1.233999,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0
1,-0.511806,-0.42963,-0.486592,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0
2,-0.057858,3.617407,2.95459,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0
3,-0.209174,-0.42963,-0.486592,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0
4,-0.663122,-0.024926,-0.486592,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0


In [8]:
# Assign feature (X) and target (y) variables
X = scaled_data.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

Look at X and y to make sure everything looks as expected.

In [9]:
# Preview X
X.head()

Unnamed: 0,BMI,MentHlth,PhysHlth,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income
0,1.757936,1.998592,1.233999,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0
1,-0.511806,-0.42963,-0.486592,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0
2,-0.057858,3.617407,2.95459,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0
3,-0.209174,-0.42963,-0.486592,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0
4,-0.663122,-0.024926,-0.486592,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0


In [10]:
# Prevew y
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [11]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)