In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# List the working directory to check that the dataset CSV is available.
%ls

diabetes_012_health_indicators_updated.csv  [0m[01;34msample_data[0m/


In [3]:
# Load the diabetes health-indicators dataset.
# pd.read_csv already returns a DataFrame, so no further wrapping is needed.
df = pd.read_csv('diabetes_012_health_indicators_updated.csv')

In [4]:
# Peek at one randomly chosen row to get a feel for the columns and values.
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
1587,1587,2.0,1.0,Yes,1.0,30.0,Yes,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,0.0,No,female,65.0,4.0,8.0


**Data Preprocessing**

In [5]:
# Start the binary target 'Diabetes_01' as a copy of the three-level
# 'Diabetes_012' column; it is collapsed to two classes in a later step.
df["Diabetes_01"] = df["Diabetes_012"]

In [6]:
# Spot-check one random row now that 'Diabetes_01' has been added.
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_01
14483,14483,0.0,0.0,No,1.0,22.0,Yes,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,No,female,67.0,6.0,8.0,0.0


In [7]:
# Collapse the target to two classes: values 1 and 2 both become 1, 0 stays 0.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df['Diabetes_01']`: that form mutates an
# intermediate Series, triggers a FutureWarning in pandas 2.x, and does not
# update `df` at all when Copy-on-Write is enabled.
df['Diabetes_01'] = df['Diabetes_01'].replace([1, 2], 1)

In [8]:
# Summarise the DataFrame: index range, per-column non-null counts and dtypes,
# and memory usage. Columns reported as `object` still need numeric encoding
# before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21852 entries, 0 to 21851
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            21852 non-null  int64  
 1   Diabetes_012          21852 non-null  float64
 2   HighBP                21852 non-null  float64
 3   HighChol              21852 non-null  object 
 4   CholCheck             21852 non-null  float64
 5   BMI                   21852 non-null  float64
 6   Smoker                21852 non-null  object 
 7   Stroke                21852 non-null  float64
 8   HeartDiseaseorAttack  21852 non-null  float64
 9   PhysActivity          21852 non-null  float64
 10  Fruits                21852 non-null  object 
 11  Veggies               21852 non-null  object 
 12  HvyAlcoholConsump     21852 non-null  float64
 13  AnyHealthcare         21852 non-null  float64
 14  NoDocbcCost           21852 non-null  float64
 15  GenHlth            

In [9]:
# Remove the leftover index column and the two healthcare-access columns,
# which are not used in the rest of the analysis.
columns_to_drop = ['Unnamed: 0', 'AnyHealthcare', 'NoDocbcCost']
df = df.drop(columns=columns_to_drop)

In [10]:
# Replace values in specific columns of the DataFrame 'df' as a step of pre-processing
df['HighChol'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['DiffWalk'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['Veggies'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['Fruits'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['Smoker'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['Sex'].replace(['male', 'female'], [1, 0], inplace=True)

In [15]:
def up_sample(data=None, random_state=123):
    """Balance the classes by oversampling the ``Diabetes_01 == 1`` rows.

    Rows with ``Diabetes_01 == 1`` (treated as the minority class) are drawn
    with replacement until there are as many of them as rows with
    ``Diabetes_01 == 0`` (treated as the majority class).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing the ``Diabetes_01`` and ``Diabetes_012`` columns.
        Defaults to the notebook-level ``df``.
    random_state : int, optional
        Seed for the random draw, for reproducible results.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame (both target columns removed) and the
        ``Diabetes_01`` target of the balanced data. Use as
        ``x, y = up_sample()``.

    Raises
    ------
    ValueError
        If either class has no rows.
    """
    if data is None:
        data = df

    # Separate majority and minority classes
    df_majority = data[data.Diabetes_01 == 0]
    df_minority = data[data.Diabetes_01 == 1]
    if df_majority.empty or df_minority.empty:
        raise ValueError("up_sample needs at least one row of each class")

    # Upsample the minority class to match the majority class. The target size
    # is taken from the data rather than hard-coded, so it stays correct
    # whatever the size of the frame.
    df_minority_upsampled = df_minority.sample(n=len(df_majority),
                                               replace=True,
                                               random_state=random_state)

    # Concatenate the upsampled minority class with the majority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Separate features (x) and target variable (y) and hand them back to the
    # caller; inspect the balance with `y.value_counts()`.
    y = df_upsampled.Diabetes_01
    x = df_upsampled.drop(["Diabetes_01", "Diabetes_012"], axis=1)
    return x, y


In [16]:
def down_sample(data=None, random_state=123):
    """Balance the classes by undersampling the ``Diabetes_01 == 0`` rows.

    Rows with ``Diabetes_01 == 0`` (treated as the majority class) are drawn
    without replacement until there are as many of them as rows with
    ``Diabetes_01 == 1`` (treated as the minority class).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing the ``Diabetes_01`` and ``Diabetes_012`` columns.
        Defaults to the notebook-level ``df``.
    random_state : int, optional
        Seed for the random draw, for reproducible results.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame (both target columns removed) and the
        ``Diabetes_01`` target of the balanced data. Use as
        ``x, y = down_sample()``.

    Raises
    ------
    ValueError
        If either class has no rows, or (raised by pandas) if class 0 has
        fewer rows than class 1 and so cannot be sampled without replacement.
    """
    if data is None:
        data = df

    # Separate majority and minority classes
    df_majority = data[data.Diabetes_01 == 0]
    df_minority = data[data.Diabetes_01 == 1]
    if df_majority.empty or df_minority.empty:
        raise ValueError("down_sample needs at least one row of each class")

    # Downsample the majority class to match the minority class. The target
    # size is taken from the data rather than hard-coded, so it can never
    # exceed the rows actually available.
    df_majority_downsampled = df_majority.sample(n=len(df_minority),
                                                 replace=False,
                                                 random_state=random_state)

    # Concatenate the downsampled majority class with the minority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    # Separate features (x) and target variable (y) and hand them back to the
    # caller; inspect the balance with `y.value_counts()`.
    y = df_downsampled.Diabetes_01
    x = df_downsampled.drop(["Diabetes_01", "Diabetes_012"], axis=1)
    return x, y


In [17]:
# Target: the binary 'Diabetes_01' label.
y = df['Diabetes_01']
# Features: every remaining column except the two target encodings.
target_columns = ["Diabetes_01", "Diabetes_012"]
x = df.drop(columns=target_columns)

In [18]:
# Hold out 20% of the rows for testing.
# - random_state fixes the shuffle so a Restart & Run All reproduces the same split.
# - stratify=y keeps the class proportions of the target the same in both
#   partitions, which matters because the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123, stratify=y
)