In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the diabetes dataset from the working directory into a DataFrame.
df = pd.read_csv("diabetes.csv")

# Step 1: Cleaning the Data
# Report, per column, how many entries are null in the file as loaded.
print("Missing values before handling:", df.isna().sum())

# Step 2: Handling Missing Values
# In these columns a 0 is treated as a placeholder for "not recorded"
# (presumably because a zero reading is not plausible -- confirm against the
# dataset documentation), so it is converted to NaN before imputing.
columns_with_zeros = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
df[columns_with_zeros] = df[columns_with_zeros].replace(to_replace=0, value=np.nan)

# Show how many entries per column are now flagged as missing.
print("Missing values after replacing zeros:", df.isna().sum())

# Fill every flagged entry with the median of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df[columns_with_zeros] = imputer.fit(df[columns_with_zeros]).transform(df[columns_with_zeros])

# Step 3: Transformation
# Nothing further to transform at this stage.

# Step 4: Normalization
# Standardize the imputed columns together with the remaining numeric
# features; 'Outcome' is not in the list and is therefore left unscaled.
columns_to_scale = columns_with_zeros + ['Pregnancies', 'DiabetesPedigreeFunction', 'Age']
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit(df[columns_to_scale]).transform(df[columns_to_scale])

# Step 5: Encoding
# Nothing to encode here: per the original author's note, the Outcome column
# is already encoded.

# Step 6: Feature Engineering
# No derived features are added at this stage.

# Preview the first five rows of the preprocessed DataFrame.
print(df.head(n=5))


Missing values before handling: Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Missing values after replacing zeros: Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.866045      -0.031990       0.670643 -0.181541  0.166619   
1    -0.844885 -1.205066      -0.528319      -0.012301 -0.181541 -0.852200   
2     1.233880  2.016662      -0.693761      -0.012301 -0.181541 -1.332500   
3    -0.844885 -1.073567      -0.5