In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib


In [2]:
# Read the raw CSV (path is relative to the notebook) and report its dimensions.
df = pd.read_csv('../data/diabetes.csv')
print("Initial dataset shape:", df.shape)

# 'Outcome' is the prediction target; every remaining column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

Initial dataset shape: (768, 9)


In [3]:
# Zeros in these columns encode missing measurements, so convert them to NaN.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Snapshot of the affected columns, taken before any replacement.
zero_encoded = X[zero_as_missing]

print("\nZero counts before replacement:")
print(zero_encoded.eq(0).sum())

X[zero_as_missing] = zero_encoded.replace(0, np.nan)

print("\nMissing values after replacement:")
print(X.isna().sum())

# Remove duplicate rows.
# The target is attached while de-duplicating so a row only counts as a
# duplicate when its features AND its label match, and so X and y stay
# aligned on the same surviving rows afterwards.
labelled = X.assign(Outcome=y)
before_dupes = len(labelled)
labelled = labelled.drop_duplicates()
after_dupes = len(labelled)

print(f"\nDuplicates removed: {before_dupes - after_dupes}")

# Split the de-duplicated frame back into target and features.
y = labelled['Outcome']
X = labelled.drop(columns='Outcome')


Zero counts before replacement:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

Missing values after replacement:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

Duplicates removed: 0


In [4]:
# Feature Engineering
# Interaction term: element-wise product of BMI and Age.
# NOTE(review): BMI contains NaN at this point (its zeros were converted to
# NaN in an earlier cell), so BMI_Age is NaN on those same rows. If a later
# imputation step fills BMI_Age independently of BMI, the two columns will no
# longer satisfy BMI_Age == BMI * Age on imputed rows — confirm this is intended.
X['BMI_Age'] = X['BMI'].mul(X['Age'])

print("\nFeature list after engineering:")
print(list(X.columns))


Feature list after engineering:
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'BMI_Age']


In [5]:
# Train / Validation / Test Split
# Two-stage stratified split: first hold out HOLDOUT_FRACTION of all rows,
# then divide that hold-out between validation and test. Both stages stratify
# on the label so each subset keeps approximately the class ratio of `y`.
SPLIT_SEED = 42                # shared by both stages so the split is reproducible
HOLDOUT_FRACTION = 0.30        # share of all rows that leaves the training set
TEST_SHARE_OF_HOLDOUT = 0.50   # share of the hold-out that becomes the test set

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=HOLDOUT_FRACTION,
    stratify=y,
    random_state=SPLIT_SEED,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=TEST_SHARE_OF_HOLDOUT,
    stratify=y_temp,
    random_state=SPLIT_SEED,
)

# Sanity checks: the three subsets must account for every input row, and each
# feature frame must stay row-aligned with its label series. These fail loudly
# if an upstream cell is edited in a way that breaks the split.
assert len(X_train) + len(X_val) + len(X_test) == len(X), \
    "train/val/test sizes do not add up to the full dataset"
for _features, _labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    assert _features.index.equals(_labels.index), \
        "features and labels are misaligned after splitting"

print("\nDataset splits:")
print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)

# Class proportions per subset; reported before any resampling is applied.
print("\nClass distribution check (before SMOTE):")
print("Train:", y_train.value_counts(normalize=True))
print("Validation:", y_val.value_counts(normalize=True))
print("Test:", y_test.value_counts(normalize=True))


Dataset splits:
Train: (537, 9)
Validation: (115, 9)
Test: (116, 9)

Class distribution check (before SMOTE):
Train: Outcome
0    0.651769
1    0.348231
Name: proportion, dtype: float64
Validation: Outcome
0    0.652174
1    0.347826
Name: proportion, dtype: float64
Test: Outcome
0    0.646552
1    0.353448
Name: proportion, dtype: float64
