In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib


In [2]:
# Read the raw diabetes table from disk and report its dimensions.
df = pd.read_csv('../data/diabetes.csv')
print("Initial dataset shape:", df.shape)

# Split into the feature matrix (every column except 'Outcome')
# and the target vector ('Outcome').
y = df['Outcome']
X = df.drop(columns='Outcome')

Initial dataset shape: (768, 9)


In [3]:
# Recode zeros as missing values in the listed columns, then drop duplicate rows.

# Columns in which a stored 0 is treated as "missing".
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("\nZero counts before replacement:")
print(X[zero_as_missing].eq(0).sum())

# Swap every 0 in those columns for NaN so a later imputer can fill them.
X[zero_as_missing] = X[zero_as_missing].replace(0, np.nan)

print("\nMissing values after replacement:")
print(X.isnull().sum())

# De-duplicate on features + target together, so two rows only count as
# duplicates when their labels match as well.
labelled = X.assign(Outcome=y)
before_dupes = len(labelled)
labelled = labelled.drop_duplicates()
after_dupes = len(labelled)

print(f"\nDuplicates removed: {before_dupes - after_dupes}")

# Split the de-duplicated frame back into target and features.
y = labelled['Outcome']
X = labelled.drop(columns='Outcome')


Zero counts before replacement:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

Missing values after replacement:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

Duplicates removed: 0


In [4]:
# Feature engineering: add a BMI x Age interaction column.
# NOTE(review): BMI contains NaN at this point (zeros were recoded earlier), so
# BMI_Age is NaN for those rows and will be imputed as its own column later,
# independently of the imputed BMI value.
X = X.assign(BMI_Age=X['BMI'] * X['Age'])

print("\nFeature list after engineering:")
print(list(X.columns))


Feature list after engineering:
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'BMI_Age']


In [5]:
# Stratified 70 / 15 / 15 split into train, validation and test sets.

# Step 1: hold out 30% of the rows, preserving the class ratio of y.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Step 2: cut the held-out 30% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("\nDataset splits:")
for split_label, split_features in (
    ("Train:", X_train),
    ("Validation:", X_val),
    ("Test:", X_test),
):
    print(split_label, split_features.shape)

# Stratification should leave the class proportions nearly equal across splits.
print("\nClass distribution check (before SMOTE):")
for split_label, split_target in (
    ("Train:", y_train),
    ("Validation:", y_val),
    ("Test:", y_test),
):
    print(split_label, split_target.value_counts(normalize=True))


Dataset splits:
Train: (537, 9)
Validation: (115, 9)
Test: (116, 9)

Class distribution check (before SMOTE):
Train: Outcome
0    0.651769
1    0.348231
Name: proportion, dtype: float64
Validation: Outcome
0    0.652174
1    0.347826
Name: proportion, dtype: float64
Test: Outcome
0    0.646552
1    0.353448
Name: proportion, dtype: float64


In [6]:
# Preprocessing: median-impute, then standardise, every feature column.

# All columns of X are routed through the numeric pipeline.
numeric_features = X.columns

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
])

# Statistics (medians, means, stds) are learned from the training split only;
# validation and test are transformed with those fitted statistics.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Sanity checks on the transformed training matrix.
print("Checks after preprocessing:")

train_has_nan = np.any(np.isnan(X_train_processed))
print("Any NaNs in processed train data:", train_has_nan)

train_col_means = X_train_processed.mean(axis=0)
print("Mean of first 5 features (train):", train_col_means[:5])

train_col_stds = X_train_processed.std(axis=0)
print("Std of first 5 features (train):", train_col_stds[:5])

print("Processed feature count:", X_train_processed.shape[1])

Checks after preprocessing:
Any NaNs in processed train data: False
Mean of first 5 features (train): [ 2.31554895e-17  1.98475625e-17 -2.61326239e-16  1.90205807e-16
  1.32317083e-16]
Std of first 5 features (train): [1. 1. 1. 1. 1.]
Processed feature count: 9


In [7]:
# Balance the training classes with SMOTE oversampling.
# Only the training split is resampled; validation and test are left as-is.

class_counts_before = Counter(y_train)
print("\nClass distribution BEFORE SMOTE:")
print(class_counts_before)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

class_counts_after = Counter(y_train_balanced)
print("\nClass distribution AFTER SMOTE:")
print(class_counts_after)


Class distribution BEFORE SMOTE:
Counter({0: 350, 1: 187})

Class distribution AFTER SMOTE:
Counter({1: 350, 0: 350})


In [8]:
# Persist the fitted preprocessing pipeline so it can be reused at inference time.

joblib.dump(preprocessor, 'preprocessing_pipeline.pkl')

# Reload check (reproducibility): the reloaded pipeline must reproduce the
# test-set transform computed by the in-memory pipeline.
loaded_preprocessor = joblib.load('preprocessing_pipeline.pkl')
X_test_check = loaded_preprocessor.transform(X_test)

# Comparing shapes alone would pass even if the reloaded pipeline produced
# different numbers, so the values are compared as well.
reload_matches = (
    X_test_check.shape == X_test_processed.shape
    and np.allclose(X_test_check, X_test_processed)
)

print("\nPipeline reload successful:",
    bool(reload_matches))

print("\nPreprocessing complete.")
print("Final training set shape after SMOTE:", X_train_balanced.shape)


Pipeline reload successful: True

Preprocessing complete.
Final training set shape after SMOTE: (700, 9)


In [9]:
# Bundle every split into one dictionary and write it to disk for the
# model-training stage. Only the training split has been SMOTE-balanced.
processed_data = dict(
    X_train=X_train_balanced,
    y_train=y_train_balanced,
    X_val=X_val_processed,
    y_val=y_val,
    X_test=X_test_processed,
    y_test=y_test,
)

joblib.dump(processed_data, '../data/processed_diabetes_data.pkl')

print("Processed data exported for model training.")

Processed data exported for model training.
