## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


## Load Dataset

In [2]:
# Load the original dataset
df = pd.read_csv('data/diabetes.csv')

print(f"✓ Dataset loaded: {df.shape}")
print(f"  Rows: {df.shape[0]}")
print(f"  Columns: {df.shape[1]}")

✓ Dataset loaded: (768, 9)
  Rows: 768
  Columns: 9


## Identify Columns with Zero Problems

In [3]:
# These columns shouldn't have zeros (medically impossible)
zero_not_allowed = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("Checking for impossible zero values:\n")
print("-" * 60)

for col in zero_not_allowed:
    zero_count = (df[col] == 0).sum()
    zero_pct = (zero_count / len(df)) * 100
    print(f"{col:20s}: {zero_count:4d} zeros ({zero_pct:5.2f}%)")

Checking for impossible zero values:

------------------------------------------------------------
Glucose             :    5 zeros ( 0.65%)
BloodPressure       :   35 zeros ( 4.56%)
SkinThickness       :  227 zeros (29.56%)
Insulin             :  374 zeros (48.70%)
BMI                 :   11 zeros ( 1.43%)


## Replace zero with NAN

In [4]:
# Replace zeros with NaN (proper way to mark missing data)
df[zero_not_allowed] = df[zero_not_allowed].replace(0, np.nan)

print("✓ Zeros replaced with NaN (missing values)\n")
print("Missing values after replacement:")
print(df.isnull().sum())

✓ Zeros replaced with NaN (missing values)

Missing values after replacement:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


## Fill missing values with median

In [5]:
# Fill missing values with the median (middle value)
print("\nFilling missing values with median...\n")
print("-" * 60)

for col in zero_not_allowed:
    if df[col].isnull().sum() > 0:
        median_value = df[col].median()
        df[col].fillna(median_value, inplace=True)
        print(f"{col:20s}: Filled with median = {median_value:.2f}")

print("\n✓ All missing values filled!")


Filling missing values with median...

------------------------------------------------------------
Glucose             : Filled with median = 117.00
BloodPressure       : Filled with median = 72.00
SkinThickness       : Filled with median = 29.00
Insulin             : Filled with median = 125.00
BMI                 : Filled with median = 32.30

✓ All missing values filled!


## Verify no missing values exist

In [6]:
# Check if we successfully handled all missing values
print("Final missing value check:")
print("-" * 60)
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

if df.isnull().sum().sum() == 0:
    print("\n✓ SUCCESS: No missing values remaining!")
else:
    print("\n⚠ WARNING: Some missing values still exist!")

Final missing value check:
------------------------------------------------------------
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Total missing values: 0

✓ SUCCESS: No missing values remaining!


## Separate Features and Targets

In [7]:
# Separate input features (X) from output target (y)
X = df.drop('Outcome', axis=1)  # All columns except 'Outcome'
y = df['Outcome']                # Only the 'Outcome' column

print("Feature matrix (X) shape:", X.shape)
print("Target vector (y) shape:", y.shape)
print("\nFeatures (what we use to predict):")
print(X.columns.tolist())
print("\nTarget (what we're trying to predict):")
print("Outcome: 0=Non-Diabetic, 1=Diabetic")

Feature matrix (X) shape: (768, 8)
Target vector (y) shape: (768,)

Features (what we use to predict):
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

Target (what we're trying to predict):
Outcome: 0=Non-Diabetic, 1=Diabetic


## Splitting data into training and test set

In [8]:
# Split data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 20% for testing
    random_state=42,    # For reproducibility
    stratify=y          # Keep same diabetic/non-diabetic ratio in both sets
)

print("Data split completed!")
print("-" * 60)
print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Testing set:  {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

print("\nClass distribution in training set:")
print(y_train.value_counts())
print("\nClass distribution in testing set:")
print(y_test.value_counts())

Data split completed!
------------------------------------------------------------
Training set: 614 samples (79.9%)
Testing set:  154 samples (20.1%)

Class distribution in training set:
Outcome
0    400
1    214
Name: count, dtype: int64

Class distribution in testing set:
Outcome
0    100
1     54
Name: count, dtype: int64


## Scales the features

In [9]:
# Create a scaler object
scaler = StandardScaler()

# Fit the scaler on training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier viewing
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

print("✓ Features scaled successfully!")
print("\nBefore scaling (sample):")
print(X_train.head(3))
print("\nAfter scaling (sample):")
print(X_train_scaled.head(3))

✓ Features scaled successfully!

Before scaling (sample):
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
353            1     90.0           62.0           12.0     43.0  27.2   
711            5    126.0           78.0           27.0     22.0  29.6   
373            2    105.0           58.0           40.0     94.0  34.9   

     DiabetesPedigreeFunction  Age  
353                     0.580   24  
711                     0.439   40  
373                     0.225   25  

After scaling (sample):
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0    -0.851355 -1.056427      -0.826740      -1.918187 -1.203361 -0.769477   
1     0.356576  0.144399       0.477772      -0.229874 -1.470195 -0.417498   
2    -0.549372 -0.556083      -1.152868       1.233330 -0.555335  0.359790   

   DiabetesPedigreeFunction       Age  
0                  0.310794 -0.792169  
1                 -0.116439  0.561034  
2                 -0.764862 -0.707594  

## Check Scaling Results

In [10]:
# Verify scaling worked correctly
print("Checking scaled data statistics:")
print("-" * 60)
print("\nMeans (should be close to 0):")
print(X_train_scaled.mean())

print("\nStandard deviations (should be close to 1):")
print(X_train_scaled.std())

Checking scaled data statistics:
------------------------------------------------------------

Means (should be close to 0):
Pregnancies                -6.943414e-17
Glucose                    -1.099374e-16
BloodPressure               3.095606e-16
SkinThickness              -3.471707e-17
Insulin                    -4.339634e-18
BMI                        -1.148556e-15
DiabetesPedigreeFunction   -1.099374e-16
Age                        -1.084908e-16
dtype: float64

Standard deviations (should be close to 1):
Pregnancies                 1.000815
Glucose                     1.000815
BloodPressure               1.000815
SkinThickness               1.000815
Insulin                     1.000815
BMI                         1.000815
DiabetesPedigreeFunction    1.000815
Age                         1.000815
dtype: float64


## Save Processed Data

In [11]:
# Save the preprocessed data for later use
X_train_scaled.to_csv('data/X_train_scaled.csv', index=False)
X_test_scaled.to_csv('data/X_test_scaled.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

print("✓ Preprocessed data saved!")
print("\nSaved files:")
print("  ✓ data/X_train_scaled.csv")
print("  ✓ data/X_test_scaled.csv")
print("  ✓ data/y_train.csv")
print("  ✓ data/y_test.csv")

✓ Preprocessed data saved!

Saved files:
  ✓ data/X_train_scaled.csv
  ✓ data/X_test_scaled.csv
  ✓ data/y_train.csv
  ✓ data/y_test.csv


## Save Scaler Object

In [13]:
import joblib

# Save the scaler for future use (deployment)
joblib.dump(scaler, 'models/scaler.pkl')

print("✓ Scaler saved to: models/scaler.pkl")

✓ Scaler saved to: models/scaler.pkl


## Data Processing Summary

In [15]:
print("=" * 70)
print("DATA PREPROCESSING SUMMARY")
print("=" * 70)

print(f"\n✓ Original dataset: {df.shape[0]} patients, {df.shape[1]} features")
print(f"✓ Handled missing values: {zero_not_allowed}")
print(f"✓ Training set: {len(X_train)} samples")
print(f"✓ Testing set: {len(X_test)} samples")
print(f"✓ Features scaled using StandardScaler")
print(f"✓ All data saved to data/ folder")
print(f"✓ Scaler saved to models/ folder")

print("\n" + "=" * 70)
print("READY FOR MODEL TRAINING!")
print("=" * 70)

DATA PREPROCESSING SUMMARY

✓ Original dataset: 768 patients, 9 features
✓ Handled missing values: ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
✓ Training set: 614 samples
✓ Testing set: 154 samples
✓ Features scaled using StandardScaler
✓ All data saved to data/ folder
✓ Scaler saved to models/ folder

READY FOR MODEL TRAINING!
