<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #FFFFFF; text-align: center; margin: 0;">🔄 Diabetes Classification Data Processing</h1>
    <p style="color: #CCCCCC; text-align: center; margin-top: 10px;">Data Processing, Feature Engineering & Validation Pipeline</p>
</div>

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📚 Import Required Libraries</h2>
</div>

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import sys
warnings.filterwarnings('ignore')

sys.path.append(str(Path.cwd().parent))

from src.data.data_processing import DataProcessor
from src.data.data_versioning import DataVersioner
from src.features.feature_engineering import FeatureEngineer
from src.training.imbalance_handler import ImbalanceHandler

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📥 Load and Prepare Data</h2>
</div>

In [2]:
# Load the diabetes dataset
data_path = '../data/extracted/diabetes_prediction_dataset/diabetes_prediction_dataset.csv'
data = pd.read_csv(data_path)

print(f"Loaded data shape: {data.shape}")
display(data.head())

Loaded data shape: (100000, 9)


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔍 Initial Data Analysis</h2>
</div>

In [3]:
# Display initial statistics
print("\nFeature Information:")
display(data.info())

print("\nMissing Values:")
display(data.isnull().sum())

print("\nDuplicate Rows:", data.duplicated().sum())

print("\nStatistics for Numerical Features:")
display(data[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']].describe())

print("\nUnique values in categorical features:")
categorical_features = ['gender', 'smoking_history']
for feature in categorical_features:
    print(f"\n{feature}:")
    display(data[feature].value_counts())


Feature Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


None


Missing Values:


gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


Duplicate Rows: 3854

Statistics for Numerical Features:


Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level
count,100000.0,100000.0,100000.0,100000.0
mean,41.885856,27.320767,5.527507,138.05806
std,22.51684,6.636783,1.070672,40.708136
min,0.08,10.01,3.5,80.0
25%,24.0,23.63,4.8,100.0
50%,43.0,27.32,5.8,140.0
75%,60.0,29.58,6.2,159.0
max,80.0,95.69,9.0,300.0



Unique values in categorical features:

gender:


gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64


smoking_history:


smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Step 1: Feature Engineering</h2>
    <p style="color: #CCCCCC;">We do feature engineering first, before any splitting or preprocessing.</p>
</div>

In [4]:
# Initialize feature engineer
fe = FeatureEngineer()

# Transform all features
engineered_data = fe.transform(data)

print("New features created:")
new_features = set(engineered_data.columns) - set(data.columns)
print(list(new_features))

print("\nSample of engineered data:")
display(engineered_data.head())

[2025-02-24 02:27:52] |     INFO | [feature_engineering.py:  32] | feature_engineering | FeatureEngineer initialized with configuration
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py:  44] | feature_engineering | Starting feature engineering transformation
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py:  72] | feature_engineering | Created BMI category features
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py:  83] | feature_engineering | Created age-related features
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py:  95] | feature_engineering | Created medical risk score
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py: 113] | feature_engineering | Created metabolic score
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py: 143] | feature_engineering | Created lifestyle score
[2025-02-24 02:27:52] |     INFO | [feature_engineering.py: 162] | feature_engineering | Created interaction features
[2025-02-24 02:27:52] |     INFO |

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,bmi_category,age_risk,age_bmi_interaction,medical_risk_score,metabolic_score,smoking_risk,lifestyle_score,age_hypertension,age_heart_disease,cardio_metabolic_risk,combined_risk_score
0,Female,80.0,0,1,never,25.19,6.6,140,0,0,1,20.152,0.538462,0.4,0.0,0.0,0.0,80.0,0.0,0.375385
1,Female,54.0,0,0,No Info,27.32,6.6,80,0,0,1,14.7528,0.230769,0.4,0.5,0.3,0.0,0.0,0.0,0.312308
2,Male,28.0,0,0,never,27.32,5.7,158,0,0,0,7.6496,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.16
3,Female,36.0,0,0,current,23.45,5.0,155,0,1,0,8.442,0.0,0.4,1.0,0.6,0.0,0.0,0.0,0.28
4,Male,76.0,1,1,current,20.14,4.8,155,0,1,1,15.3064,0.846154,0.4,1.0,0.6,76.0,76.0,0.4,0.618462


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Step 2: Initialize Data Processor</h2>
    <p style="color: #CCCCCC;">After feature engineering, we can process and split the data.</p>
</div>

In [5]:
# Define features for processing
numerical_features = [
    'bmi', 
    'HbA1c_level',
    'blood_glucose_level',
    'age_bmi_interaction',
    'medical_risk_score',
    'metabolic_score',
    'lifestyle_score',
    'cardio_metabolic_risk',
    'combined_risk_score'
]

categorical_features = [
    'gender',
    'smoking_history',
    'bmi_category',
    'age_risk'  # Assuming this is a categorized version of age risk
]

binary_features = [
    'hypertension',
    'heart_disease',
    'smoking_risk',
    'age_hypertension',  # Assuming this is a binary interaction
    'age_heart_disease'  # Assuming this is a binary interaction
]

processor_config = {
    'continuous_features': numerical_features,
    'categorical_features': categorical_features,
    'binary_features': binary_features,
    'target_column': 'diabetes',
    'categorical_encoding': 'label',
    'outlier_threshold': 3.0
}

processor = DataProcessor(config=processor_config)

[2025-02-24 02:27:52] |     INFO | [data_processing.py:  38] | data_processing | DataProcessor initialized with configuration


In [6]:
# Process the data - this will handle categorical encoding, scaling, and train/test split
X_train, X_test, y_train, y_test = processor.process_data(engineered_data)

print("Shapes after processing:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

print("\nProcessed feature names:")
display(X_train.columns.tolist())

print("\nSample of processed training data:")
display(X_train.head())

[2025-02-24 02:27:52] |     INFO | [data_processing.py: 241] | data_processing | Starting data processing on data with shape: (100000, 20)
[2025-02-24 02:27:52] |     INFO | [data_processing.py:  74] | data_processing | No missing values found in the dataset
[2025-02-24 02:27:52] |     INFO | [data_processing.py:  85] | data_processing | Found 3854 duplicate rows (3.85% of the dataset)
[2025-02-24 02:27:52] |     INFO | [data_processing.py:  87] | data_processing | Removed 3854 duplicate rows, new shape: (96146, 20)
[2025-02-24 02:27:52] |     INFO | [data_processing.py: 108] | data_processing | Found 958 outliers in bmi
[2025-02-24 02:27:52] |     INFO | [data_processing.py: 119] | data_processing | Capped outliers in bmi to range: [13.70, 39.54]
[2025-02-24 02:27:52] |     INFO | [data_processing.py: 108] | data_processing | Found 1041 outliers in HbA1c_level
[2025-02-24 02:27:52] |     INFO | [data_processing.py: 119] | data_processing | Capped outliers in HbA1c_level to range: [2.7

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'smoking_history',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level',
 'bmi_category',
 'age_risk',
 'age_bmi_interaction',
 'medical_risk_score',
 'metabolic_score',
 'smoking_risk',
 'lifestyle_score',
 'age_hypertension',
 'age_heart_disease',
 'cardio_metabolic_risk',
 'combined_risk_score']


Sample of processed training data:


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,bmi_category,age_risk,age_bmi_interaction,medical_risk_score,metabolic_score,smoking_risk,lifestyle_score,age_hypertension,age_heart_disease,cardio_metabolic_risk,combined_risk_score
76484,1,0.057143,0,0,0,-0.532973,0.169283,-0.237288,1,1,10.7415,0.230769,0.0,0.5,0.3,0.0,0.0,0.0,0.152308
31572,0,0.6,0,0,3,1.013727,-1.45144,-0.237288,2,1,21.1392,0.384615,0.2,0.7,0.82,0.0,0.0,0.0,0.397846
92017,0,0.057143,0,0,0,0.049573,1.027314,-0.237288,0,1,12.294,0.230769,0.4,0.5,0.3,0.0,0.0,0.0,0.312308
31616,1,0.028571,1,0,0,0.309607,-0.688747,-1.016949,0,1,12.6984,0.538462,0.0,0.5,0.3,44.0,0.0,0.0,0.275385
94106,0,-0.457143,0,0,5,-0.092265,0.931977,1.016949,0,0,7.1496,0.0,0.8,0.5,0.3,0.0,0.0,0.0,0.38


In [7]:
y_test.shape

(19230,)

In [8]:
X_test.shape

(19230, 19)

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔄 Step 3: Handle Class Imbalance</h2>
    <p style="color: #CCCCCC;">Apply SMOTE only to training data after processing.</p>
</div>

In [9]:
imbalance_handler = ImbalanceHandler(strategy='smote')

# Handle imbalance in the training data only
X_train_balanced, y_train_balanced = imbalance_handler.resample(X_train, y_train)

print("Class distribution before balancing:")
display(pd.Series(y_train).value_counts(normalize=True))

print("\nClass distribution after balancing:")
display(pd.Series(y_train_balanced).value_counts(normalize=True))

[2025-02-24 02:27:53] |     INFO | [imbalance_handler.py:  51] | imbalance_handler | Resampled dataset with strategy: smote.Previous shape (76916, 19) Vs New shape: (140260, 19)
Class distribution before balancing:


diabetes
0    0.911774
1    0.088226
Name: proportion, dtype: float64


Class distribution after balancing:


diabetes
0    0.5
1    0.5
Name: proportion, dtype: float64

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">💾 Step 4: Version the Processed Data</h2>
</div>

In [10]:
# Prepare final dataset with split information
X_train_balanced['diabetes'] = y_train_balanced
X_test['diabetes'] = y_test
X_train_balanced['split'] = 'train'
X_test['split'] = 'test'

processed_data = pd.concat([X_train_balanced, X_test], axis=0, ignore_index=True)

# Version the data
versioner = DataVersioner(base_dir='../data')

version_id = versioner.version_dataset(
    data=processed_data,
    dataset_name='diabetes_processed',
    dataset_description='Processed diabetes dataset with engineered features, encoded categories, and balanced classes'
)

print("Available versions:")
display(versioner.list_versions('diabetes_processed'))

[2025-02-24 02:27:54] |     INFO | [data_versioning.py:  29] | data_versioning | Creating Mlflow experiment: diabetes_classification
[2025-02-24 02:27:57] |     INFO | [data_versioning.py:  70] | data_versioning | Dataset saved to: ..\data\versions\diabetes_processed_2025_02_24_02_27_54\diabetes_processed.csv
Available versions:
[2025-02-24 02:27:58] |     INFO | [data_versioning.py: 170] | data_versioning | Found 1 versions for dataset: diabetes_processed


Unnamed: 0,version_id,mlflow_run_id,status,dataset_name,timestamp,description,rows,columns,local_path
0,2025_02_24_02_27_54,98e43b7b7e224fd6b87c07736b92f509,FINISHED,diabetes_processed,2025-02-24T02:27:57.928326,Processed diabetes dataset with engineered fea...,159490,21,..\data\versions\diabetes_processed_2025_02_24...


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📊 Final Validation</h2>
</div>

In [11]:
validation_results = {
    'missing_values': processed_data.isnull().sum().sum(),
    'duplicates': processed_data.duplicated().sum(),
    'numerical_stats': {
        feature: {
            'mean': processed_data[feature].mean(),
            'std': processed_data[feature].std()
        } for feature in numerical_features
    },
    'encoded_categorical_features': [
        col for col in processed_data.columns 
        if any(cat in col for cat in categorical_features)
    ],
    'class_distribution': processed_data['diabetes'].value_counts(normalize=True).to_dict(),
    'train_test_split': processed_data['split'].value_counts().to_dict()
}

print("Validation Results:")
display(validation_results)

Validation Results:


{'missing_values': 0,
 'duplicates': 76,
 'numerical_stats': {'bmi': {'mean': 0.27074257316353956,
   'std': 1.023137673339994},
  'HbA1c_level': {'mean': 0.4900635144740955, 'std': 1.1080327432072363},
  'blood_glucose_level': {'mean': 0.2813767101702315,
   'std': 0.8022566962962743},
  'age_bmi_interaction': {'mean': 14.86140871747302, 'std': 7.476309508700388},
  'medical_risk_score': {'mean': 0.2726919803756778,
   'std': 0.1893219734085856},
  'metabolic_score': {'mean': 0.5983648940645614, 'std': 0.30763365238977614},
  'lifestyle_score': {'mean': 0.39931568528418504, 'std': 0.27940692604670253},
  'cardio_metabolic_risk': {'mean': 0.0, 'std': 0.0},
  'combined_risk_score': {'mean': 0.43472346726943967,
   'std': 0.20154447423157298}},
 'encoded_categorical_features': ['gender',
  'smoking_history',
  'bmi_category',
  'age_risk'],
 'class_distribution': {0: 0.5496520158003637, 1: 0.45034798419963634},
 'train_test_split': {'train': 140260, 'test': 19230}}