<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #FFFFFF; text-align: center; margin: 0;">🔄 Data Processing Pipeline</h1>
    <p style="color: #CCCCCC; text-align: center; margin-top: 10px;">Preprocessing, Feature Engineering & Data Versioning</p>
</div>

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📚 Import Required Libraries</h2>
</div>

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import sys

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas/numpy. Narrow it to specific categories if
# any of those should stay visible.
warnings.filterwarnings('ignore')

# Put the parent of the working directory on the import path so the `src`
# package resolves. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data.data_processing import DataProcessor
from src.data.data_versioning import DataVersioner


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📥 Load Data</h2>
</div>

In [2]:
# Read the diabetes health-indicators CSV; the location is resolved relative
# to the current working directory.
data_path = Path(
    '../data/extracted/diabetes_data/diabetes_012_health_indicators_BRFSS2015.csv'
)
data = pd.read_csv(data_path)

# Report the frame's (rows, columns) and preview its first rows.
print(f"Loaded data shape: {data.shape}")
display(data.head())

Loaded data shape: (253680, 22)


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔍 Initial Data Analysis</h2>
</div>

In [3]:
# Null count for every column of the raw frame.
missing_per_column = data.isnull().sum()
print("\nMissing Values:")
display(missing_per_column)

# Number of rows that exactly repeat an earlier row.
duplicate_row_count = data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_row_count)

# Summary statistics for the three numeric columns inspected in this notebook.
key_feature_summary = data[['BMI', 'MentHlth', 'PhysHlth']].describe()
print("\nStatistics for Key Features:")
display(key_feature_summary)


Missing Values:


Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


Duplicate Rows: 23899

Statistics for Key Features:


Unnamed: 0,BMI,MentHlth,PhysHlth
count,253680.0,253680.0,253680.0
mean,28.382364,3.184772,4.242081
std,6.608694,7.412847,8.717951
min,12.0,0.0,0.0
25%,24.0,0.0,0.0
50%,27.0,0.0,0.0
75%,31.0,2.0,3.0
max,98.0,30.0,30.0


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Initialize Data Processor</h2>
</div>

In [4]:
# Initialize the DataProcessor with configuration.
# Every column name used here appears in the loaded frame.
# NOTE(review): how DataProcessor interprets each key is not visible in this
# notebook; the per-key remarks below are assumptions to confirm against
# src/data/data_processing.py.
processor_config = {
    'health_features': ['MentHlth', 'PhysHlth'],  # presumably the columns that get StandardScaler — TODO confirm
    'bmi_feature': 'BMI',  # presumably the column that gets RobustScaler — TODO confirm
    'target_column': 'Diabetes_012',  # presumably the label column — TODO confirm
    'outlier_threshold': 3.0  # NOTE(review): unit unknown (z-score vs. IQR multiplier); a recorded run capped MentHlth to [-3.00, 5.00] — confirm this value is what drives that range
}

# Build the processor used by the processing cell; the dict is passed via the
# `config` keyword.
processor = DataProcessor(config=processor_config)

[2025-02-22 22:41:32] |     INFO | [data_processing.py:  32] | data_processing | DataProcessor initialized with configuration


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔄 Process Data</h2>
</div>

In [5]:
# Run the raw frame through the configured processor; the result is kept
# under its own name so `data` remains available for comparison.
processed_data = processor.process_data(data)

# Summary statistics of the three inspected columns after processing.
processed_summary = processed_data[['BMI', 'MentHlth', 'PhysHlth']].describe()
print("\nProcessed Data Statistics:")
display(processed_summary)

# Show the dimensions before and after processing side by side.
print(f"\nOriginal data shape: {data.shape}")
print(f"Processed data shape: {processed_data.shape}")

[2025-02-22 22:41:34] |     INFO | [data_processing.py: 192] | data_processing | Starting data processing on data with shape: (253680, 22)
[2025-02-22 22:41:34] |     INFO | [data_processing.py:  70] | data_processing | No missing values found in the dataset
[2025-02-22 22:41:34] |     INFO | [data_processing.py:  88] | data_processing | Found 23899 duplicate rows (9.42% of the dataset)
[2025-02-22 22:41:34] |     INFO | [data_processing.py:  92] | data_processing | Removed 23899 duplicate rows, new shape: (229781, 22)
[2025-02-22 22:41:34] |     INFO | [data_processing.py: 124] | data_processing | Found 12644 outliers in MentHlth
[2025-02-22 22:41:34] |     INFO | [data_processing.py: 135] | data_processing | Capped outliers in MentHlth to range: [-3.00, 5.00]
[2025-02-22 22:41:35] |     INFO | [data_processing.py: 163] | data_processing | Applied RobustScaler to BMI
[2025-02-22 22:41:35] |     INFO | [data_processing.py: 173] | data_processing | Applied StandardScaler to health featu

Unnamed: 0,BMI,MentHlth,PhysHlth
count,229367.0,229367.0,229367.0
mean,0.211609,-0.003329,0.000874
std,0.848562,0.997827,1.000639
min,-1.875,-0.640299,-0.516791
25%,-0.375,-0.640299,-0.516791
50%,0.0,-0.640299,-0.516791
75%,0.625,0.353528,-0.074634
max,8.875,1.844268,2.799391



Original data shape: (253680, 22)
Processed data shape: (229367, 22)


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">💾 Version the Original and Processed Data</h2>
</div>

In [None]:
# Initialize the DataVersioner
versioner = DataVersioner()

# Version the original (unprocessed) dataset
version_id_original = versioner.version_dataset(
    data=data,
    dataset_name='diabetes_original',
    dataset_description='Original diabetes dataset'
)

# Version the processed dataset
version_id_processed = versioner.version_dataset(
    data=processed_data,
    dataset_name='diabetes_processed',
    dataset_description='Processed diabetes dataset with outlier handling and feature scaling'
)

# Label each ID with the dataset it belongs to, in the order they were
# created, so the two lines of output cannot be confused with each other.
print(f"\nOriginal dataset versioned with ID: {version_id_original}")
print(f"Processed dataset versioned with ID: {version_id_processed}")

# List the recorded versions of both datasets that were just versioned.
for versioned_name in ('diabetes_original', 'diabetes_processed'):
    print(f"\nAll available versions of '{versioned_name}':")
    display(versioner.list_versions(versioned_name))

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📊 Validation Check</h2>
</div>

In [6]:
# Perform validation checks on the processed data.
# Columns whose mean / standard deviation are reported below.
scaled_features = ['BMI', 'MentHlth', 'PhysHlth']

validation_results = {
    # Total number of null cells across the whole frame.
    'missing_values': processed_data.isnull().sum().sum(),
    # Number of rows that exactly repeat an earlier row.
    'duplicates': processed_data.duplicated().sum(),
    # One {'mean', 'std'} entry per column, built in a single place instead of
    # repeating the same two lines for every column.
    'scaled_features_stats': {
        feature: {
            'mean': processed_data[feature].mean(),
            'std': processed_data[feature].std()
        }
        for feature in scaled_features
    }
}

print("Validation Results:")
display(validation_results)

# Fail loudly if the processed frame violates the basic invariants. The
# checks run after the display so the numbers are still visible on failure.
assert validation_results['missing_values'] == 0, (
    f"Processed data contains {validation_results['missing_values']} missing values"
)
assert validation_results['duplicates'] == 0, (
    f"Processed data contains {validation_results['duplicates']} duplicate rows"
)

Validation Results:


{'missing_values': np.int64(0),
 'duplicates': np.int64(0),
 'scaled_features_stats': {'BMI': {'mean': np.float64(0.21160901524630832),
   'std': np.float64(0.8485621807015986)},
  'MentHlth': {'mean': np.float64(-0.003328843565465817),
   'std': np.float64(0.9978271134531762)},
  'PhysHlth': {'mean': np.float64(0.0008739958628074722),
   'std': np.float64(1.000639257896516)}}}