<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #FFFFFF; text-align: center; margin: 0;">🔄 Data Processing and Feature Engineering Pipeline</h1>
    <p style="color: #CCCCCC; text-align: center; margin-top: 10px;">Preprocessing, Feature Engineering & Data Versioning</p>
</div>

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📚 Import Required Libraries</h2>
</div>

In [13]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import sys
warnings.filterwarnings('ignore')

sys.path.append(str(Path.cwd().parent))

from src.data.data_processing import DataProcessor
from src.data.data_versioning import DataVersioner
from src.features.feature_engineering import FeatureEngineer
from src.training.imbalance_handler import ImbalanceHandler

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📥 Load Data</h2>
</div>

In [None]:
# Load the diabetes dataset
# The path is relative, so it is resolved against the kernel's working
# directory; the '../data' prefix expects the notebook to run from a folder
# that sits next to data/.
data_path = Path('../data/extracted/diabetes_data/diabetes_012_health_indicators_BRFSS2015.csv')
data = pd.read_csv(data_path)

# Sanity check: report the frame's dimensions and preview its first rows.
print(f"Loaded data shape: {data.shape}")
display(data.head())

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔍 Initial Data Analysis</h2>
</div>

In [None]:
# Data-quality overview of the raw frame, shown in three parts:
#   1. null count per column,
#   2. number of rows that are exact duplicates of an earlier row,
#   3. summary statistics for the three features inspected throughout
#      this notebook.
print("\nMissing Values:")
display(data.isna().sum())

print("\nDuplicate Rows:", data.duplicated().sum())

print("\nStatistics for Key Features:")
display(data.loc[:, ['BMI', 'MentHlth', 'PhysHlth']].describe())

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Feature Engineering</h2>
</div>

In [None]:
# Build the engineered feature set from the raw frame with the project's
# FeatureEngineer, keeping the result under its own name (fe_dataset).
# NOTE(review): which columns transform() adds or changes, and whether it
# modifies `data` in place, is decided in src/features/feature_engineering.py
# and cannot be seen from this notebook — confirm there.
fe = FeatureEngineer()
fe_dataset = fe.transform(data)
# Preview the first rows of the engineered frame.
fe_dataset.head()


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Initialize Data Processor</h2>
</div>

In [None]:
# Initialize the DataProcessor with configuration
# NOTE(review): how each key is interpreted is defined by DataProcessor
# (src/data/data_processing.py), which is not visible in this notebook — the
# per-key notes below are assumptions to confirm against that class.
processor_config = {
    'health_features': ['MentHlth', 'PhysHlth'],  # column names present in the loaded dataset
    'bmi_feature': 'BMI',  # column name present in the loaded dataset
    'target_column': 'Diabetes_012',  # same name is re-attached as the label before versioning
    'outlier_threshold': 3.0  # presumably a z-score-style cutoff — TODO confirm
}

processor = DataProcessor(config=processor_config)

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔄 Process Data</h2>
</div>

In [None]:
# Run the project's preprocessing pipeline on the engineered frame. The call
# returns four objects, unpacked here as train/test features and train/test
# targets.
X_train, X_test, y_train, y_test = processor.process_data(fe_dataset)

# The three features summarised for each split (the same ones inspected on
# the raw data earlier in the notebook).
summary_columns = ['BMI', 'MentHlth', 'PhysHlth']

# Summary statistics of those features in the training split.
print("\nTraining Data Statistics:")
display(X_train.loc[:, summary_columns].describe())

# Summary statistics of those features in the test split.
print("\nTest Data Statistics:")
display(X_test.loc[:, summary_columns].describe())

# Report the dimensions of the input frame and of each resulting split.
print(f"\nOriginal data shape: {fe_dataset.shape}")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔄 Handle the Imbalanced Target</h2>
</div>

In [None]:
# Rebalance the target classes using the project's ImbalanceHandler with its
# default settings.
# NOTE(review): the resampling strategy (and whether it is seeded) is defined
# in src/training/imbalance_handler.py; the versioning description later in
# this notebook mentions SMOTE — confirm that matches the handler's default.
imbalance_handler = ImbalanceHandler()

# Handle imbalance in the training data
# Only the training split is passed to resample(); X_test / y_test are not
# touched here.
X_train_balanced, y_train_balanced = imbalance_handler.resample(X_train, y_train)

# Compare shapes
print(f"\nTraining data shape: {X_train.shape}")
print(f"Balanced training data shape: {X_train_balanced.shape}")

# Compare class distributions
# normalize=True reports each class as a fraction of the rows rather than a count.
print("\nClass Distribution in Training Data:")
display(y_train.value_counts(normalize=True))

print("\nClass Distribution in Balanced Training Data:")
display(y_train_balanced.value_counts(normalize=True))


<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">💾 Version the Processed Data</h2>
</div>

In [None]:
# Re-attach the target and tag every row with the split it belongs to, then
# stack the two splits into a single frame for versioning.
#
# The labelled frames are built with .assign(), which returns a new DataFrame,
# instead of writing columns into X_train_balanced / X_test. Writing in place
# would leave the target column ('Diabetes_012') and the 'split' tag inside
# the feature matrices, so any later cell that still treats them as model
# inputs would silently see the label. Keeping the inputs untouched also makes
# this cell safe to re-run in any order.
train_labeled = X_train_balanced.assign(Diabetes_012=y_train_balanced, split='train')
test_labeled = X_test.assign(Diabetes_012=y_test, split='test')

# ignore_index=True discards the per-split row labels and renumbers the
# combined rows from 0; training rows come first, test rows after.
processed_data = pd.concat([train_labeled, test_labeled], axis=0, ignore_index=True)
processed_data.head()

In [None]:
# Relative frequency of each target class over the combined frame. The frame
# stacks the resampled training rows on top of the test rows (which were not
# resampled), so these shares describe neither split on its own.
processed_data.value_counts(['Diabetes_012'], normalize=True)

In [None]:
# Create the versioner, rooted at the project's data directory (relative to
# the kernel's working directory).
versioner = DataVersioner(base_dir='../data')

# Name under which the processed frame is registered; used both to create the
# version and to list existing ones.
processed_dataset_name = 'diabetes_processed_02'

# Note: the raw frame (`data`) is not versioned by this cell. An earlier,
# disabled snippet did so via the same version_dataset() call with
# dataset_name='diabetes_original' and
# dataset_description='Original diabetes dataset'.

# Register the combined train+test frame; the call's return value is kept as
# the version identifier.
version_id_processed = versioner.version_dataset(
    data=processed_data,
    dataset_name=processed_dataset_name,
    dataset_description='Processed diabetes dataset with outlier handling, feature scaling, engineering, and class balancing handling with smote'
)

# Show every version recorded under that dataset name.
print("\nAll available versions:")
display(versioner.list_versions(processed_dataset_name))

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📊 Validation Check</h2>
</div>

In [None]:
# Collect a few sanity metrics on the combined processed frame:
#   - total number of null cells,
#   - number of rows that exactly duplicate an earlier row,
#   - mean and standard deviation of each feature listed below.
# The statistics are computed over training and test rows together.
scaled_columns = ('BMI', 'MentHlth', 'PhysHlth')

validation_results = {
    'missing_values': processed_data.isna().sum().sum(),
    'duplicates': processed_data.duplicated().sum(),
    'scaled_features_stats': {
        column: {
            'mean': processed_data[column].mean(),
            'std': processed_data[column].std(),
        }
        for column in scaled_columns
    },
}

print("Validation Results:")
display(validation_results)