<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: #FFFFFF; text-align: center; margin: 0;">🔄 Data Processing Pipeline</h1>
    <p style="color: #CCCCCC; text-align: center; margin-top: 10px;">Preprocessing, Feature Engineering & Data Versioning</p>
</div>

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📚 Import Required Libraries</h2>
</div>

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import sys

# Silence every warning category from this point on to keep cell output
# uncluttered. The filter is installed *before* the project imports below, so
# warnings raised while those modules load are hidden as well.
# NOTE(review): a blanket 'ignore' also hides deprecation/dtype warnings that
# may matter — consider narrowing it by category or module.
warnings.filterwarnings('ignore')
# Project-local modules. They resolve only when the directory containing `src/`
# is importable — presumably the kernel is started from the project root; confirm.
from src.data.data_processing import DataProcessor
from src.data.data_versioning import DataVersioner
from src.utils.logging_config import setup_logger

# Create the notebook's logger through the project's logging helper.
logger = setup_logger('data_processing_notebook')

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📥 Load Data</h2>
</div>

In [None]:
# Read the diabetes health-indicators CSV into a DataFrame.
# The location is relative, so it is resolved against the kernel's working directory.
data_path = Path('data/extracted/diabetes_data/diabetes_012_health_indicators_BRFSS2015.csv')
data = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: report the (rows, columns) tuple, then preview the first five rows.
print(f"Loaded data shape: {data.shape}")
display(data.head(n=5))

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔍 Initial Data Analysis</h2>
</div>

In [None]:
# Data-quality overview of the raw frame: null counts, duplicates, key-feature summary.

# Number of missing entries in each column.
print("\nMissing Values:")
missing_per_column = data.isna().sum()
display(missing_per_column)

# Number of rows that exactly repeat an earlier row.
duplicate_row_count = data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_row_count)

# Descriptive statistics for the BMI, MentHlth and PhysHlth columns.
print("\nStatistics for Key Features:")
key_feature_summary = data.loc[:, ['BMI', 'MentHlth', 'PhysHlth']].describe()
display(key_feature_summary)

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">⚙️ Initialize Data Processor</h2>
</div>

In [None]:
# Settings handed to DataProcessor. How each key is interpreted is decided
# inside src.data.data_processing, not in this notebook.
processor_config = dict(
    health_features=['MentHlth', 'PhysHlth'],
    bmi_feature='BMI',
    target_column='Diabetes_012',
    outlier_threshold=3.0,  # presumably a z-score cut-off — confirm against DataProcessor
)

# Build the processor from the configuration above.
processor = DataProcessor(config=processor_config)

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">🔄 Process Data</h2>
</div>

In [None]:
# Run the raw frame through the DataProcessor pipeline.
# NOTE(review): if process_data modifies its argument in place, the
# "Original data shape" printed below would no longer describe the raw
# input — confirm against the DataProcessor implementation.
processed_data = processor.process_data(data)

# Summary statistics of the BMI, MentHlth and PhysHlth columns after processing.
print("\nProcessed Data Statistics:")
processed_summary = processed_data.loc[:, ['BMI', 'MentHlth', 'PhysHlth']].describe()
display(processed_summary)

# Put the before/after dimensions side by side to show how the shape changed.
print(f"\nOriginal data shape: {data.shape}")
print(f"Processed data shape: {processed_data.shape}")

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">💾 Version the Processed Data</h2>
</div>

In [None]:
# Create the versioning helper with its default settings.
versioner = DataVersioner()

# Register the processed frame under the name 'diabetes_processed' and keep
# the identifier that version_dataset returns.
# NOTE(review): re-running this cell presumably records an additional
# version each time — confirm against DataVersioner.
version_id = versioner.version_dataset(
    dataset_name='diabetes_processed',
    dataset_description='Processed diabetes dataset with outlier handling and feature scaling',
    data=processed_data,
)

print(f"\nDataset versioned with ID: {version_id}")

# Show every version recorded for this dataset name.
print("\nAll available versions:")
all_versions = versioner.list_versions('diabetes_processed')
display(all_versions)

<div style="background-color: #2E2E2B; padding: 15px; border-radius: 8px; margin: 10px 0;">
    <h2 style="color: #FFFFFF; margin-top: 0;">📊 Validation Check</h2>
</div>

In [None]:
# Summarise data-quality indicators for the processed frame:
#   - 'missing_values': total number of null cells across all columns
#   - 'duplicates': number of rows that exactly repeat an earlier row
#   - 'scaled_features_stats': mean / standard deviation of each key feature
# NOTE(review): these are reported, not asserted. If the processor is meant to
# standardise the features, means near 0 and stds near 1 would be the expected
# reading — confirm against DataProcessor.
#
# Every value is converted to a built-in int/float: pandas returns NumPy
# scalars, which display as `np.float64(...)` / `np.int64(...)` inside a dict
# under NumPy >= 2 and make the summary hard to read.
validation_results = {
    'missing_values': int(processed_data.isnull().sum().sum()),
    'duplicates': int(processed_data.duplicated().sum()),
    'scaled_features_stats': {
        # One {'mean', 'std'} entry per feature, in the same order as before.
        feature: {
            'mean': float(processed_data[feature].mean()),
            'std': float(processed_data[feature].std()),
        }
        for feature in ('BMI', 'MentHlth', 'PhysHlth')
    },
}

print("Validation Results:")
display(validation_results)