# Enhanced EDA & Preprocessing Pipeline for Diabetes Dataset

This notebook is a direct conversion of `enhanced_eda_preprocess.py` into notebook form.
Sections: imports, utility functions, enhanced EDA, visualization generation, outlier handling,
enhanced imputation, feature engineering, high-cardinality handling, full preprocessing pipeline, and a `main` runner cell.

Use the cells below interactively.

In [2]:
# Enhanced EDA & Preprocessing Pipeline for Diabetes Dataset
# Addresses gaps: year column handling, feature engineering, outlier detection,
# advanced imputation, comprehensive EDA, and high-dimensionality issues

import argparse
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set matplotlib backend for compatibility (non-interactive)
import matplotlib
matplotlib.use('Agg')

In [3]:
# ---------- Utility Functions ----------
def ensure_dir(p: str):
    """Create directory if it doesn't exist"""
    os.makedirs(p, exist_ok=True)

def is_binary_like(s: pd.Series) -> bool:
    """Check if series contains binary-like values"""
    vals = s.dropna().unique()
    if len(vals) == 2:
        return True
    lowered = pd.Series(vals).astype(str).str.lower().unique()
    return set(lowered).issubset({"yes","no","true","false","positive","negative","pos","neg","y","n","1","0"})

def guess_target(df: pd.DataFrame):
    """Automatically detect target column"""
    common = [
        "Outcome","outcome","target","Target","label","Label","class","Class",
        "diabetes","Diabetes","has_diabetes","diabetic","Diabetic"
    ]
    for c in common:
        if c in df.columns:
            return c
    return None

In [4]:
# ---------- Enhanced EDA Functions ----------
def comprehensive_eda(df: pd.DataFrame, reports_dir: str, target_col: str = None):
    """Enhanced EDA with comprehensive analysis"""
    ensure_dir(reports_dir)
    
    print("🔍 Running Comprehensive EDA...")
    
    # 1. Basic Dataset Info
    basic_info = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
        'duplicated_rows': df.duplicated().sum()
    }
    
    # 2. Column types and info
    dtypes_df = df.dtypes.astype(str).rename("dtype").reset_index().rename(columns={"index":"column"})
    dtypes_df['unique_values'] = [df[col].nunique() for col in df.columns]
    dtypes_df['null_count'] = [df[col].isnull().sum() for col in df.columns]
    dtypes_df['null_percentage'] = dtypes_df['null_count'].apply(lambda x: round(x / len(df) * 100, 2))
    dtypes_df.to_csv(os.path.join(reports_dir, "01_enhanced_dtypes.csv"), index=False)
    
    # 3. Missing values analysis
    miss_analysis = df.isnull().sum().reset_index()
    miss_analysis.columns = ['column', 'missing_count']
    miss_analysis['missing_pct'] = miss_analysis['missing_count'].apply(lambda x: round(x / len(df) * 100, 2))
    miss_analysis = miss_analysis.sort_values('missing_pct', ascending=False)
    miss_analysis.to_csv(os.path.join(reports_dir, "02_enhanced_missing_values.csv"), index=False)
    
    # 4. Identify column types
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    # Remove target from feature lists if specified
    if target_col and target_col in numeric_cols:
        numeric_cols.remove(target_col)
    if target_col and target_col in categorical_cols:
        categorical_cols.remove(target_col)
    
    # 5. Enhanced numeric analysis
    if numeric_cols:
        numeric_stats = df[numeric_cols].describe()
        
        # Add additional statistics
        numeric_enhanced = numeric_stats.copy()
        for col in numeric_cols:
            data = df[col].dropna()
            numeric_enhanced.loc['skewness', col] = stats.skew(data)
            numeric_enhanced.loc['kurtosis', col] = stats.kurtosis(data)
            numeric_enhanced.loc['cv', col] = data.std() / data.mean() if data.mean() != 0 else 0
        
        numeric_enhanced.round(4).to_csv(os.path.join(reports_dir, "03_enhanced_numeric_analysis.csv"))
    
    # 6. Categorical analysis
    if categorical_cols:
        cat_analysis = []
        for col in categorical_cols:
            unique_vals = df[col].nunique()
            top_category = df[col].mode()[0] if not df[col].mode().empty else 'No Mode'
            top_frequency = df[col].value_counts().iloc[0] if unique_vals > 0 else 0
            
            cat_analysis.append({
                'column': col,
                'unique_categories': unique_vals,
                'top_category': top_category,
                'top_frequency': top_frequency,
                'top_percentage': round(top_frequency / len(df) * 100, 2)
            })
        
        cat_df = pd.DataFrame(cat_analysis)
        cat_df.to_csv(os.path.join(reports_dir, "04_categorical_analysis.csv"), index=False)
    
    # 7. Target distribution (if target specified)
    if target_col and target_col in df.columns:
        target_dist = df[target_col].value_counts().reset_index()
        target_dist.columns = [target_col, 'count']
        target_dist['percentage'] = target_dist['count'].apply(lambda x: round(x / len(df) * 100, 2))
        target_dist.to_csv(os.path.join(reports_dir, "05_target_distribution.csv"), index=False)
    
    # 8. Outlier detection for numeric columns
    outlier_analysis = []
    for col in numeric_cols:
        data = df[col].dropna()
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers = data[(data < lower_bound) | (data > upper_bound)]
        
        outlier_analysis.append({
            'column': col,
            'outlier_count': len(outliers),
            'outlier_percentage': round(len(outliers) / len(data) * 100, 2),
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        })
    
    outlier_df = pd.DataFrame(outlier_analysis)
    outlier_df.to_csv(os.path.join(reports_dir, "06_outlier_analysis.csv"), index=False)
    
    # 9. Correlation analysis (numeric columns only)
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        
        # Save correlation matrix
        corr_matrix.round(3).to_csv(os.path.join(reports_dir, "07_correlation_matrix.csv"))
        
        # Create correlation heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
                    square=True, linewidths=0.5)
        plt.title('Feature Correlation Heatmap')
        plt.tight_layout()
        plt.savefig(os.path.join(reports_dir, "correlation_heatmap.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 10. Generate comprehensive visualizations
    generate_enhanced_visualizations(df, reports_dir, numeric_cols, categorical_cols, target_col)
    
    print(f"✅ Enhanced EDA completed. Reports saved to: {reports_dir}")
    return basic_info

In [5]:
def generate_enhanced_visualizations(df, reports_dir, numeric_cols, categorical_cols, target_col):
    """Generate comprehensive visualizations for all columns"""
    
    # Create visualizations subdirectory
    viz_dir = os.path.join(reports_dir, "visualizations")
    ensure_dir(viz_dir)
    
    # 1. Numeric columns - Histograms and Box plots
    for col in numeric_cols:
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        
        # Histogram
        df[col].hist(bins=50, ax=axes[0], alpha=0.7, edgecolor='black')
        axes[0].set_title(f'Histogram: {col}')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        
        # Box plot
        df.boxplot(column=col, ax=axes[1])
        axes[1].set_title(f'Box Plot: {col}')
        axes[1].set_ylabel(col)
        
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, f"numeric_{col}.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 2. Categorical columns - Bar plots
    for col in categorical_cols:
        plt.figure(figsize=(12, 6))
        
        # Get value counts
        value_counts = df[col].value_counts()
        
        # Limit to top 20 categories if too many
        if len(value_counts) > 20:
            value_counts = value_counts.head(20)
            title_suffix = " (Top 20)"
        else:
            title_suffix = ""
        
        # Create bar plot
        ax = value_counts.plot(kind='bar', color='skyblue', edgecolor='black')
        plt.title(f'Distribution: {col}{title_suffix}')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        
        # Add value labels on bars
        for i, v in enumerate(value_counts.values):
            ax.text(i, v + max(value_counts.values) * 0.01, str(v), 
                   ha='center', va='bottom', fontsize=9)
        
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, f"categorical_{col}.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 3. Target vs Features analysis (if target specified)
    if target_col and target_col in df.columns:
        target_viz_dir = os.path.join(viz_dir, "target_analysis")
        ensure_dir(target_viz_dir)
        
        # Numeric features vs target
        for col in numeric_cols:
            plt.figure(figsize=(12, 5))
            
            # Create subplots
            fig, axes = plt.subplots(1, 2, figsize=(15, 5))
            
            # Box plot by target
            df.boxplot(column=col, by=target_col, ax=axes[0])
            axes[0].set_title(f'{col} by {target_col}')
            
            # Histogram by target
            for target_val in df[target_col].unique():
                subset = df[df[target_col] == target_val][col]
                axes[1].hist(subset, alpha=0.7, label=f'{target_col}={target_val}', bins=30)
            
            axes[1].set_title(f'{col} Distribution by {target_col}')
            axes[1].set_xlabel(col)
            axes[1].set_ylabel('Frequency')
            axes[1].legend()
            
            plt.tight_layout()
            plt.savefig(os.path.join(target_viz_dir, f"target_vs_{col}.png"), dpi=300, bbox_inches='tight')
            plt.close()

In [6]:
# ---------- Enhanced Preprocessing Functions ----------
def detect_outliers_iqr(series, multiplier=1.5):
    """Detect outliers using IQR method"""
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return (series < lower_bound) | (series > upper_bound)

def handle_outliers(df, numeric_cols, method='cap', multiplier=1.5):
    """Handle outliers in numeric columns"""
    df_clean = df.copy()
    outlier_info = {}
    
    for col in numeric_cols:
        outliers = detect_outliers_iqr(df_clean[col], multiplier)
        outlier_count = outliers.sum()
        
        if outlier_count > 0:
            if method == 'cap':
                # Cap outliers
                Q1 = df_clean[col].quantile(0.25)
                Q3 = df_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - multiplier * IQR
                upper_bound = Q3 + multiplier * IQR
                
                df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
                df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
                
            elif method == 'remove':
                # Remove outliers (not recommended for large datasets)
                df_clean = df_clean[~outliers]
        
        outlier_info[col] = {
            'outlier_count': outlier_count,
            'outlier_percentage': round(outlier_count / len(df) * 100, 2),
            'method_applied': method if outlier_count > 0 else 'none'
        }
    
    return df_clean, outlier_info

In [7]:
def enhanced_imputation(df, numeric_cols, categorical_cols, target_col=None):
    """Enhanced imputation strategies for different column types"""
    df_imputed = df.copy()
    imputation_info = {}
    
    # Medical/Health-specific imputation logic
    medical_features = ['bmi', 'hbA1c_level', 'blood_glucose_level', 'sleep_hours']
    
    # Numeric imputation
    for col in numeric_cols:
        missing_count = df_imputed[col].isnull().sum()
        
        if missing_count > 0:
            if col in medical_features:
                # For medical features, use median within similar groups if possible
                if target_col and target_col in df.columns:
                    # Group by target and use median
                    df_imputed[col] = df_imputed.groupby(target_col)[col].transform(
                        lambda x: x.fillna(x.median()) if not x.median() != x.median() else x.fillna(df_imputed[col].median())
                    )
                else:
                    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
                imputation_method = 'group_median' if target_col else 'median'
            else:
                # Regular median imputation for other numeric
                df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
                imputation_method = 'median'
            
            imputation_info[col] = {
                'missing_count': missing_count,
                'imputation_method': imputation_method
            }
    
    # Categorical imputation
    for col in categorical_cols:
        missing_count = df_imputed[col].isnull().sum()
        
        if missing_count > 0:
            # Use mode or 'Unknown' if no mode exists
            mode_val = df_imputed[col].mode()
            if len(mode_val) > 0:
                df_imputed[col] = df_imputed[col].fillna(mode_val[0])
                imputation_method = 'mode'
            else:
                df_imputed[col] = df_imputed[col].fillna('Unknown')
                imputation_method = 'unknown'
            
            imputation_info[col] = {
                'missing_count': missing_count,
                'imputation_method': imputation_method
            }
    
    return df_imputed, imputation_info

In [8]:
def feature_engineering(df, target_col=None):
    """Create additional engineered features"""
    df_engineered = df.copy()
    new_features = []
    
    # 1. BMI-related features
    if 'bmi' in df.columns:
        # BMI risk categories (more detailed)
        df_engineered['bmi_risk_level'] = pd.cut(df_engineered['bmi'], 
                                               bins=[0, 18.5, 25, 30, 35, float('inf')],
                                               labels=['underweight', 'normal', 'overweight', 'obese_1', 'obese_2'])
        new_features.append('bmi_risk_level')
    
    # 2. Age-related features
    if 'age' in df.columns:
        # Age risk for diabetes (medical domain knowledge)
        df_engineered['age_diabetes_risk'] = pd.cut(df_engineered['age'],
                                                  bins=[0, 35, 45, 65, float('inf')],
                                                  labels=['low_risk', 'moderate_risk', 'high_risk', 'very_high_risk'])
        new_features.append('age_diabetes_risk')
    
    # 3. Combined health risk score
    health_indicators = ['hypertension', 'heart_disease', 'family_history']
    available_indicators = [col for col in health_indicators if col in df.columns]
    
    if available_indicators:
        df_engineered['health_risk_score'] = df_engineered[available_indicators].sum(axis=1)
        new_features.append('health_risk_score')
    
    # 4. Lifestyle score
    lifestyle_factors = []
    
    # Physical activity scoring
    if 'physical_activity' in df.columns:
        activity_map = {'low': 0, 'moderate': 1, 'high': 2}
        df_engineered['activity_score'] = df_engineered['physical_activity'].map(activity_map).fillna(0)
        lifestyle_factors.append('activity_score')
        new_features.append('activity_score')
    
    # Sleep quality scoring
    if 'sleep_hours' in df.columns:
        # Optimal sleep is 7-9 hours
        df_engineered['sleep_quality'] = df_engineered['sleep_hours'].apply(
            lambda x: 2 if 7 <= x <= 9 else (1 if 6 <= x <= 10 else 0) if pd.notna(x) else 0
        )
        lifestyle_factors.append('sleep_quality')
        new_features.append('sleep_quality')
    
    # Combined lifestyle score
    if lifestyle_factors:
        df_engineered['lifestyle_score'] = df_engineered[lifestyle_factors].sum(axis=1)
        new_features.append('lifestyle_score')
    
    # 5. Geographic risk (if environmental_risk is available)
    if 'environmental_risk' in df.columns and 'urban_rural' in df.columns:
        # Combine environmental risk with urban/rural
        urban_risk_map = {'urban': 1.1, 'rural': 0.9}  # Urban areas might have higher risk
        df_engineered['location_risk'] = (df_engineered['environmental_risk'] * 
                                        df_engineered['urban_rural'].map(urban_risk_map).fillna(1.0))
        new_features.append('location_risk')
    
    return df_engineered, new_features

In [9]:
def handle_high_cardinality_categorical(df, categorical_cols, target_col=None, max_categories=10):
    """Handle high cardinality categorical variables"""
    df_processed = df.copy()
    encoding_info = {}
    
    for col in categorical_cols:
        unique_count = df_processed[col].nunique()
        
        if unique_count > max_categories:
            # For high cardinality columns like 'location' (states)
            if col == 'location':
                # Group by frequency - keep top states, others as 'Other'
                value_counts = df_processed[col].value_counts()
                top_categories = value_counts.head(max_categories).index.tolist()
                df_processed[col] = df_processed[col].apply(
                    lambda x: x if x in top_categories else 'Other'
                )
                encoding_info[col] = {
                    'method': 'frequency_grouping',
                    'kept_categories': len(top_categories) + 1,  # +1 for 'Other'
                    'original_categories': unique_count
                }
            
            elif target_col and target_col in df.columns:
                # Use target encoding for other high cardinality categorical variables
                # This is more sophisticated than frequency grouping
                target_encoder = TargetEncoder()
                df_processed[f'{col}_target_encoded'] = target_encoder.fit_transform(
                    df_processed[[col]], df_processed[target_col]
                )
                
                # Keep original column and add encoded version
                encoding_info[col] = {
                    'method': 'target_encoding',
                    'new_column': f'{col}_target_encoded',
                    'original_categories': unique_count
                }
            else:
                # Fallback to frequency grouping
                value_counts = df_processed[col].value_counts()
                top_categories = value_counts.head(max_categories).index.tolist()
                df_processed[col] = df_processed[col].apply(
                    lambda x: x if x in top_categories else 'Other'
                )
                encoding_info[col] = {
                    'method': 'frequency_grouping',
                    'kept_categories': len(top_categories) + 1,
                    'original_categories': unique_count
                }
    
    return df_processed, encoding_info

In [10]:
def enhanced_preprocessing(df: pd.DataFrame, outdir: str, target_col: str = None, 
                         handle_outliers_method='cap', use_feature_engineering=True):
    """Enhanced preprocessing pipeline with all improvements"""
    ensure_dir(outdir)
    
    print("🔄 Starting Enhanced Preprocessing Pipeline...")
    
    # 1. Initial cleaning
    print("  📋 Step 1: Basic cleaning...")
    df_clean = df.copy()
    initial_shape = df_clean.shape
    
    # Remove duplicates
    df_clean = df_clean.drop_duplicates().reset_index(drop=True)
    print(f"     Removed {initial_shape[0] - df_clean.shape[0]} duplicate rows")
    
    # 2. Identify column types
    print("  🔍 Step 2: Analyzing column types...")
    numeric_cols = [c for c in df_clean.columns 
                   if pd.api.types.is_numeric_dtype(df_clean[c]) and c != target_col]
    categorical_cols = [c for c in df_clean.columns 
                       if df_clean[c].dtype == "object" and c != target_col]
    
    # Special handling for 'year' column - treat as categorical
    if 'year' in numeric_cols:
        print("     Moving 'year' from numeric to categorical (ordinal treatment)")
        numeric_cols.remove('year')
        categorical_cols.append('year')
        # Convert year to string to treat as categorical
        df_clean['year'] = df_clean['year'].astype(str)
    
    # 3. Enhanced imputation
    print("  🩹 Step 3: Enhanced imputation...")
    df_clean, imputation_info = enhanced_imputation(df_clean, numeric_cols, categorical_cols, target_col)
    
    # Save imputation info
    imputation_df = pd.DataFrame.from_dict(imputation_info, orient='index').reset_index()
    imputation_df.columns = ['column', 'missing_count', 'imputation_method']
    imputation_df.to_csv(os.path.join(outdir, "imputation_report.csv"), index=False)
    
    # 4. Outlier handling for numeric columns
    if numeric_cols and handle_outliers_method != 'none':
        print(f"  🎯 Step 4: Handling outliers using {handle_outliers_method} method...")
        df_clean, outlier_info = handle_outliers(df_clean, numeric_cols, handle_outliers_method)
        
        # Save outlier info
        outlier_df = pd.DataFrame.from_dict(outlier_info, orient='index').reset_index()
        outlier_df.to_csv(os.path.join(outdir, "outlier_treatment_report.csv"), index=False)
    
    # 5. Feature Engineering
    if use_feature_engineering:
        print("  ⚙️  Step 5: Feature engineering...")
        df_clean, new_features = feature_engineering(df_clean, target_col)
        print(f"     Created {len(new_features)} new features: {new_features}")
        
        # Update column lists with new categorical features
        new_categorical = [f for f in new_features if df_clean[f].dtype == 'object']
        categorical_cols.extend(new_categorical)
        
        new_numeric = [f for f in new_features if f not in new_categorical]
        numeric_cols.extend(new_numeric)
    
    # 6. Handle high cardinality categorical variables
    print("  📊 Step 6: Handling high cardinality categorical variables...")
    df_clean, encoding_info = handle_high_cardinality_categorical(
        df_clean, categorical_cols, target_col, max_categories=15
    )
    
    # Save encoding info
    if encoding_info:
        encoding_df = pd.DataFrame.from_dict(encoding_info, orient='index').reset_index()
        encoding_df.to_csv(os.path.join(outdir, "encoding_report.csv"), index=False)
    
    # 7. Save human-readable version
    print("  💾 Step 7: Saving human-readable version...")
    readable_path = os.path.join(outdir, "diabetes_enhanced_readable.csv")
    df_clean.to_csv(readable_path, index=False)
    
    # 8. Prepare ML-ready version
    print("  🤖 Step 8: Preparing ML-ready version...")
    df_ml = df_clean.copy()
    
        # Get updated categorical columns (excluding target encoded columns for one-hot encoding)
    categorical_for_encoding = [col for col in categorical_cols 
                               if not any(f'{col}_target_encoded' in colname for colname in df_ml.columns)]
    
    # One-hot encoding for categorical variables
    if categorical_for_encoding:
        print(f"     Applying one-hot encoding to: {categorical_for_encoding}")
        df_ml = pd.get_dummies(df_ml, columns=categorical_for_encoding, drop_first=False)
    
    # Update numeric columns list (include target encoded features, exclude categorical features)
    target_encoded_cols = [col for col in df_ml.columns if 'target_encoded' in col]
    
    # Filter numeric_cols to only include truly numeric columns that exist in df_ml
    final_numeric_cols = []
    for col in numeric_cols:
        if col in df_ml.columns and pd.api.types.is_numeric_dtype(df_ml[col]):
            final_numeric_cols.append(col)
    
    # Add target encoded columns
    final_numeric_cols.extend(target_encoded_cols)
    
    # 9. Scaling numeric features (excluding year which is now categorical)
    if final_numeric_cols:
        print(f"     Scaling {len(final_numeric_cols)} numeric features...")
        scaler = StandardScaler()
        df_ml[final_numeric_cols] = scaler.fit_transform(df_ml[final_numeric_cols])
        
        # Save scaler for later use
        import joblib
        scaler_path = os.path.join(outdir, "feature_scaler.pkl")
        joblib.dump(scaler, scaler_path)
        print(f"     Scaler saved to: {scaler_path}")
    
    # 10. Feature selection (optional - select top K features for numeric columns only)
    if target_col and target_col in df_ml.columns and len(df_ml.columns) > 50:
        print("  🎯 Step 9: Feature selection (too many features detected)...")
        
        # Separate features and target
        X = df_ml.drop(columns=[target_col])
        y = df_ml[target_col]
        
        # Only apply feature selection to numeric columns
        numeric_feature_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]
        categorical_feature_cols = [col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])]
        
        if numeric_feature_cols and len(numeric_feature_cols) > 30:
            # Select top K numeric features
            k = min(20, len(numeric_feature_cols))  # Select top 20 numeric or all available
            selector = SelectKBest(score_func=f_classif, k=k)
            X_numeric_selected = selector.fit_transform(X[numeric_feature_cols], y)
            
            # Get selected feature names
            selected_numeric_features = pd.Series(numeric_feature_cols)[selector.get_support()].tolist()
            
            # Combine selected numeric features with all categorical features and target
            selected_features = selected_numeric_features + categorical_feature_cols + [target_col]
            
            df_ml = df_ml[selected_features]
            
            # Save feature selection info
            feature_scores = pd.DataFrame({
                'feature': numeric_feature_cols,
                'score': selector.scores_,
                'selected': selector.get_support()
            }).sort_values('score', ascending=False)
            
            feature_scores.to_csv(os.path.join(outdir, "feature_selection_report.csv"), index=False)
            print(f"     Selected {k} numeric features out of {len(numeric_feature_cols)} (kept all {len(categorical_feature_cols)} categorical features)")
        else:
            print("     Skipping feature selection - not enough numeric features or features already manageable")
    
    # 11. Save ML-ready version
    print("  💾 Step 10: Saving ML-ready version...")
    ml_path = os.path.join(outdir, "diabetes_enhanced_ml_ready.csv")
    df_ml.to_csv(ml_path, index=False)
    
    # 12. Generate processing summary
    processing_summary = {
        'original_rows': initial_shape[0],
        'original_columns': initial_shape[1],
        'final_rows': df_ml.shape[0],
        'final_columns': df_ml.shape[1],
        'duplicates_removed': initial_shape[0] - df_clean.shape[0],
        'numeric_features': len([c for c in final_numeric_cols if c in df_ml.columns]),
        'categorical_features_encoded': len(categorical_for_encoding),
        'new_features_created': len(new_features) if use_feature_engineering else 0,
        'outlier_method': handle_outliers_method,
        'feature_selection_applied': 'Yes' if len(df_ml.columns) != len(df_clean.columns) else 'No',
        'total_final_features': len(df_ml.columns) - (1 if target_col in df_ml.columns else 0)
    }
    
    summary_df = pd.DataFrame.from_dict(processing_summary, orient='index', columns=['value'])
    summary_df.to_csv(os.path.join(outdir, "processing_summary.csv"))
    
    print(f"✅ Enhanced preprocessing completed!")
    print(f"   📁 Human-readable data: {readable_path}")
    print(f"   🤖 ML-ready data: {ml_path}")
    print(f"   📊 Final dataset: {df_ml.shape[0]} rows × {df_ml.shape[1]} columns")
    
    return readable_path, ml_path, processing_summary

In [11]:
# Interactive runner (notebook-friendly)
# This cell replaces the script-style `main()` function. Execute to load data and
# optionally run EDA and preprocessing steps interactively.

# --- Configuration (edit paths as needed) ---
raw_path = "../data/raw/diabetes_dataset_E.csv"  # relative to this notebook
reports_dir = "../data/reports_enhanced"
outdir = "../data/processed_enhanced"

# Safety toggles - set True to run the step when you execute this cell
run_eda = False
run_preprocessing = False

# Load dataset
print(f"📂 Loading dataset from: {raw_path}")
df = pd.read_csv(raw_path)
print(f"   Dataset loaded: {df.shape[0]} rows × {df.shape[1]} columns")
run_eda = True  # Enable EDA by default for interactive exploration
run_preprocessing = True  # Enable preprocessing by default for interactive exploration
# Auto-detect target (if any)
target_col = guess_target(df)
if target_col:
    print(f"🎯 Auto-detected target column: {target_col}")
else:
    print("⚠️  No target column auto-detected; pass `target_col` explicitly to functions if needed")

# Run EDA if requested
if run_eda:
    print("\n🔍 Running comprehensive EDA...")
    ensure_dir(reports_dir)
    basic_info = comprehensive_eda(df, reports_dir=reports_dir, target_col=target_col)
    print("EDA finished. Reports saved to:", reports_dir)

# Run preprocessing if requested
if run_preprocessing:
    print("\n🔄 Running enhanced preprocessing...")
    ensure_dir(outdir)
    readable_path, ml_path, summary = enhanced_preprocessing(
        df,
        outdir=outdir,
        target_col=target_col,
        handle_outliers_method='cap',
        use_feature_engineering=True,
    )
    print("Preprocessing finished. Outputs:")
    print(" - Human-readable:", readable_path)
    print(" - ML-ready:", ml_path)
    print(" - Summary:", summary)

print("\n✅ Interactive runner cell complete. Toggle `run_eda` / `run_preprocessing` and re-run to execute steps.")

📂 Loading dataset from: ../data/raw/diabetes_dataset_E.csv
   Dataset loaded: 100000 rows × 28 columns
🎯 Auto-detected target column: diabetes

🔍 Running comprehensive EDA...
🔍 Running Comprehensive EDA...
   Dataset loaded: 100000 rows × 28 columns
🎯 Auto-detected target column: diabetes

🔍 Running comprehensive EDA...
🔍 Running Comprehensive EDA...
✅ Enhanced EDA completed. Reports saved to: ../data/reports_enhanced
EDA finished. Reports saved to: ../data/reports_enhanced

🔄 Running enhanced preprocessing...
🔄 Starting Enhanced Preprocessing Pipeline...
  📋 Step 1: Basic cleaning...
✅ Enhanced EDA completed. Reports saved to: ../data/reports_enhanced
EDA finished. Reports saved to: ../data/reports_enhanced

🔄 Running enhanced preprocessing...
🔄 Starting Enhanced Preprocessing Pipeline...
  📋 Step 1: Basic cleaning...
     Removed 0 duplicate rows
  🔍 Step 2: Analyzing column types...
     Moving 'year' from numeric to categorical (ordinal treatment)
  🩹 Step 3: Enhanced imputation...

In [None]:
# --- RandomForest baseline + hyperparameter tuning (RandomizedSearchCV) ---
# Runs a randomized hyperparameter search, compares CV performance vs test to detect overfitting,
# and optionally trains a simple Keras MLP with BatchNormalization, Dropout and EarlyStopping

import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
import joblib

# Optional: neural net tools (try/except so notebook still works if TF not installed)
try:
    import tensorflow as tf
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, BatchNormalization
    from keras.callbacks import EarlyStopping
    from sklearn.metrics import make_scorer, recall_score
    TF_AVAILABLE = True
except Exception as e:
    TF_AVAILABLE = False

# Paths (edit if needed)
ml_path = "../data/processed_enhanced/diabetes_enhanced_ml_ready.csv"
models_dir = "../models"
ensure_dir(models_dir)

if not os.path.exists(ml_path):
    print(f"ML-ready CSV not found at: {ml_path}. Run preprocessing first or update `ml_path`.")
else:
    df_ml = pd.read_csv(ml_path)
    print(f"Loaded ML-ready data: {df_ml.shape}")

    # Detect target
    target_col = guess_target(df_ml)
    if not target_col:
        raise ValueError("No target column found in ML-ready CSV. Please supply `target_col` or ensure preprocessing kept the target column.")
    print(f"Using target column: {target_col}")

    X = df_ml.drop(columns=[target_col])
    y = df_ml[target_col]
    
    # One-hot encode object columns if any remain
    obj_cols = X.select_dtypes(include=['object']).columns.tolist()
    if obj_cols:
        print('One-hot encoding leftover object columns:', obj_cols)
        X = pd.get_dummies(X, columns=obj_cols, drop_first=False)

    # Simple NaN check
    if X.isnull().any().any():
        raise ValueError('ML features contain NaNs. Please fix preprocessing before training.')

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

    # Baseline estimator (RandomForest with RandomizedSearchCV)
    base_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, class_weight='balanced')

    # Parameter distributions for RandomizedSearch
    param_dist = {
        'n_estimators': [100, 200, 400, 800],
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': ['sqrt', 'log2', 0.2, 0.5],
        'bootstrap': [True, False]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scorer = make_scorer(recall_score, pos_label=1)
    rs = RandomizedSearchCV(
        estimator=base_clf,
        param_distributions=param_dist,
        n_iter=25,
        scoring='roc_auc',
        n_jobs=-1,
        cv=cv,
        random_state=42,
        verbose=1,
        return_train_score=True
    )

    print('Running RandomizedSearchCV (25 iterations) to reduce overfitting risk...')
    rs.fit(X_train, y_train)

    print('Best params:', rs.best_params_)
    best = rs.best_estimator_

    # Cross-validated performance of best estimator on training folds
    train_cv_scores = cross_val_score(best, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
    print(f"Train CV ROC-AUC: mean={train_cv_scores.mean():.4f} std={train_cv_scores.std():.4f}")

    # Evaluate on test set
    if hasattr(best, 'predict_proba'):
        y_prob = best.predict_proba(X_test)[:, 1]
    else:
        y_prob = best.predict(X_test)
#################################################################
    for thresh in [0.5, 0.4, 0.3]:
        print (f"Threshold: {thresh}")
        y_pred_thresh = (y_prob >= thresh).astype(int)
        print(classification_report(y_test, y_pred_thresh))

    test_roc = roc_auc_score(y_test, y_prob)
    test_pr = average_precision_score(y_test, y_prob)
    print(f"RF Test ROC-AUC: {test_roc:.4f}, RF Test PR-AUC: {test_pr:.4f}")
    print('Classification report (RF test set):')
    print(classification_report(y_test, best.predict(X_test)))

    # Overfitting check: compare training CV mean vs test
    delta = train_cv_scores.mean() - test_roc
    print(f"CV mean - Test ROC difference (RF): {delta:.4f} (positive → possible overfitting)")

    # Save RF best model and artifacts
    rf_model_path = os.path.join(models_dir, 'diabetes_rf_tuned.pkl')
    joblib.dump(best, rf_model_path)

    manifest = {'features': X.columns.tolist(), 'target': target_col}
    with open(os.path.join(models_dir, 'feature_columns.json'), 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)

    report = {
        'rf_best_params': rs.best_params_,
        'rf_train_cv_mean': float(train_cv_scores.mean()),
        'rf_train_cv_std': float(train_cv_scores.std()),
        'rf_test_roc_auc': float(test_roc),
        'rf_test_pr_auc': float(test_pr),
        'rf_model_path': rf_model_path,
        'ml_path': ml_path
    }
    pd.DataFrame([report]).to_csv(os.path.join(models_dir, 'training_report.csv'), index=False)

    # Feature importances
    importances = pd.Series(best.feature_importances_, index=X.columns).sort_values(ascending=False)
    importances.head(50).to_csv(os.path.join(models_dir, 'feature_importances_top50.csv'))

    print('Saved tuned RF model to:', rf_model_path)
    print('Saved RF report and feature importances in', models_dir)

    # Quick suggestions based on RF results
    if delta > 0.05:
        print('\n⚠️  Significant drop from CV → test for RF (delta > 0.05). Suggestions:')
        print(' - Reduce model complexity (lower max_depth, increase min_samples_leaf).')
        print(' - Use stronger regularization or fewer features (SelectFromModel, Drop low-importance features).')
        print(' - Verify no data leakage and that preprocessing didn\'t use target information.')
        print(' - Try gradient boosting with early stopping (XGBoost/LightGBM/CatBoost) and proper validation folds.')

    # --- Optional: Train a small Keras MLP with BatchNorm, Dropout and EarlyStopping to compare ---
    run_nn = True  # Set to False to skip NN training
    if run_nn:
        if not TF_AVAILABLE:
            print('TensorFlow not available. Skipping NN training. Install tensorflow to enable this option.')
        else:
            # Ensure reproducibility
            tf.random.set_seed(42)
            np.random.seed(42)

            # Convert to numpy arrays (TF expects floats)
            Xtr = X_train.values.astype('float32')
            Xte = X_test.values.astype('float32')
            ytr = y_train.values.astype('float32')
            yte = y_test.values.astype('float32')

            input_dim = Xtr.shape[1]

            def build_mlp(input_dim, dropout_rate=0.4):
                model = Sequential()
                model.add(Dense(256, activation='relu', input_shape=(input_dim,)))
                model.add(BatchNormalization())
                model.add(Dropout(dropout_rate))
                model.add(Dense(128, activation='relu'))
                model.add(BatchNormalization())
                model.add(Dropout(dropout_rate))
                model.add(Dense(64, activation='relu'))
                model.add(BatchNormalization())
                model.add(Dropout(dropout_rate))
                model.add(Dense(1, activation='sigmoid'))
                model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
                return model

            print('Training Keras MLP with BatchNorm, Dropout and EarlyStopping...')
            mlp = build_mlp(input_dim, dropout_rate=0.4)
            es = EarlyStopping(monitor='val_auc', mode='max', patience=10, restore_best_weights=True, verbose=1)

            history = mlp.fit(
                Xtr, ytr,
                validation_split=0.2,
                epochs=100,
                batch_size=32,
                callbacks=[es],
                verbose=1
            )

            # Evaluate NN on test set
            y_prob_nn = mlp.predict(Xte).ravel()
            test_roc_nn = roc_auc_score(yte, y_prob_nn)
            test_pr_nn = average_precision_score(yte, y_prob_nn)
            print(f"NN Test ROC-AUC: {test_roc_nn:.4f}, NN Test PR-AUC: {test_pr_nn:.4f}")

            # Overfitting check for NN: best validation AUC vs test AUC
            val_auc_hist = history.history.get('val_auc', [])
            best_val_auc = max(val_auc_hist) if val_auc_hist else None
            if best_val_auc is not None:
                delta_nn = best_val_auc - test_roc_nn
                print(f"Best Val AUC - Test ROC difference (NN): {delta_nn:.4f} (positive → possible overfitting)")
            else:
                delta_nn = None

            # Save NN model and report if desirable
            nn_model_path = os.path.join(models_dir, 'diabetes_nn_dropout_batchnorm.h5')
            mlp.save(nn_model_path)
            print('Saved NN model to:', nn_model_path)

            # Compare RF vs NN on test ROC and choose winner
            chosen = 'rf' if test_roc >= test_roc_nn else 'nn'
            print(f'Chosen model based on test ROC: {chosen} (rf={test_roc:.4f}, nn={test_roc_nn:.4f})')

            # Append NN info to report file
            extra = {
                'nn_test_roc_auc': float(test_roc_nn),
                'nn_test_pr_auc': float(test_pr_nn),
                'nn_model_path': nn_model_path,
                'nn_best_val_auc': float(best_val_auc) if best_val_auc is not None else None,
                'nn_val_minus_test': float(delta_nn) if delta_nn is not None else None,
                'chosen_model': chosen
            }
            # Merge and save full report
            report.update(extra)
            pd.DataFrame([report]).to_csv(os.path.join(models_dir, 'training_report.csv'), index=False)

    print('Done.')

Loaded ML-ready data: (100000, 23)
Using target column: diabetes
One-hot encoding leftover object columns: ['bmi_risk_level', 'age_diabetes_risk']
Train shape: (80000, 29) Test shape: (20000, 29)
Running RandomizedSearchCV (25 iterations) to reduce overfitting risk...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best params: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': 10, 'bootstrap': True}
Best params: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': 10, 'bootstrap': True}
Train CV ROC-AUC: mean=0.9760 std=0.0013
Train CV ROC-AUC: mean=0.9760 std=0.0013
Threshold: 0.5
              precision    recall  f1-score   support

           0       0.99      0.90      0.94     18300
           1       0.46      0.90      0.61      1700

    accuracy                           0.90     20000
   macro avg       0.72      0.90      0.77     20000
weighted avg       0



NN Test ROC-AUC: 0.9738, NN Test PR-AUC: 0.8667
Best Val AUC - Test ROC difference (NN): -0.0006 (positive → possible overfitting)
Saved NN model to: ../models\diabetes_nn_dropout_batchnorm.h5
Chosen model based on test ROC: rf (rf=0.9748, nn=0.9738)
Done.
Done.


In [19]:
print (X.columns)

Index(['age', 'bmi', 'hbA1c_level', 'blood_glucose_level', 'gender_Female',
       'gender_Male', 'smoking_history_No Info', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current',
       'bmi_category_Normal', 'bmi_category_Obese', 'bmi_category_Overweight',
       'bmi_category_Underweight', 'age_group_Adult', 'age_group_Child',
       'age_group_Middle-aged', 'age_group_Senior', 'bmi_risk_level_normal',
       'bmi_risk_level_obese_1', 'bmi_risk_level_obese_2',
       'bmi_risk_level_overweight', 'bmi_risk_level_underweight',
       'age_diabetes_risk_high_risk', 'age_diabetes_risk_low_risk',
       'age_diabetes_risk_moderate_risk', 'age_diabetes_risk_very_high_risk'],
      dtype='object')


In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score, precision_score

# Get probabilities for positive class
y_proba = best.predict_proba(X_test)[:, 1]

# Compute PR curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
ap_score = average_precision_score(y_test, y_proba)

# Plot Precision–Recall vs Threshold
plt.figure(figsize=(8,6))
plt.plot(thresholds, precision[:-1], "b--", label="Precision")
plt.plot(thresholds, recall[:-1], "g-", label="Recall")
plt.xlabel("Decision Threshold")
plt.ylabel("Score")
plt.title(f"Precision-Recall vs Threshold (AP = {ap_score:.3f})")
plt.legend()
plt.grid(True)
plt.show()

# Print values at common thresholds
for t in [0.5, 0.4, 0.3]:
    
    y_pred = (y_proba >= t).astype(int)
    p = precision_score(y_test, y_pred)
    r = recall_score(y_test, y_pred)
    print(f"Threshold {t:.2f}: Precision={p:.3f}, Recall={r:.3f}")


Threshold 0.50: Precision=0.456, Recall=0.901
Threshold 0.40: Precision=0.378, Recall=0.936
Threshold 0.30: Precision=0.335, Recall=0.965


In [21]:
import numpy as np
import joblib

# Load model
rf_model = joblib.load("../models/diabetes_rf_tuned.pkl")

# 1️⃣ Numeric features
numeric_features = [
    0.8,    # age (normalized)
    1.2,    # bmi (normalized)
    1.5,    # hbA1c_level (high → indicates diabetes)
    1.3     # blood_glucose_level (high)
]

# Boolean / one-hot features (in exact training order)
boolean_features = [
    0,1,        # gender_Female, gender_Male
    0,0,1,0,0,0, # smoking_history_* (current smoker)
    0,0,1,0,    # bmi_category_* (Overweight)
    0,0,0,1,    # age_group_* (Senior)
    0,0,0,1,0,  # bmi_risk_level_* (overweight risk)
    0,0,1,0     # age_diabetes_risk_* (high_risk)
]

# ✅ Combine numeric + boolean features
sample_input = numeric_features + boolean_features

# Convert to 2D array
sample_input = np.array(sample_input).reshape(1, -1)

# 3️⃣ Make prediction
pred_class = rf_model.predict(sample_input)
pred_proba = rf_model.predict_proba(sample_input)

print("Predicted class:", pred_class[0])
print("Predicted probability (class 0, class 1):", pred_proba[0])


Predicted class: 1
Predicted probability (class 0, class 1): [0.00985749 0.99014251]
