# EDA, DP, FE & Baseline ML


In [None]:
# CONSOLIDATED SETUP CELL: ALL IMPORTS, CONFIGURATIONS & PATH MANAGEMENT

import os
import sys
from pathlib import Path
import unicodedata
import textwrap
import warnings
warnings.filterwarnings('ignore')

# Data Processing
import numpy as np
import pandas as pd
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scientific Computing
from scipy.stats import chi2_contingency

# Scikit-learn: Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

# Scikit-learn: Model Selection
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn: Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

# Scikit-learn: Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Optional: XGBoost
try:
    import xgboost as xgb
    has_xgb = True
except Exception:
    has_xgb = False
    print("⚠ Warning: XGBoost not installed. Gradient Boosting model will be skipped.")

# Imbalanced-learn: Balancing techniques
try:
    from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler, TomekLinks
    from imblearn.combine import SMOTETomek
    has_imblearn = True
except Exception:
    has_imblearn = False
    print("⚠ Warning: imbalanced-learn not installed. Run: pip install imbalanced-learn")

# Text processing
try:
    from rapidfuzz import process, fuzz
except Exception:
    print("⚠ Warning: rapidfuzz not installed. Run: pip install rapidfuzz")

# Display settings
pd.set_option('display.max_columns', 200)
sns.set(style='whitegrid')


# PATH MANAGEMENT: Dataset & Output Folder Setup

# Working directory of the running kernel. NOTE: Path.cwd() is the process's
# current directory; it equals the notebook's folder only when the kernel was
# started from (or has changed into) that folder.
NOTEBOOK_DIR = Path.cwd()
print(f"Notebook Directory: {NOTEBOOK_DIR}")

# Dataset filename and location
DATASET_NAME = 'PPD_dataset_v2'
DATASET_FILENAME = f'{DATASET_NAME}.csv'
DATASET_PATH = NOTEBOOK_DIR / DATASET_FILENAME

# Output folder lives next to the dataset and is named after it
OUTPUT_FOLDER = NOTEBOOK_DIR / f'{DATASET_NAME}_outputs'

print(f"Dataset Path: {DATASET_PATH}")
print(f"Output Folder: {OUTPUT_FOLDER}")

# Verify the dataset BEFORE touching the filesystem, so a failed run does not
# leave an empty output folder behind.
if not DATASET_PATH.is_file():
    raise FileNotFoundError(f"Dataset not found: {DATASET_PATH}\nPlease ensure '{DATASET_FILENAME}' is in the notebook directory.")
print(f"✓ Dataset found: {DATASET_FILENAME}")

# parents=True keeps this working even if an intermediate directory is missing
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

print("\n")
print(f"✓ Output files will be saved to: {OUTPUT_FOLDER}")
print("="*80)


In [None]:
# Load the dataset into `df`, retrying with Latin-1 if the default decoding fails
csv_path = str(DATASET_PATH)   # adjust path if needed

# First attempt uses read_csv's default encoding; a UnicodeDecodeError triggers
# a second read with encoding='latin1'.
try:
    df = pd.read_csv(csv_path)
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding='latin1')

# Quick sanity output: dimensions plus the first rows
print("File loaded successfully!")
print("Shape:", df.shape)
display(df.head())

## Column type detection


In [None]:
# Split the columns by dtype: numeric ones vs. everything else
num_cols = list(df.select_dtypes(include=[np.number]).columns)
cat_cols = list(df.select_dtypes(exclude=[np.number]).columns)

# Report both groups
print(f"Numeric columns ({len(num_cols)}): {num_cols}")
print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")

# Per-column null counts, largest first, keeping only columns that have any
missing = df.isnull().sum().sort_values(ascending=False)
missing = missing[missing > 0]

if missing.empty:
    print("\nNo missing values found in the dataset.")
else:
    # Counts alongside their share of all rows (percentage, 2 decimals)
    missing_df = pd.DataFrame({
        'Features': missing.index,
        'Missing Count': missing.values,
        'Missing %': missing.div(len(df)).mul(100).round(2).values
    }).reset_index(drop=True)
    print("\nColumns with missing values:")
    display(missing_df)

## Plotting plan
- For **numerical: histograms**
- For **categorical: bar charts**

## Individual Histogram for the numerical features

In [None]:
# Leave out serial-number / ID style columns (matched case-insensitively by name)
num_cols_clean = [c for c in num_cols if c.lower() not in ('sr', 's.no', 'serial', 'id')]

# One histogram (with KDE overlay) per remaining numeric column, 3 per row
n = len(num_cols_clean)
if not num_cols_clean:
    print("No numeric columns detected (after removing ID fields).")
else:
    ncols = 3
    nrows = -(-n // ncols)  # ceiling division
    plt.figure(figsize=(5 * ncols, 4 * nrows))
    for position, column in enumerate(num_cols_clean, start=1):
        values = df[column].dropna()
        plt.subplot(nrows, ncols, position)
        sns.histplot(values, kde=True, bins=30)
        # Skewness in the title makes asymmetric distributions easy to spot
        plt.title(f"{column}\n(skew={values.skew():.2f})")
    plt.tight_layout()
    plt.show()


## Top 5 numeric plots (selection & statistical explanation)

In [None]:
# Target column: 'EPDS Result' when the dataset has it, otherwise None
# (swap in another name such as 'PHQ9 Result' if that is the real target)
target_col = 'EPDS Result' if 'EPDS Result' in df.columns else None

# ID-like numeric columns are set aside and plotted separately further down
id_like = [c for c in num_cols if c.lower() in ['sr', 's.no', 'serial', 'id']]
num_cols_clean = [c for c in num_cols if c not in id_like]

# Keep the five highest-variance informative columns (all of them if <= 5).
# NOTE(review): variance depends on each column's unit/scale.
if len(num_cols_clean) > 5:
    variances = df[num_cols_clean].var().sort_values(ascending=False)
    top5 = variances.head(5).index.tolist()
else:
    top5 = num_cols_clean

print("Top 5 numeric features (by variance):", top5)

# One figure per selected column, followed by a one-line numeric summary
for c in top5:
    data_c = df[c].dropna()
    if data_c.empty:
        # Nothing to summarize: report NaN for every statistic
        skew_c = var_c = mean_c = median_c = min_c = max_c = float('nan')
    else:
        skew_c, var_c = data_c.skew(), data_c.var()
        mean_c, median_c = data_c.mean(), data_c.median()
        min_c, max_c = data_c.min(), data_c.max()

    plt.figure(figsize=(8,4))
    sns.histplot(data_c, kde=True, bins=30)
    plt.title(f"{c}\n(skew={skew_c:.2f}, var={var_c:.2f})")
    plt.tight_layout()
    plt.show()
    # quick textual explanation template
    print(f"Summary for {c}: mean={mean_c:.2f}, median={median_c:.2f}, min={min_c:.2f}, max={max_c:.2f}, skew={skew_c:.2f}, var={var_c:.2f}\n")

# ID-like / non-informative numeric columns get their own clearly titled figure
if id_like:
    if len(id_like) == 1:
        # Single column: one small standalone histogram
        c = id_like[0]
        plt.figure(figsize=(8,3))
        sns.histplot(df[c].dropna(), kde=False, bins=30)
        plt.title("non-informative numeric columns: " + c)
        plt.tight_layout()
        plt.show()
    else:
        # Several columns: side-by-side panels under a shared super-title
        panel_count = len(id_like)
        plt.figure(figsize=(6 * panel_count, 4))
        for i, c in enumerate(id_like, start=1):
            plt.subplot(1, panel_count, i)
            sns.histplot(df[c].dropna(), kde=False, bins=30)
            plt.title(c)
        plt.suptitle("non-informative numeric columns")
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()


## Text Normalization & Fuzzy Deduplication (Preserve NaN/None)


In [None]:
# Smart Text Normalization & Fuzzy Deduplication (Preserve NaN/None)


# Drop every non-ASCII character from the column names (encode with
# errors='ignore', then decode back to str).
# NOTE(review): num_cols was computed before this rename; if any numeric column
# name contained non-ASCII characters, num_cols now holds a stale name — confirm.
df.columns = df.columns.str.encode('ascii', 'ignore').str.decode('ascii')
# Use the DejaVu Sans font for all subsequent matplotlib text
plt.rcParams['font.family'] = 'DejaVu Sans'

# --- STEP 1: Fix encoding issues ---
def normalize_text(s):
    """Return ``s`` NFKD-normalized, with curly quotes straightened and outer whitespace removed.

    Anything that is not a ``str`` (numbers, NaN, None, ...) is returned untouched.
    """
    if not isinstance(s, str):
        return s
    # Typographic quotes -> their plain ASCII counterparts
    straight_quotes = str.maketrans({'’': "'", '‘': "'", '“': '"', '”': '"'})
    return unicodedata.normalize('NFKD', s).translate(straight_quotes).strip()

# Run normalize_text over every cell of the frame (non-strings pass through
# unchanged, see normalize_text).
df = df.map(normalize_text)

# --- STEP 2: Identify categorical columns ---
# Everything that is neither numeric nor boolean is treated as categorical;
# this replaces the cat_cols list computed earlier.
cat_cols = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()

# --- STEP 3: Clean casing, spaces, punctuation ---
def clean_string(s):
    """Lower-case ``s``, turn '-' and '_' into spaces, and collapse whitespace runs.

    Non-string inputs are returned as-is.
    """
    if not isinstance(s, str):
        return s
    text = s.lower().strip()
    for separator in ('-', '_'):
        text = text.replace(separator, ' ')
    # split()/join squeezes any run of whitespace down to one space
    return ' '.join(text.split())

# Clean every categorical column value-by-value (see clean_string)
for column_name in cat_cols:
    df[column_name] = df[column_name].map(clean_string)

# --- STEP 4: Fuzzy deduplication preserving NaNs as a counted category ---
def fuzzy_standardize_column(series, threshold=85, sentinel='NaN', keep_missing_label=True):
    """Collapse near-duplicate spellings in ``series`` onto one canonical form each.

    Distinct values are visited from most to least frequent. A value becomes a
    new canonical form unless its ``fuzz.ratio`` score against an existing
    canonical form is at least ``threshold``, in which case it is mapped to the
    best-scoring canonical form.

    Parameters
    ----------
    series : pandas.Series
        Column to standardize.
    threshold : int or float
        Minimum ``fuzz.ratio`` score for two values to be treated as the same.
    sentinel : str
        Label that stands in for missing values while the column is processed.
    keep_missing_label : bool
        If True, missing values stay as ``sentinel`` in the result (so they
        count as a category); if False, ``sentinel`` is turned back into NaN.

    Returns
    -------
    (pandas.Series, dict)
        The standardized column and the ``{original value: canonical value}``
        mapping that produced it.
    """
    # Fill first so that missing values show up among the distinct values
    filled = series.fillna(sentinel).astype(object)

    mapping = {}
    # Only true canonical forms are match candidates. Matching against every
    # mapped key would let a value attach to a variant that was itself merged
    # away, leaving the result only partially deduplicated.
    canonicals = []

    # value_counts() orders by descending frequency, so the most common
    # spelling of a group is the one that becomes canonical.
    for val in filled.value_counts().index:
        # The missing-value label and non-string values never take part in
        # fuzzy matching: they always map to themselves.
        if not isinstance(val, str) or val == sentinel:
            mapping[val] = val
            continue

        if canonicals:
            match, score, _ = process.extractOne(val, canonicals, scorer=fuzz.ratio)
            if score >= threshold:
                mapping[val] = match
                continue

        canonicals.append(val)
        mapping[val] = val

    # apply mapping to filled series
    mapped = filled.map(mapping)

    # optionally convert sentinel back to np.nan to preserve original NaN semantics
    if not keep_missing_label:
        mapped = mapped.replace({sentinel: np.nan})

    return mapped, mapping

# --- STEP 5: Apply fuzzy matching (keep_missing_label=True to count missing as a unique category) ---
for col in cat_cols:
    print(f"\nCleaning column: {col}")

    # Distinct values before cleaning, with missing represented by 'NaN'
    before_unique = df[col].fillna('NaN').unique()
    before_count = sum(1 for x in before_unique if x is not None)

    cleaned, mapping = fuzzy_standardize_column(
        df[col], threshold=85, sentinel='NaN', keep_missing_label=True
    )
    df[col] = cleaned

    # Distinct values after cleaning, counted the same way
    after_unique = df[col].fillna('NaN').unique()
    n_before = sum(1 for x in before_unique if pd.notna(x))
    n_after = sum(1 for x in after_unique if pd.notna(x))
    print(f"  Unique before (counting missing): {n_before} → after: {n_after}")

    # Show at most six of the values that were actually rewritten
    print("  Example mappings (changed values):")
    rewritten = [(k, v) for k, v in mapping.items() if k != v]
    for k, v in rewritten[:6]:
        print(f"    '{k}' → '{v}'")

# --- STEP 6: Check for non-ASCII characters ---
for col in cat_cols:
    bad_chars = df[col].astype(str).str.contains(r'[^\x00-\x7F]', regex=True)
    if bad_chars.any():
        print(f"Warning: Non-ASCII characters remain in column '{col}'")

print("\nNormalization & fuzzy deduplication complete (missing values can be counted as a category).")



## Individual Bar Chart for the categorical features

In [None]:
# One bar chart of value counts per categorical column
if not cat_cols:
    print("No categorical columns detected.")
else:
    for c in cat_cols:
        vc = df[c].value_counts(dropna=False)
        # Columns with more than 30 distinct values only show their 20 most frequent
        plotted = vc if len(vc) <= 30 else vc.head(20)

        plt.figure(figsize=(8,3))
        sns.barplot(x=plotted.index.astype(str), y=plotted.values)
        plt.xticks(rotation=45, ha='right')
        plt.title(f"Value counts for {c} (unique={df[c].nunique()})")
        plt.tight_layout()
        plt.show()


## Top 5 categorical plots (choosing & explaining including statistics)


In [None]:
# --- Top 5 conceptually relevant categorical plots for PPD detection ---
# Each column is listed exactly once and has a matching entry in `explanations`.
top5_cat = [
    'Feeling about motherhood',
    'Relationship with husband',
    'History of pregnancy loss',
    'Recieved Support',
    'Depression during pregnancy (PHQ2)'
]
# Explanations dictionary (keys must match the column names above verbatim)
explanations = {
    'Depression during pregnancy (PHQ2)': "Pre-existing or concurrent depression is the strongest predictor of PPD.",
    'Recieved Support': "Social support (from husband, family, friends) buffers against PPD.",
    'Relationship with husband': "Marital relationship quality is crucial in Bangladeshi households where extended family often influences daily life.",
    'History of pregnancy loss': "Prior loss is stressful and can increase anxiety and PPD risk.",
    'Feeling about motherhood': "Maternal feelings toward the newborn and self-efficacy are direct psychological indicators of vulnerability."
}

# Skip any listed column the loaded dataset does not contain, instead of
# failing with a KeyError halfway through the plots.
absent = [c for c in top5_cat if c not in df.columns]
if absent:
    print(f"⚠ Warning: columns not found in dataset, skipping: {absent}")
    top5_cat = [c for c in top5_cat if c in df.columns]

print("Top-5 categorical columns selected for PPD analysis:", top5_cat)

for c in top5_cat:
    # Counts and proportions both include missing values so the two tables agree
    vc = df[c].value_counts(dropna=False)
    vc_norm = df[c].value_counts(normalize=True, dropna=False).round(3)

    plt.figure(figsize=(8,4))
    sns.barplot(x=vc.index.astype(str), y=vc.values)
    plt.xticks(rotation=45, ha='right')
    plt.ylabel("Count")
    plt.title(f"{c} (unique={df[c].nunique()})")
    plt.tight_layout()
    plt.show()

    # Print explanation
    print("\nExplanation:")
    print(explanations[c])

    print("\n" + "-"*80 + "\n")

    print(f"Value counts for '{c}':")
    print(vc)
    print("\nNormalized (proportion) counts:")
    print(vc_norm)
    print("\n" + "-"*50 + "\n")


## Categorical vs Categorical


In [None]:
# --- Categorical vs Categorical analysis for PPD (single figure, one panel per feature) ---
target_col = 'EPDS Result'

# Large 3x2 canvas so the rotated tick labels stay readable
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(24, 20))
axes = axes.flatten()

for i, c in enumerate(top5_cat):
    ax = axes[i]
    feature, outcome = df[c], df[target_col]

    # Grouped bars: one bar per target class within each category
    sns.countplot(data=df, x=c, hue=target_col, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
    ax.set_ylabel("Count", fontsize=14)
    ax.set_title(f"{c} vs {target_col}", fontsize=16)
    ax.legend(title=target_col, fontsize=12, title_fontsize=14)

    # --- Text outputs accompanying each subplot ---
    print(f"Analyzing '{c}' vs {target_col}")

    # Raw contingency table
    ct = pd.crosstab(feature, outcome)
    print("\nCross-tabulation:")
    display(ct)

    # Same table with every row rescaled to sum to 1
    ct_norm = pd.crosstab(feature, outcome, normalize='index').round(3)
    print("\nNormalized proportions (row-wise):")
    display(ct_norm)

    print(f"Explanation for '{c}': {explanations[c]}")
    print("\n" + "-"*80 + "\n")

# Delete whichever grid cells were not used
for spare_ax in axes[len(top5_cat):]:
    fig.delaxes(spare_ax)

# Adjust spacing between subplots
plt.tight_layout()
plt.show()


## Categorical vs Numerical


In [None]:
# --- Categorical vs Numerical analysis for PPD (one 2x3 figure per numeric column) ---
numeric_cols = ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']

for num_col in numeric_cols:
    fig, axes = plt.subplots(2, 3, figsize=(24, 16))  # 2 rows x 3 columns
    axes = axes.flatten()

    print(f"Analyzing '{num_col}' vs top categorical features")
    for i, cat_col in enumerate(top5_cat):
        ax = axes[i]

        # Boxplot: numeric distribution per category, split by target class
        sns.boxplot(data=df, x=cat_col, y=num_col, hue='EPDS Result', ax=ax, palette='pastel')

        # Titles and labels
        ax.set_title(f"{num_col} vs {cat_col}", fontsize=14)
        ax.set_xlabel(cat_col, fontsize=12)
        ax.set_ylabel(num_col, fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=11)
        ax.tick_params(axis='y', labelsize=11)

        # Keep the legend on the first panel only. get_legend() returns None
        # when no legend was drawn, so guard before removing it.
        if i == 0:
            ax.legend(title='EPDS Result', fontsize=11, title_fontsize=12)
        else:
            panel_legend = ax.get_legend()
            if panel_legend is not None:
                panel_legend.remove()

        # --- Summary statistics (one table per numeric x categorical pair) ---
        display(df.groupby(cat_col)[num_col].describe())
        print("\n" + "-"*100 + "\n")

    # Remove any grid cell that did not receive a plot
    for spare_ax in axes[len(top5_cat):]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()


## Numerical vs Numerical pair plot


In [None]:
# --- Numerical vs Numerical analysis for PPD (darker pastel) ---
numeric_cols = ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']
target_col = 'EPDS Result'

# Pastel palette with every RGB channel scaled by 0.8 to darken it slightly
n_classes = df[target_col].nunique()
base_palette = sns.color_palette("pastel", n_colors=n_classes)
darker_palette = [(r*0.8, g*0.8, b*0.8) for r, g, b in base_palette]

# sns.pairplot creates its own figure, so no plt.figure() call is made here
# (one would only produce an extra blank figure). The overall size is set
# through `height`: 4 variables x 3 inches gives roughly a 12 x 12 inch grid.
sns.pairplot(df[numeric_cols + [target_col]],
             hue=target_col,
             palette=darker_palette,  # darker pastel
             diag_kind='kde',
             corner=False,            # corner=False shows the full matrix
             height=3)

# The pairplot figure is the current figure at this point
plt.suptitle("Numerical vs Numerical Pair Plot for PPD", fontsize=18, y=1.02)
plt.show()


## Missing data handling strategy
1. Show missing % per column.  
2. Impute numeric columns with median (configurable).  
3. Impute categorical with mode or 'missing' label.  
4. Optionally use KNNImputer for more advanced imputation (not included by default).


In [None]:
# --- Strip leading/trailing spaces from column names ---
df.columns = df.columns.str.strip()
cat_cols = [c.strip() for c in cat_cols]
# Serial/ID columns are not imputed; names are matched case-insensitively,
# using the same list as the plotting cells.
num_cols = [
    c.strip() for c in num_cols
    if c.strip().lower() not in ('sr', 's.no', 'serial', 'id')
]

# --- Missing values BEFORE imputation (using same approach as before) ---
# NOTE(review): the fuzzy-deduplication step was run with
# keep_missing_label=True, so missing categorical values already hold the
# string 'NaN' and are not counted as null here — confirm this is intended.
missing_before = df.isnull().sum().sort_values(ascending=False)
missing_before = missing_before[missing_before > 0]

missing_before_df = pd.DataFrame({
    'Features': missing_before.index,
    'Missing Count (Before)': missing_before.values,
    'Missing % (Before)': round((missing_before / len(df)) * 100, 2).values
}).reset_index(drop=True)

# --- Copy dataframe for preprocessing ---
df_proc = df.copy()

# --- Numeric imputation ---
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later, so test rows influence the fill values.
use_knn = False  # Set True to use KNN imputer, False to use median

if num_cols:
    if use_knn:
        knn_imputer = KNNImputer(n_neighbors=5)
        df_proc[num_cols] = knn_imputer.fit_transform(df_proc[num_cols])
    else:
        num_imputer = SimpleImputer(strategy='median')
        df_proc[num_cols] = num_imputer.fit_transform(df_proc[num_cols])

# --- Categorical imputation ---
for c in cat_cols:
    df_proc[c] = df_proc[c].fillna('Missing')

# --- Missing values AFTER imputation (same columns, same order as before) ---
missing_after = df_proc[missing_before.index].isnull().sum()
missing_after_df = pd.DataFrame({
    'Features': missing_after.index,
    'Missing Count (After)': missing_after.values,
    'Missing % (After)': round((missing_after / len(df_proc)) * 100, 2).values
}).reset_index(drop=True)

# --- Combined table: Before vs After ---
# Both frames list the same features in the same order, so a positional
# column-wise concat lines the rows up.
missing_summary = pd.concat([
    missing_before_df,
    missing_after_df[['Missing Count (After)', 'Missing % (After)']]
], axis=1)

print("Missing Data Handling: Before vs After Imputation")
display(missing_summary)



## Stratified sampling
We will split data into train/test preserving the target distribution using `stratify=y`.  
**Important:** set `target_col` to the actual target variable name in your dataset.


In [None]:
# --- Target column ---
target_col = 'EPDS Result'   # replace if needed

# Fail early, with a readable message, when the target is absent
if target_col not in df_proc.columns:
    raise ValueError(f"Target column '{target_col}' not found. Update target_col variable.")

# Target vector and feature matrix (every other column)
y = df_proc[target_col]
X = df_proc.drop(columns=[target_col])

# Original class distribution
print("Original class distribution:")
display(Counter(y))

# --- Stratified 80/20 train/test split (fixed seed for reproducibility) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,          # keep class proportions equal in both parts
    random_state=42,
)

# Show the class counts of each part
for part_name, part_labels in (("Train", y_train), ("Test", y_test)):
    print(f"\n{part_name} class distribution:")
    display(Counter(part_labels))



## One-Hot Encoding


In [None]:
# --- Feature columns that are categorical (order follows X_train) ---
cat_cols_to_encode = X_train.columns[X_train.columns.isin(cat_cols)].tolist()

print(f"Categorical columns to encode ({len(cat_cols_to_encode)}): {cat_cols_to_encode}")

# --- Dense one-hot encoder; categories unseen during fit encode as all zeros ---
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training rows only, then apply to both parts
X_train_encoded = ohe.fit_transform(X_train[cat_cols_to_encode])
X_test_encoded = ohe.transform(X_test[cat_cols_to_encode])

# Generated indicator-column names
encoded_cols = ohe.get_feature_names_out(cat_cols_to_encode)

# Wrap the arrays in DataFrames that keep the original row index
X_train_ohe = pd.DataFrame(X_train_encoded, index=X_train.index, columns=encoded_cols)
X_test_ohe = pd.DataFrame(X_test_encoded, index=X_test.index, columns=encoded_cols)

# --- Swap the raw categorical columns for their indicator columns ---
X_train_final = pd.concat([X_train.drop(columns=cat_cols_to_encode), X_train_ohe], axis=1)
X_test_final = pd.concat([X_test.drop(columns=cat_cols_to_encode), X_test_ohe], axis=1)

print("Training set shape after one-hot encoding:", X_train_final.shape)
print("Test set shape after one-hot encoding:", X_test_final.shape)

print("\n--- Sample of One-Hot Encoded Features (first 5 rows) ---")
print(f"Number of one-hot encoded columns: {len(encoded_cols)}")
display(X_train_ohe.head())

# Full list of generated names, wrapped at 120 characters for readability
print("\n--- One-Hot Encoded Feature Names ---")
print(f"Total encoded features: {len(encoded_cols)}")
print("All encoded column names:")
import textwrap
col_list_str = ', '.join(encoded_cols)
wrapped_cols = textwrap.fill(col_list_str, width=120)
print(wrapped_cols)



##  Standardization
We will StandardScale numeric features: ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']

In [None]:
# Numeric columns to scale
num_cols_to_scale = ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']

# --- Standardization: statistics come from the training rows only ---
scaler = StandardScaler()
scaler.fit(X_train[num_cols_to_scale].copy())

X_train_num = pd.DataFrame(
    scaler.transform(X_train[num_cols_to_scale].copy()),
    index=X_train.index,
    columns=num_cols_to_scale,
)
X_test_num = pd.DataFrame(
    scaler.transform(X_test[num_cols_to_scale].copy()),
    index=X_test.index,
    columns=num_cols_to_scale,
)

# --- One-hot columns, minus any column that shares a name with a scaled one ---
X_train_cat = X_train_ohe.drop(columns=num_cols_to_scale, errors='ignore')
X_test_cat = X_test_ohe.drop(columns=num_cols_to_scale, errors='ignore')

# --- Final design matrices: scaled numerics followed by one-hot columns ---
X_train_final = pd.concat([X_train_num, X_train_cat], axis=1)
X_test_final = pd.concat([X_test_num, X_test_cat], axis=1)

# --- Display after standardization ---
print("Training set (first 5 rows) - Standardized Numeric Features:")
display(X_train_num.head())

print("\nTest set (first 5 rows) - Standardized Numeric Features:")
display(X_test_num.head())

print("\nFinal training set shape:", X_train_final.shape)
print("Final test set shape:", X_test_final.shape)



## Balancing techniques
- SMOTE-NC (synthetic oversampling)


In [None]:
if not has_imblearn:
    print("imbalanced-learn not installed. Install with `pip install imbalanced-learn` to use SMOTE-NC.")
else:
    # Kept for reference: the fully encoded training matrix (scaled numerics + one-hot columns)
    X_tr_combined = X_train_final.values
    y_tr = y_train.copy()

    # Encode target if it's categorical
    target_is_categorical = y_tr.dtype == 'O' or y_tr.dtype.name == 'category'
    if target_is_categorical:
        le = LabelEncoder()
        y_tr_enc = le.fit_transform(y_tr)
        classes = le.classes_
    else:
        y_tr_enc = y_tr.values
        classes = np.unique(y_tr_enc)

    print("Original training distribution:", Counter(y_tr))

    # SMOTE-NC is given the categorical features in their RAW (pre-one-hot)
    # form. Passing each one-hot indicator as an independent categorical
    # feature lets the sampler choose every indicator separately, which can
    # yield synthetic rows with zero or several active indicators for a single
    # source column.
    numeric_feature_names = list(X_train_num.columns)
    raw_train = pd.concat([X_train_num, X_train[cat_cols_to_encode]], axis=1)
    categorical_indices = list(range(len(numeric_feature_names), raw_train.shape[1]))

    print(f"\nNumeric features: {numeric_feature_names}")
    print(f"Categorical features (resampled before one-hot encoding): {len(categorical_indices)} columns")

    # Apply SMOTE-NC (Synthetic Minority Over-sampling Technique for Nominal and Continuous)
    sm = SMOTENC(categorical_features=categorical_indices, random_state=42)
    raw_bal, y_train_bal_enc = sm.fit_resample(raw_train, y_tr_enc)
    raw_bal = pd.DataFrame(raw_bal, columns=raw_train.columns)

    # Re-encode the resampled categoricals with the encoder already fitted on
    # the training data, so the columns match X_train_final exactly.
    ohe_bal = pd.DataFrame(
        ohe.transform(raw_bal[cat_cols_to_encode]),
        columns=ohe.get_feature_names_out(cat_cols_to_encode),
        index=raw_bal.index,
    )
    X_train_bal_df = pd.concat([raw_bal[numeric_feature_names], ohe_bal], axis=1)
    X_train_bal_df = X_train_bal_df[list(X_train_final.columns)].astype(float)
    X_train_bal = X_train_bal_df.values

    # Decode back if target was categorical
    if target_is_categorical:
        y_train_bal = le.inverse_transform(y_train_bal_enc)
    else:
        y_train_bal = y_train_bal_enc

    print("\nAfter SMOTE-NC (balanced training set):", Counter(y_train_bal))
    print("X_train_bal shape:", X_train_bal.shape)
    print("y_train_bal shape:", y_train_bal.shape)

    print("\n--- Sample of Features After SMOTE-NC Balancing (first 5 rows) ---")
    display(X_train_bal_df.head())



## Correlation matrix & feature drop
- Measure feature–target association: Pearson correlation for numeric features and Cramér's V for categorical features.
- Then drop highly collinear features (threshold configurable).


In [None]:
# --- Feature Correlation Analysis & Feature Drop Suggestions ---

# --- Working column sets ---
numeric_cols = ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']
target_col = 'EPDS Result'

# Serial/ID-like columns (matched by name, case-insensitively) are left out
id_like_cols = [c for c in df_proc.columns if c.lower() in ['sr', 's.no', 'serial', 'id']]
print(f"Excluding ID-like columns: {id_like_cols}")

# --- Pairwise correlation among the numeric features ---
corr_num = df_proc[numeric_cols].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_num, annot=True, fmt=".2f", cmap='Blues', cbar=True)
plt.title("Correlation Matrix (Numeric Features)")
plt.show()

# Collect every pair above the diagonal whose |correlation| exceeds the threshold
threshold = 0.85
high_corr_pairs = [
    (numeric_cols[i], numeric_cols[j], corr_num.iloc[i, j])
    for i in range(len(numeric_cols))
    for j in range(i + 1, len(numeric_cols))
    if abs(corr_num.iloc[i, j]) > threshold
]

if not high_corr_pairs:
    print("No highly correlated numeric feature pairs found.")
else:
    print("Highly correlated numeric feature pairs (consider dropping one):")
    for first_name, second_name, coefficient in high_corr_pairs:
        print(f"{first_name} <--> {second_name} | corr = {coefficient:.2f}")

# --- Categorical correlation using Cramér's V ---
# Categorical = every column that is not numeric, not the target and not ID-like
excluded_cols = numeric_cols + [target_col] + id_like_cols
cat_cols = [c for c in df_proc.columns if c not in excluded_cols]

def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical series.

    The value is computed from the contingency table of ``x`` and ``y``
    (chi-square without continuity correction). Degenerate tables — no
    observations, a single row or a single column, or a non-positive corrected
    denominator — yield 0.0 instead of raising or dividing by zero.
    """
    table = pd.crosstab(x, y)
    total = table.values.sum()
    n_rows, n_cols = table.shape
    if total == 0 or n_rows == 1 or n_cols == 1:
        return 0.0

    phi2 = chi2_contingency(table, correction=False)[0] / total

    if total > 1:
        # Shrink phi² and the table dimensions by their small-sample bias terms
        dof = total - 1
        phi2_adj = max(0.0, phi2 - ((n_cols - 1) * (n_rows - 1)) / dof)
        rows_adj = n_rows - ((n_rows - 1) ** 2) / dof
        cols_adj = n_cols - ((n_cols - 1) ** 2) / dof
    else:
        phi2_adj, rows_adj, cols_adj = 0.0, n_rows, n_cols

    denom = min(cols_adj - 1, rows_adj - 1)
    if denom <= 0:
        return 0.0
    return float(np.sqrt(phi2_adj / denom))

# Pairwise Cramér's V among the first 20 categorical features (capped to save time)
top_cat = cat_cols[:20]
cramers_matrix = pd.DataFrame(
    [
        [1.0 if row_name == col_name else cramers_v(df_proc[row_name], df_proc[col_name])
         for col_name in top_cat]
        for row_name in top_cat
    ],
    index=top_cat,
    columns=top_cat,
).astype(float)

plt.figure(figsize=(12,10))
sns.heatmap(cramers_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.title("Cramér's V Matrix (Categorical Features - Top 20)")
plt.show()

# --- Association of every categorical feature with the target, strongest first ---
cramers_target = pd.Series(
    {c: cramers_v(df_proc[c], df_proc[target_col]) for c in cat_cols}
).sort_values(ascending=False)
print("Cramér's V with target (top correlated categorical features):")
display(cramers_target.head(15))

# --- COMBINED CORRELATION MATRIX: Numeric features vs Target ---
print("\n" + "="*80)
print("COMBINED CORRELATION MATRIX: Numeric Features with Target")
print("="*80)

# Pearson correlation of each numeric feature with the integer-encoded target.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which is not
# necessarily a meaningful ordering of the classes — confirm before interpreting.
le_temp = LabelEncoder()
target_encoded = le_temp.fit_transform(df_proc[target_col])

numeric_target_corr = {
    col: np.corrcoef(df_proc[col].values, target_encoded)[0, 1]
    for col in numeric_cols
}

# Tabulate and rank by absolute correlation
numeric_target_df = pd.DataFrame(
    list(numeric_target_corr.items()),
    columns=['Feature', 'Correlation with Target'],
).sort_values(by='Correlation with Target', key=abs, ascending=False)

print("\nNumeric Features - Correlation with Target:")
display(numeric_target_df)

# Side-by-side bar charts: numeric correlations (left), categorical associations (right)
fig, axes = plt.subplots(1, 2, figsize=(18, 10))
left_ax, right_ax = axes[0], axes[1]

# Left: numeric features, sorted by signed correlation
numeric_target_sorted = pd.Series(numeric_target_corr).sort_values()
left_ax.barh(numeric_target_sorted.index, numeric_target_sorted.values, color='steelblue')
left_ax.set_xlabel('Correlation Coefficient', fontsize=11)
left_ax.set_title('Numeric Features - Correlation with Target', fontsize=12, fontweight='bold')
left_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
left_ax.grid(axis='x', alpha=0.3)

# Right: the 15 categorical features most associated with the target
top_cat_target = cramers_target.head(15)
bar_positions = range(len(top_cat_target))
right_ax.barh(bar_positions, top_cat_target.values, color='coral')
right_ax.set_yticks(bar_positions)
right_ax.set_yticklabels(top_cat_target.index, fontsize=11)
right_ax.set_xlabel("Cramér's V", fontsize=11)
right_ax.set_title("Top 15 Categorical Features - Association with Target", fontsize=12, fontweight='bold')
right_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("SUMMARY: Features Most Important for Predicting Target")
print("="*80)
best_numeric = numeric_target_df.iloc[0]
print(f"Top numeric feature: {best_numeric['Feature']} (corr = {best_numeric['Correlation with Target']:.4f})")
print(f"Top categorical feature: {cramers_target.index[0]} (Cramér's V = {cramers_target.iloc[0]:.4f})")

# --- Feature drop suggestions ---
# Numeric: the second member of every highly correlated pair.
# Categorical: anything whose Cramér's V with the target is below 0.05.
drop_numeric = [pair[1] for pair in high_corr_pairs]
drop_categorical = cramers_target.index[cramers_target < 0.05].tolist()

print("\nSuggested numeric features to drop (highly correlated):", drop_numeric)
print("Suggested categorical features to drop (low association with target):", drop_categorical)


##  Outlier removal and Feature Drop

In [None]:
# --- Final Cleaned Feature List & Outlier Removal ---

# --- Numeric and categorical columns ---
numeric_cols = ['Age', 'Number of the latest pregnancy', 'PHQ9 Score', 'EPDS Score']
target_col = 'EPDS Result'
# Serial/ID-like columns are excluded from the categorical set, exactly as in
# the correlation analysis; otherwise a row identifier would be scored (and
# possibly kept) as if it were a categorical feature.
id_like_cols = [c for c in df_proc.columns if c.lower() in ['sr', 's.no', 'serial', 'id']]
cat_cols = [c for c in df_proc.columns if c not in numeric_cols + [target_col] + id_like_cols]

# --- Step 1: Drop highly correlated numeric features ---
corr_num = df_proc[numeric_cols].corr()
threshold = 0.9
drop_numeric = []

for i, kept_candidate in enumerate(numeric_cols):
    # A column that is already being dropped must not cause further drops
    if kept_candidate in drop_numeric:
        continue
    for j in range(i + 1, len(numeric_cols)):
        later_col = numeric_cols[j]
        # Drop the later column of a correlated pair, and list it only once
        if abs(corr_num.iloc[i, j]) > threshold and later_col not in drop_numeric:
            drop_numeric.append(later_col)

final_numeric = [c for c in numeric_cols if c not in drop_numeric]

# --- Step 2: Drop low-importance categorical features (Cramér's V with target) ---
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Builds the contingency table of ``x`` against ``y`` with ``pd.crosstab``,
    computes the chi-square statistic without continuity correction, and
    applies the bias correction to both phi² and the table dimensions.

    Returns a Python ``float``. The result is ``0.0`` whenever the table is
    empty, has a single row or column, or the corrected denominator is not
    positive, so the function never divides by zero.
    """
    table = pd.crosstab(x, y)
    total = table.values.sum()
    n_rows, n_cols = table.shape

    # Degenerate tables carry no association information.
    if total == 0 or n_rows == 1 or n_cols == 1:
        return 0.0

    chi2_stat = chi2_contingency(table, correction=False)[0]
    phi2 = chi2_stat / total

    if total > 1:
        dof_scale = total - 1
        phi2_adj = max(0.0, phi2 - ((n_cols - 1) * (n_rows - 1)) / dof_scale)
        rows_adj = n_rows - ((n_rows - 1) ** 2) / dof_scale
        cols_adj = n_cols - ((n_cols - 1) ** 2) / dof_scale
    else:
        # A single observation cannot be bias-corrected.
        phi2_adj, rows_adj, cols_adj = 0.0, n_rows, n_cols

    smaller_dim = min(cols_adj - 1, rows_adj - 1)
    if smaller_dim <= 0:
        return 0.0
    return float(np.sqrt(phi2_adj / smaller_dim))

# Cramér's V of every categorical candidate against the target, strongest first.
# dtype=float keeps the threshold comparison well-defined even if cat_cols is empty.
cat_corr = pd.Series(
    {c: cramers_v(df_proc[c], df_proc[target_col]) for c in cat_cols},
    dtype=float,
).sort_values(ascending=False)
drop_categorical = list(cat_corr[cat_corr < 0.05].index)
final_categorical = [c for c in cat_cols if c not in drop_categorical]

# --- Step 3: Remove outliers from numeric columns using IQR ---
# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous columns.
df_clean = df_proc.copy()
for col in final_numeric:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    if pd.isna(IQR) or IQR == 0:
        # With a zero IQR both fences collapse onto Q1, so every row that
        # differs from that single value would be deleted as an "outlier";
        # with a NaN IQR every row would be deleted. Neither is meaningful.
        print(f"Skipped outlier removal for '{col}': IQR is {IQR}")
        continue
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    before_count = df_clean.shape[0]
    # between() is inclusive on both ends; rows with NaN in `col` evaluate to
    # False and are dropped as well (same as the explicit >= / <= comparison).
    df_clean = df_clean[df_clean[col].between(lower, upper)]
    after_count = df_clean.shape[0]
    if before_count != after_count:
        print(f"Removed {before_count - after_count} outliers from '{col}'")

# --- Step 4: Generate final cleaned feature set ---
X_final = df_clean[final_numeric + final_categorical]
y_final = df_clean[target_col]



print("\n--- Final Cleaned Feature List ---")
print("Dropped numeric features (highly correlated):", drop_numeric)
print("Dropped categorical features (low association with target):", drop_categorical)
print(f"Remaining numeric features ({len(final_numeric)}):", final_numeric)
print(f"Remaining categorical features ({len(final_categorical)}):", final_categorical)
print(f"Final cleaned dataset shape: {X_final.shape}")

display(X_final.head())


## Model training plan
Train 6 ML models (with tuned hyperparameters):
- Logistic Regression (Multinomial)
- Random Forest
- Gradient Boosting (XGBoost)
- Support Vector Machine (SVM, RBF kernel)
- k-Nearest Neighbors (KNN)
- Decision Tree

We will train on the preprocessed, encoded, and scaled training data and evaluate on the test set using Accuracy, Precision, Recall, and F1-score.


In [None]:
# --- Data Preparation ---
# Work on copies of the cleaned, feature-selected data (X_final / y_final)
X_data, y_data = X_final.copy(), y_final.copy()

# Every column of X_final that is not a retained numeric feature gets one-hot encoded
final_cat_cols = [col for col in X_final.columns if col not in final_numeric]

# Stratified 80/20 train-test split
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

# One-hot encode categorical features
if final_cat_cols:
    # The encoder is fitted on the training split only; handle_unknown='ignore'
    # lets the test split contain categories the encoder has never seen.
    ohe_final = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_train_cat_enc = ohe_final.fit_transform(X_train_ml[final_cat_cols])
    X_test_cat_enc = ohe_final.transform(X_test_ml[final_cat_cols])

    # Column names of the encoded matrix
    cat_encoded_cols = ohe_final.get_feature_names_out(final_cat_cols)

    X_train_cat_df = pd.DataFrame(X_train_cat_enc, columns=cat_encoded_cols)
    X_test_cat_df = pd.DataFrame(X_test_cat_enc, columns=cat_encoded_cols)

    # The encoded frames carry a fresh 0..n-1 index, so the numeric parts are
    # re-indexed the same way before the column-wise concat.
    X_train_ml_numeric = X_train_ml[final_numeric].reset_index(drop=True)
    X_test_ml_numeric = X_test_ml[final_numeric].reset_index(drop=True)

    # Numeric columns first, encoded categorical columns after
    X_train_ml = pd.concat([X_train_ml_numeric, X_train_cat_df], axis=1)
    X_test_ml = pd.concat([X_test_ml_numeric, X_test_cat_df], axis=1)

# Scale numeric features (statistics come from the training split only)
scaler = StandardScaler().fit(X_train_ml[final_numeric])
X_train_ml[final_numeric] = scaler.transform(X_train_ml[final_numeric])
X_test_ml[final_numeric] = scaler.transform(X_test_ml[final_numeric])

# Encode target labels as integers, using the classes found in the training split
le = LabelEncoder().fit(y_train_ml)
y_train_enc = le.transform(y_train_ml)
y_test_enc = le.transform(y_test_ml)
n_classes = len(le.classes_)

print(f"Training set shape: {X_train_ml.shape}")
print(f"Test set shape: {X_test_ml.shape}")
print(f"Number of classes: {n_classes}")
print(f"Classes: {le.classes_}")

# --- Define 6 ML Models ---
# All hyperparameters are fixed here; XGBoost is included only when the
# optional import in the setup cell succeeded (has_xgb).
models = {
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5 and scheduled for removal, and with the lbfgs solver a
    # multinomial (softmax) model is already what is fitted for 3+ classes.
    # For a binary target this fits an ordinary logistic regression.
    "Logistic Regression (Multinomial)": LogisticRegression(
        max_iter=1000, random_state=42, solver='lbfgs'
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, max_depth=20, random_state=42, n_jobs=-1
    ),
    "Gradient Boosting (XGBoost)": xgb.XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42,
        eval_metric='mlogloss', use_label_encoder=False, verbosity=0
    ) if has_xgb else None,
    # probability=True is required so that predict_proba is available later
    "Support Vector Machine (SVM)": SVC(
        kernel='rbf', C=1.0, probability=True, random_state=42
    ),
    "k-Nearest Neighbors (KNN)": KNeighborsClassifier(
        n_neighbors=7, weights='distance', n_jobs=-1
    ),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=20, min_samples_split=5, random_state=42
    )
}

# Remove None models if XGBoost not available
models = {k: v for k, v in models.items() if v is not None}

# --- Train models and collect results ---
# results_list holds one metrics dict per successfully evaluated model;
# trained_models maps model name -> fitted estimator for the same models.
results_list = []
trained_models = {}

for model_name, model in models.items():
    print(f"\nTraining: {model_name}...")

    try:
        # Train
        model.fit(X_train_ml, y_train_enc)

        # Predict
        y_pred = model.predict(X_test_ml)

        # Compute metrics (weighted average over classes)
        accuracy = accuracy_score(y_test_enc, y_pred)
        precision = precision_score(y_test_enc, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test_enc, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test_enc, y_pred, average='weighted', zero_division=0)

        # ROC-AUC: positive-class probability for binary targets, one-vs-rest
        # otherwise. roc_auc_score has no `zero_division` argument; passing it
        # raised a TypeError that was silently swallowed, so the multiclass
        # score was always NaN.
        try:
            y_proba = model.predict_proba(X_test_ml)
            if n_classes == 2:
                roc_auc = roc_auc_score(y_test_enc, y_proba[:, 1])
            else:
                roc_auc = roc_auc_score(y_test_enc, y_proba, multi_class='ovr')
        except (AttributeError, ValueError) as auc_err:
            # No predict_proba on the estimator, or the score is undefined for
            # this split (e.g. a class missing from y_test). Best effort only.
            roc_auc = np.nan
            print(f"  ROC-AUC unavailable for {model_name}: {auc_err}")

        results_list.append({
            "Model": model_name,
            "Accuracy": round(accuracy, 4),
            "Precision": round(precision, 4),
            "Recall": round(recall, 4),
            "F1-Score": round(f1, 4),
            "ROC-AUC": round(roc_auc, 4)
        })
        # Register the model only once its metrics exist, so every entry of
        # trained_models has a matching row in the results table.
        trained_models[model_name] = model

        print(f"✓ {model_name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

    except Exception as e:
        # One failing model must not abort the comparison of the others.
        print(f"✗ Error training {model_name}: {str(e)}")

print("\n" + "="*80)
print("Model training complete!")
# --- Create Comparison Table ---
# Without any successful model the frame would have no columns and the sort
# below would fail with an opaque KeyError; fail with a clear message instead.
if not results_list:
    raise RuntimeError(
        "No model was trained successfully - see the error messages printed above."
    )
results_df = pd.DataFrame(results_list)

# Sort by F1-Score (descending); the index is reset so row 0 is the best model
results_df = results_df.sort_values(by="F1-Score", ascending=False).reset_index(drop=True)

print("\n" + "="*80)
print("MODEL COMPARISON TABLE")
print("="*80)
display(results_df)

# --- CONFUSION MATRICES FOR EACH MODEL ---
print("\n" + "="*80)
print("CONFUSION MATRICES - Individual Models")
print("="*80)

n_models = len(trained_models)
confusion_matrices = {}

if n_models == 0:
    # plt.subplots cannot create a grid with zero rows.
    print("No trained models available - skipping confusion matrices.")
else:
    n_rows = (n_models + 1) // 2
    # squeeze=False always yields a 2-D array of Axes, so flattening works for
    # any model count (a single model used to produce a list wrapping an array).
    fig, axes = plt.subplots(n_rows, 2, figsize=(16, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Fix the label set so the matrix always matches the tick labels, even
    # when a class is absent from both y_test and the predictions.
    class_ids = np.arange(len(le.classes_))
    f1_by_model = dict(zip(results_df['Model'], results_df['F1-Score']))

    for idx, (model_name, model) in enumerate(trained_models.items()):
        y_pred = model.predict(X_test_ml)
        cm = confusion_matrix(y_test_enc, y_pred, labels=class_ids)
        confusion_matrices[model_name] = cm

        ax = axes[idx]
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=le.classes_, yticklabels=le.classes_,
                    cbar_kws={'label': 'Count'})
        # NaN is shown if the model has no row in the results table.
        model_f1 = f1_by_model.get(model_name, float('nan'))
        ax.set_title(f"Confusion Matrix: {model_name}\n(F1-Score: {model_f1:.4f})")
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    # Remove the unused subplot when the number of models is odd
    for spare_ax in axes[n_models:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()

# --- DETAILED CLASSIFICATION REPORTS ---
print("\n" + "="*80)
print("DETAILED CLASSIFICATION REPORTS - Per Model")
print("="*80)

# Passing the full label set keeps `target_names` aligned with the classes even
# if one of them is missing from the test split and the predictions (without
# it, classification_report raises on the length mismatch). Names are
# stringified because the report formatter expects text labels.
report_labels = np.arange(len(le.classes_))
report_names = [str(cls) for cls in le.classes_]

for model_name, model in trained_models.items():
    y_pred = model.predict(X_test_ml)
    print(f"\n{model_name}:")
    print("-" * 80)
    print(classification_report(
        y_test_enc, y_pred,
        labels=report_labels, target_names=report_names, digits=4
    ))

print("\n" + "="*80)

# Save to CSV
comparison_csv = OUTPUT_FOLDER / 'model_comparison_table.csv'
results_df.to_csv(comparison_csv, index=False)
print("\n✓ Saved model comparison table to: model_comparison_table.csv")

# Additional summary statistics
# results_df is sorted by F1-Score descending, so the first row is the best model.
best_row = results_df.iloc[0]
print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Best Model (by F1-Score): {best_row['Model']}")
print(f"Best F1-Score: {best_row['F1-Score']}")
print(f"\nAverage Metrics Across All Models:")
# (padded label, results_df column) pairs; the padding aligns the printed values
for padded_label, metric_col in (
    ("Accuracy:  ", "Accuracy"),
    ("Precision: ", "Precision"),
    ("Recall:    ", "Recall"),
    ("F1-Score:  ", "F1-Score"),
):
    print(f"  {padded_label}{results_df[metric_col].mean():.4f}")


## Saving outputs

In [None]:
# Use the OUTPUT_FOLDER defined in the setup cell
print(f"Saving datasets to: {OUTPUT_FOLDER}")

# --- Save processed training/test datasets ---
# Feature matrices from model training (already encoded and scaled)
for file_name, frame in (
    ('X_train_processed.csv', X_train_ml),
    ('X_test_processed.csv', X_test_ml),
):
    frame.to_csv(OUTPUT_FOLDER / file_name, index=False)

# Target labels are written in their original (un-encoded) form
for file_name, labels in (
    ('y_train_processed.csv', y_train_ml),
    ('y_test_processed.csv', y_test_ml),
):
    pd.Series(labels, name='EPDS Result').to_csv(OUTPUT_FOLDER / file_name, index=False)

print("✓ Saved processed training/test sets:")
print(f"  - X_train_processed.csv ({X_train_ml.shape})")
print(f"  - X_test_processed.csv ({X_test_ml.shape})")
print(f"  - y_train_processed.csv ({len(y_train_ml)} samples)")
print(f"  - y_test_processed.csv ({len(y_test_ml)} samples)")

# --- Save feature information ---
# A column is 'Numeric' when it is one of the retained numeric features;
# everything else in the training matrix came out of the one-hot encoder.
numeric_names = set(final_numeric)
feature_types = [
    'Numeric' if col in numeric_names else 'Categorical (One-Hot Encoded)'
    for col in X_train_ml.columns
]
feature_info = pd.DataFrame({'Feature': X_train_ml.columns, 'Type': feature_types})
feature_info.to_csv(OUTPUT_FOLDER / 'feature_info.csv', index=False)
print(f"\n✓ Saved feature information: feature_info.csv ({len(feature_info)} features)")

# --- Save model comparison results ---
# results_df was already written to model_comparison_table.csv earlier
top_model = results_df.iloc[0]
print(f"\n✓ Model comparison table already saved: model_comparison_table.csv")
print(f"  Best Model: {top_model['Model']}")
print(f"  Best F1-Score: {top_model['F1-Score']}")

# --- Save metadata ---
# Every value is stored as a string so the CSV is a flat key/value table.
metadata = {
    'Dataset': 'PPD_dataset_v2.csv',
    'Original Shape': str(df.shape),
    'After Preprocessing': str(df_proc.shape),
    'After Feature Selection': str(X_final.shape),
    'Final Train Set Shape': str(X_train_ml.shape),
    'Final Test Set Shape': str(X_test_ml.shape),
    'Target Variable': 'EPDS Result',
    'Target Classes': str(list(le.classes_)),
    'Train/Test Split': '80/20 (Stratified)',
    'Numeric Features': str(final_numeric),
    'Categorical Features Dropped': str(drop_categorical),
    'Numeric Features Dropped': str(drop_numeric),
}

metadata_df = pd.DataFrame({
    'Key': list(metadata.keys()),
    'Value': list(metadata.values()),
})
metadata_df.to_csv(OUTPUT_FOLDER / 'preprocessing_metadata.csv', index=False)
print(f"\n✓ Saved preprocessing metadata: preprocessing_metadata.csv")

print("\n" + "="*80)
print("ALL OUTPUTS SAVED SUCCESSFULLY!")


# Pipeline Summary & Results

In [None]:
# --- DYNAMIC PIPELINE SUMMARY ---
# Builds a markdown report from the objects created by the previous cells and
# renders it in the notebook. Fragments are collected in a list and joined once
# instead of growing a string with repeated `+=`.
from IPython.display import Markdown, display

summary_parts = []

# --- Data Processing Pipeline ---
summary_parts += [
    "#### Data Processing Pipeline\n",
    f"- **Original Dataset**: {df.shape[0]} samples × {df.shape[1]} features\n",
    f"- **After Preprocessing**: {df_proc.shape[0]} samples × {df_proc.shape[1]} features (missing values imputed)\n",
    f"- **After Feature Selection**: {X_final.shape[0]} samples × {X_final.shape[1]} features (highly correlated features dropped)\n",
    f"- **Final Train Set**: {X_train_ml.shape[0]} samples × {X_train_ml.shape[1]} features (encoded & scaled)\n",
    f"- **Final Test Set**: {X_test_ml.shape[0]} samples × {X_test_ml.shape[1]} features (encoded & scaled)\n\n",
]

# --- Model Rankings ---
summary_parts += [
    "#### ML Models Trained & Ranked by F1-Score\n\n",
    "| Rank | Model | Accuracy | Precision | Recall | F1-Score |\n",
    "|------|-------|----------|-----------|--------|----------|\n",
]
# The rank is the row position in the sorted table, so it does not depend on
# results_df carrying a 0..n-1 index.
for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
    summary_parts.append(
        f"| {rank} | {row['Model']} | {row['Accuracy']:.4f} | {row['Precision']:.4f} | {row['Recall']:.4f} | {row['F1-Score']:.4f} |\n"
    )

# --- Overall Statistics ---
best_row = results_df.iloc[0]
summary_parts += [
    "\n**Overall Statistics**:\n",
    f"- Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%\n",
    f"- Average Precision: {results_df['Precision'].mean():.4f}\n",
    f"- Average Recall: {results_df['Recall'].mean():.4f}\n",
    f"- Average F1-Score: {results_df['F1-Score'].mean():.4f}\n",
    f"- Best Model: **{best_row['Model']}** (F1: {best_row['F1-Score']:.4f})\n\n",
]

# --- Output Files Saved ---
summary_parts += [
    f"#### Output Files Saved to: `{OUTPUT_FOLDER}`\n\n",
    "| File | Shape/Details | Purpose |\n",
    "|------|---|----------|\n",
    f"| `X_train_processed.csv` | {X_train_ml.shape} | Processed training features (encoded & scaled) |\n",
    f"| `X_test_processed.csv` | {X_test_ml.shape} | Processed test features (encoded & scaled) |\n",
    f"| `y_train_processed.csv` | {len(y_train_ml)} samples | Training target labels (EPDS Result) |\n",
    f"| `y_test_processed.csv` | {len(y_test_ml)} samples | Test target labels (EPDS Result) |\n",
    f"| `model_comparison_table.csv` | {len(results_df)} models | Model performance metrics |\n",
    f"| `feature_info.csv` | {X_train_ml.shape[1]} features | Feature names and types |\n",
    f"| `preprocessing_metadata.csv` | Key-value pairs | Pipeline configuration & details |\n\n",
]

# --- Target Variable ---
# Class labels are stringified before joining: str.join raises TypeError on
# non-string labels (e.g. integer-coded classes).
class_names = ', '.join(str(cls) for cls in le.classes_)
summary_parts += [
    "#### Target Variable\n",
    f"- **Name**: EPDS Result (Edinburgh Postpartum Depression Scale)\n",
    f"- **Classes**: {class_names}\n",
    f"- **Class Distribution**: {dict(Counter(y_final))}\n",
    f"- **Class Balance**: Stratified train/test split (80/20)\n",
    f"- **Prediction Task**: Multi-class classification of postpartum depression risk\n\n",
]

# --- Features Summary ---
summary_parts += [
    "#### Feature Summary\n",
    f"- **Numeric Features Used**: {', '.join(final_numeric)}\n",
    f"- **Numeric Features Dropped**: {drop_numeric if drop_numeric else 'None'}\n",
    f"- **Categorical Features Used**: {len(final_categorical)} features\n",
    f"- **Categorical Features Dropped**: {len(drop_categorical)} features (low correlation with target)\n\n",
]

summary_md = "".join(summary_parts)

# Display as markdown
display(Markdown(summary_md))

