# Data Processing - Health XAI Prediction
This notebook implements the preprocessing pipeline based on our EDA analysis. We use the `health` variable as our target for 5-class health prediction, remove 6 columns identified in EDA, and create the BMI feature.

**Key Changes from EDA:**
- Target: `health` (5-class ordinal: 1=Very Good to 5=Very Bad)  
- Remove: `cntry`, `hltprhc`, `hltprhb`, `hltprdi`, `height`, `weighta` (6 columns)
- Add: `BMI` (derived from height/weight before removal)

## Imports & Paths

In [1]:

import re
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Markdown
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

sns.set_theme(style='whitegrid')

# Project-relative locations: the raw ESS extract and the folder that receives
# every processed artefact (created on first run if it does not exist yet).
PROJECT_ROOT = Path('..')
RAW_PATH = PROJECT_ROOT.joinpath('data', 'raw', 'ess.csv')
PROCESSED_DIR = PROJECT_ROOT.joinpath('data', 'processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Modelling target, and the columns the EDA decided to drop
TARGET_COLUMN = 'health'
FEATURES_TO_REMOVE = ['cntry', 'hltprhc', 'hltprhb', 'hltprdi', 'height', 'weighta']

# Echo the configuration so the run is self-describing
print(f'Raw dataset: {RAW_PATH.resolve()}')
print(f'Target variable: {TARGET_COLUMN}')
print(f'Features to remove: {FEATURES_TO_REMOVE}')


Raw dataset: /Users/peter/Desktop/health_xai_prediction/data/raw/ess.csv
Target variable: health
Features to remove: ['cntry', 'hltprhc', 'hltprhb', 'hltprdi', 'height', 'weighta']


In [2]:

# Short human-readable labels for the features most often shown in reports.
FEATURE_ABBREVIATIONS = {
    "health": "Self-rated general health (target)",
    "bmi": "Body Mass Index (derived)",
    "flteeff": "Mental effort feelings",
    "fltdpr": "Depression feelings",
    "happy": "Happiness score",
}

# Full descriptions keyed by cleaned column name. load_raw_dataset looks these
# up when it writes the original -> cleaned name mapping to feature_names.csv;
# columns without an entry get an empty description.
FEATURE_DESCRIPTIONS = {
    "cntry": "Country code of respondent (ISO-2).",
    # En dash restored: the literal previously held mis-decoded UTF-8 bytes.
    "happy": "Self-rated happiness on a 0–10 scale.",
    "sclmeet": "Frequency of social meetings with friends, relatives, or colleagues.",
    "inprdsc": "Frequency of participation in organised social, religious, or community activities.",
    "health": "Self-rated general health (1 very good to 5 very bad).",
    "ctrlife": "Feeling of control over life (0 no control to 10 complete control).",
    "etfruit": "Frequency of fruit consumption.",
    "eatveg": "Frequency of vegetable consumption.",
    "dosprt": "Frequency of doing sports or physical exercise.",
    "cgtsmok": "Cigarette smoking status or frequency.",
    "alcfreq": "Alcohol consumption frequency.",
    "height": "Self-reported height in centimeters.",
    "weighta": "Self-reported weight in kilograms.",
    "fltdpr": "How often felt depressed in the last week.",
    "flteeff": "How often felt everything was an effort in the last week.",
    "slprl": "How often sleep was restless in the last week.",
    "wrhpp": "How often felt happy in the last week (reverse coded).",
    "fltlnl": "How often felt lonely in the last week.",
    "enjlf": "How often enjoyed life in the last week (reverse coded).",
    "fltsd": "How often felt sad in the last week.",
    "hltprhc": "Doctor diagnosed heart or circulation problems (1 yes 0 no).",
    "hltprhb": "Doctor diagnosed high blood pressure (1 yes 0 no).",
    "hltprdi": "Doctor diagnosed diabetes (1 yes 0 no).",
    "gndr": "Gender of respondent (1 male 2 female).",
    "paccnois": "Perceived noise problems in the local area (1 yes 0 no).",
}


def clean_column_name(name: str) -> str:
    """Return a lower-case, underscore-separated version of a column header.

    Surrounding whitespace is trimmed, the text is lower-cased, and every run
    of characters outside ``[0-9a-zA-Z]`` acts as a separator. The remaining
    non-empty pieces are joined with ``_``.
    """
    pieces = re.split(r"[^0-9a-zA-Z]+", name.strip().lower())
    # filter(None, ...) discards the empty strings produced by leading or
    # trailing separators.
    return "_".join(filter(None, pieces))


def load_raw_dataset(path: Path) -> pd.DataFrame:
    """Load the raw ESS CSV and apply the column-level preparation from the EDA.

    Steps, in order:

    1. Read the CSV, treating ``'NA'`` and empty strings as missing.
    2. Drop columns whose header starts with ``Unnamed`` or is blank.
    3. Normalise the headers with ``clean_column_name`` and write the
       original -> cleaned -> description mapping to
       ``PROCESSED_DIR / 'feature_names.csv'`` (a file-system side effect).
    4. Coerce every object-dtype column except ``cntry`` to numeric;
       values that cannot be parsed become NaN.
    5. Drop rows whose target (``TARGET_COLUMN``) is missing.
    6. Derive ``bmi = weighta / (height / 100) ** 2`` when both source
       columns exist; non-positive heights and infinite results become NaN.
    7. Drop the columns listed in ``FEATURES_TO_REMOVE`` that are present.

    Args:
        path: Location of the raw CSV file.

    Returns:
        The prepared DataFrame. The original row index is kept, so it has
        gaps wherever rows were dropped.

    Raises:
        ValueError: If ``TARGET_COLUMN`` is not among the cleaned column names.
    """
    print(f"Loading dataset from: {path}")
    df = pd.read_csv(path, na_values=['NA', ''])
    print(f"Initial shape: {df.shape}")

    # Remove unnamed columns (typically a stray index column written by to_csv)
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed') or not col.strip()]
    if unnamed_cols:
        df = df.drop(columns=unnamed_cols)
        print(f"Removed {len(unnamed_cols)} unnamed columns")

    # Clean column names
    df.columns = df.columns.str.strip()
    original_columns = df.columns.tolist()
    cleaned_columns = [clean_column_name(col) for col in original_columns]

    # Persist the name mapping so later notebooks can translate cleaned names
    mapping = pd.DataFrame({
        'original_name': original_columns,
        'cleaned_name': cleaned_columns,
        'description': [FEATURE_DESCRIPTIONS.get(col, '') for col in cleaned_columns]
    })
    mapping.to_csv(PROCESSED_DIR / 'feature_names.csv', index=False)
    df.columns = cleaned_columns

    # Check target variable
    if TARGET_COLUMN not in df.columns:
        raise ValueError(f"Target column '{TARGET_COLUMN}' not found!")

    # Convert object columns to numeric (except country). This must run BEFORE
    # the missing-target filter: coercion turns unparseable strings into NaN,
    # and a target that only becomes NaN here would otherwise slip through.
    for column in df.columns:
        if column == 'cntry':
            continue
        if df[column].dtype == object:
            df[column] = pd.to_numeric(df[column], errors='coerce')

    # Remove rows with missing target (including values coerced to NaN above)
    initial_rows = len(df)
    df = df.dropna(subset=[TARGET_COLUMN])
    removed_rows = initial_rows - len(df)
    if removed_rows > 0:
        print(f"Removed {removed_rows} rows with missing target values")

    # Create BMI feature before removing height/weight
    if {'height', 'weighta'}.issubset(df.columns):
        print("Creating BMI feature from height and weight...")
        # Height is divided by 100 (cm -> m); non-positive heights are masked
        # so they yield NaN instead of a division by zero.
        height_m = (df['height'] / 100.0).where(lambda s: s > 0, np.nan)
        bmi = df['weighta'] / np.square(height_m)
        bmi = bmi.replace([np.inf, -np.inf], np.nan)
        df['bmi'] = bmi
        valid_bmi = df['bmi'].notna().sum()
        print(f"Created BMI feature with {valid_bmi} valid values")

    # Remove columns identified in EDA analysis (only those actually present)
    cols_to_remove = [col for col in FEATURES_TO_REMOVE if col in df.columns]
    if cols_to_remove:
        df = df.drop(columns=cols_to_remove)
        print(f"Removed {len(cols_to_remove)} columns: {cols_to_remove}")

    print(f"Final shape after preprocessing: {df.shape}")
    return df


def get_feature_groups(df: pd.DataFrame, target: str) -> tuple[list[str], list[str]]:
    """Partition the non-target columns of ``df`` by dtype.

    Args:
        df: Frame holding predictors and (optionally) the target column.
        target: Column name to leave out of both groups.

    Returns:
        ``(numeric, categorical)``: numeric holds the predictor columns with a
        NumPy numeric dtype, categorical holds every other predictor. Both
        lists keep the column order of ``df``. The two groups are also printed.
    """
    predictors = [name for name in df.columns if name != target]
    numeric_names = list(df[predictors].select_dtypes(include=[np.number]).columns)
    numeric_lookup = set(numeric_names)
    categorical_names = [name for name in predictors if name not in numeric_lookup]
    print(f"Numeric features ({len(numeric_names)}): {numeric_names}")
    print(f"Categorical features ({len(categorical_names)}): {categorical_names}")
    return numeric_names, categorical_names


def handle_missing_values(df: pd.DataFrame, numeric: list[str], categorical: list[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled column by column.

    Numeric columns are filled with their own median; a column whose median is
    NaN (no observed values) is left untouched. Categorical columns are filled
    with their most frequent value, or the literal ``'unknown'`` when the
    column has no observed value at all. The input frame is not modified.
    """
    filled = df.copy()

    for name in numeric:
        centre = filled[name].median()
        if not pd.isna(centre):
            filled[name] = filled[name].fillna(centre)

    for name in categorical:
        modes = filled[name].mode(dropna=True)
        replacement = 'unknown' if modes.empty else modes.iloc[0]
        filled[name] = filled[name].fillna(replacement)

    return filled


def cap_outliers_iqr(df: pd.DataFrame, numeric: list[str], multiplier: float = 1.5) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric columns clipped to their IQR fences.

    For each listed column the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR``, computed from the non-missing values; values
    outside the fences are clipped to them. Missing values stay missing.

    Columns are left unchanged when they have no observed values or when their
    IQR is zero. With ``Q1 == Q3`` both fences collapse onto a single value, so
    clipping would overwrite every other value and turn the column into a
    constant. This happens for binary flags and heavily skewed ordinal items.

    Args:
        df: Input frame (not modified).
        numeric: Names of the columns to cap.
        multiplier: Fence width in units of IQR.

    Returns:
        A new DataFrame with the capped columns.
    """
    capped = df.copy()
    for column in numeric:
        observed = capped[column].dropna()
        if observed.empty:
            continue
        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Degenerate fences (lower == upper): skip instead of flattening
            # the column to a single value.
            continue
        lower = q1 - multiplier * iqr
        upper = q3 + multiplier * iqr
        capped[column] = capped[column].clip(lower=lower, upper=upper)
    return capped


## Load and Inspect Raw Data

In [3]:

# Run the loader on the raw CSV. Despite the name, df_raw is already
# column-cleaned by load_raw_dataset (renamed headers, bmi added,
# FEATURES_TO_REMOVE dropped); missing values are still present.
df_raw = load_raw_dataset(RAW_PATH)
# Preview the first rows, then list dtypes and non-null counts per column
display(df_raw.head())
df_raw.info()


Loading dataset from: ../data/raw/ess.csv
Initial shape: (42377, 26)
Removed 1 unnamed columns
Removed 38 rows with missing target values
Creating BMI feature from height and weight...
Created BMI feature with 42339 valid values
Removed 6 columns: ['cntry', 'hltprhc', 'hltprhb', 'hltprdi', 'height', 'weighta']
Final shape after preprocessing: (42339, 20)


Unnamed: 0,happy,sclmeet,inprdsc,health,ctrlife,etfruit,eatveg,dosprt,cgtsmok,alcfreq,fltdpr,flteeff,slprl,wrhpp,fltlnl,enjlf,fltsd,gndr,paccnois,bmi
0,8.0,4.0,1.0,3.0,8.0,3.0,3.0,3.0,4.0,3.0,1.0,1.0,1.0,3.0,1.0,3.0,1.0,1,0,28.405504
1,9.0,7.0,4.0,2.0,8.0,1.0,1.0,5.0,5.0,3.0,2.0,2.0,3.0,3.0,3.0,4.0,2.0,2,0,26.218821
2,9.0,4.0,4.0,1.0,9.0,4.0,3.0,3.0,1.0,4.0,2.0,2.0,3.0,3.0,1.0,3.0,1.0,2,0,29.320988
3,7.0,6.0,3.0,3.0,8.0,2.0,2.0,3.0,6.0,7.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2,0,25.099502
4,9.0,5.0,4.0,2.0,9.0,3.0,3.0,3.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,3.0,1.0,1,0,23.738662


<class 'pandas.core.frame.DataFrame'>
Index: 42339 entries, 0 to 42376
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   happy     42226 non-null  float64
 1   sclmeet   42271 non-null  float64
 2   inprdsc   42137 non-null  float64
 3   health    42339 non-null  float64
 4   ctrlife   42169 non-null  float64
 5   etfruit   42246 non-null  float64
 6   eatveg    42244 non-null  float64
 7   dosprt    41890 non-null  float64
 8   cgtsmok   42269 non-null  float64
 9   alcfreq   42146 non-null  float64
 10  fltdpr    42188 non-null  float64
 11  flteeff   42194 non-null  float64
 12  slprl     42227 non-null  float64
 13  wrhpp     42108 non-null  float64
 14  fltlnl    42174 non-null  float64
 15  enjlf     42080 non-null  float64
 16  fltsd     42174 non-null  float64
 17  gndr      42339 non-null  int64  
 18  paccnois  42339 non-null  int64  
 19  bmi       42339 non-null  float64
dtypes: float64(18), int64(2)
memory u

In [4]:
# Split the predictors into numeric / categorical groups.
# get_feature_groups already leaves the target out of both lists, so no
# further filtering is needed; the assertion pins that invariant.
numeric_features, categorical_features = get_feature_groups(df_raw, target=TARGET_COLUMN)

assert TARGET_COLUMN not in numeric_features + categorical_features, \
    "target column leaked into the feature lists"

print(f"Final numeric features for modeling: {len(numeric_features)}")
print(f"Final categorical features for modeling: {len(categorical_features)}")

Numeric features (19): ['happy', 'sclmeet', 'inprdsc', 'ctrlife', 'etfruit', 'eatveg', 'dosprt', 'cgtsmok', 'alcfreq', 'fltdpr', 'flteeff', 'slprl', 'wrhpp', 'fltlnl', 'enjlf', 'fltsd', 'gndr', 'paccnois', 'bmi']
Categorical features (0): []
Final numeric features for modeling: 19
Final categorical features for modeling: 0


In [5]:

# Fill missing values (median for numeric, mode for categorical), then cap
# numeric columns at their IQR fences. Both steps return new frames, so
# df_raw is left untouched.
# NOTE(review): the medians and IQR fences are computed over every row of
# df_raw, while the train/validation/test split only happens further down in
# this notebook. Validation and test rows therefore influence these
# statistics (potential leakage); consider computing them on the training
# split only.
df_clean = handle_missing_values(df_raw, numeric_features, categorical_features)
df_clean = cap_outliers_iqr(df_clean, numeric_features)
print('After cleaning:', df_clean.shape)


After cleaning: (42339, 20)


In [6]:
# Inspect how the target classes are distributed after cleaning
target_series = df_clean[TARGET_COLUMN]

print("Target variable distribution (health):")
class_counts = target_series.value_counts().sort_index()
print(class_counts)

print("\nTarget variable percentages:")
class_percentages = (
    target_series
    .value_counts(normalize=True)
    .mul(100)
    .sort_index()
    .round(2)
)
print(class_percentages)

# Human-readable names for the five ordinal health codes
health_labels = {
    1: "Very Good",
    2: "Good",
    3: "Fair",
    4: "Bad",
    5: "Very Bad",
}
print("\nHealth distribution by category:")
for health_val, count in class_counts.items():
    label = health_labels.get(health_val, f"Unknown({health_val})")
    pct = class_percentages[health_val]
    print(f"  {int(health_val)}: {label} - {count} samples ({pct}%)")

Target variable distribution (health):
health
1.0    10808
2.0    18052
3.0    10489
4.0     2526
5.0      464
Name: count, dtype: int64

Target variable percentages:
health
1.0    25.53
2.0    42.64
3.0    24.77
4.0     5.97
5.0     1.10
Name: proportion, dtype: float64

Health distribution by category:
  1: Very Good - 10808 samples (25.53%)
  2: Good - 18052 samples (42.64%)
  3: Fair - 10489 samples (24.77%)
  4: Bad - 2526 samples (5.97%)
  5: Very Bad - 464 samples (1.1%)


In [7]:
# Split configuration, named here so the numbers are not scattered as literals
RANDOM_STATE = 42              # seed shared by both splits for reproducibility
HOLDOUT_FRACTION = 0.30        # share of all rows kept out of training
TEST_SHARE_OF_HOLDOUT = 0.50   # half of the hold-out becomes the test set

# Build preprocessing pipeline: median imputation followed by standardisation
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Since we have no categorical features after preprocessing, only use numeric pipeline
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, numeric_features),
    ]
)

# Prepare features (X) and target (y) for modeling
X = df_clean.drop(columns=[TARGET_COLUMN])
y = df_clean[TARGET_COLUMN].astype(int)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {list(X.columns)}")
print(f"Target classes: {sorted(y.unique())}")

# Create train/validation/test splits. Both splits are stratified on the
# target so each subset keeps the overall class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=HOLDOUT_FRACTION,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=TEST_SHARE_OF_HOLDOUT,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

# The three subsets must partition the data: nothing lost, nothing shared
assert len(X_train) + len(X_val) + len(X_test) == len(X)
assert X_train.index.intersection(X_val.index).empty
assert X_train.index.intersection(X_test.index).empty
assert X_val.index.intersection(X_test.index).empty

print(f"\nSplit sizes:")
print(f"Train: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Validation: {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"Test: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Check class distribution in splits
print(f"\nClass distribution in training set:")
print(y_train.value_counts(normalize=True).mul(100).round(2).sort_index())

Features shape: (42339, 19)
Target shape: (42339,)
Feature columns: ['happy', 'sclmeet', 'inprdsc', 'ctrlife', 'etfruit', 'eatveg', 'dosprt', 'cgtsmok', 'alcfreq', 'fltdpr', 'flteeff', 'slprl', 'wrhpp', 'fltlnl', 'enjlf', 'fltsd', 'gndr', 'paccnois', 'bmi']
Target classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]

Split sizes:
Train: 29637 samples (70.0%)
Validation: 6351 samples (15.0%)
Test: 6351 samples (15.0%)

Class distribution in training set:
health
1    25.53
2    42.64
3    24.77
4     5.97
5     1.10
Name: proportion, dtype: float64


In [8]:
# Learn the imputation and scaling statistics from the training split only,
# then apply the fitted transformer unchanged to the held-out splits.
print("Fitting preprocessing pipeline on training data...")
X_train_processed = column_transformer.fit_transform(X_train)

X_val_processed, X_test_processed = (
    column_transformer.transform(split) for split in (X_val, X_test)
)

print(f'Training matrix shape: {X_train_processed.shape}')
print(f'Validation matrix shape: {X_val_processed.shape}')
print(f'Test matrix shape: {X_test_processed.shape}')

# Output column names as reported by the fitted transformer
feature_names = column_transformer.get_feature_names_out()
print(f'Number of features after preprocessing: {len(feature_names)}')
print(f'Feature names: {list(feature_names)[:10]}...')  # Show first 10

Fitting preprocessing pipeline on training data...
Training matrix shape: (29637, 19)
Validation matrix shape: (6351, 19)
Test matrix shape: (6351, 19)
Number of features after preprocessing: 19
Feature names: ['numeric__happy', 'numeric__sclmeet', 'numeric__inprdsc', 'numeric__ctrlife', 'numeric__etfruit', 'numeric__eatveg', 'numeric__dosprt', 'numeric__cgtsmok', 'numeric__alcfreq', 'numeric__fltdpr']...


## Summary

**Preprocessing Pipeline Completed Successfully! 🎯**

**Key Changes from EDA Analysis:**
- ✅ **Target Variable**: `health` (5-class ordinal: 1=Very Good to 5=Very Bad)
- ✅ **Features Removed**: 6 columns (`cntry`, `hltprhc`, `hltprhb`, `hltprdi`, `height`, `weighta`)
- ✅ **New Feature**: BMI derived from height/weight before removal
- ✅ **Final Dataset**: 42,339 samples × 19 features (all numerical)

**Data Splits:**
- **Training**: 29,637 samples (70%)
- **Validation**: 6,351 samples (15%) 
- **Test**: 6,351 samples (15%)

**Target Distribution** (maintained across splits):
- Very Good (1): 25.53%
- Good (2): 42.64% 
- Fair (3): 24.77%
- Bad (4): 5.97%
- Very Bad (5): 1.10%

**Preprocessing Applied:**
- ✅ Missing value imputation (median for numerical features)
- ✅ IQR-based outlier capping (1.5 × IQR fences) on numerical features
- ✅ Standard scaling for all features
- ✅ Stratified splits to maintain class balance
- ✅ Pipeline ready for machine learning models

**Next Steps:** Ready for baseline modeling and XAI analysis!

In [9]:
# Save processed datasets for modeling.
# pandas and joblib come from the notebook's import cell.

# Make sure the output directory exists (same flags as the config cell)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ColumnTransformer prefixes every output column with '<transformer name>__'.
# Strip only that leading prefix so a column that merely contains the text
# elsewhere in its name is left intact.
feature_names_clean = [name.removeprefix('numeric__') for name in feature_names]


def build_processed_frame(matrix, features: pd.DataFrame, target: pd.Series) -> pd.DataFrame:
    """Wrap a transformed matrix in a DataFrame and append the target column.

    The frame reuses the row index of ``features`` so that the index-aligned
    assignment of ``target`` attaches each label to the correct row.
    """
    frame = pd.DataFrame(matrix, columns=feature_names_clean, index=features.index)
    frame[TARGET_COLUMN] = target
    return frame


train_df = build_processed_frame(X_train_processed, X_train, y_train)
val_df = build_processed_frame(X_val_processed, X_val, y_val)
test_df = build_processed_frame(X_test_processed, X_test, y_test)

# Complete dataset, transformed with the statistics fitted on the training split
full_processed = build_processed_frame(column_transformer.transform(X), X, y)

# One CSV per frame; the row index is not written
for filename, frame in {
    'train.csv': train_df,
    'validation.csv': val_df,
    'test.csv': test_df,
    'health_clean.csv': full_processed,
}.items():
    frame.to_csv(PROCESSED_DIR / filename, index=False)

# Save the fitted preprocessor for later use
joblib.dump(column_transformer, PROCESSED_DIR / 'preprocessor.pkl')

print("✅ Saved processed datasets:")
print(f"   → Training: {PROCESSED_DIR / 'train.csv'} ({len(train_df)} samples)")
print(f"   → Validation: {PROCESSED_DIR / 'validation.csv'} ({len(val_df)} samples)")
print(f"   → Test: {PROCESSED_DIR / 'test.csv'} ({len(test_df)} samples)")
print(f"   → Full dataset: {PROCESSED_DIR / 'health_clean.csv'} ({len(full_processed)} samples)")
print(f"   → Preprocessor: {PROCESSED_DIR / 'preprocessor.pkl'}")
print("\n🎯 Ready for machine learning and XAI analysis!")

✅ Saved processed datasets:
   → Training: ../data/processed/train.csv (29637 samples)
   → Validation: ../data/processed/validation.csv (6351 samples)
   → Test: ../data/processed/test.csv (6351 samples)
   → Full dataset: ../data/processed/health_clean.csv (42339 samples)
   → Preprocessor: ../data/processed/preprocessor.pkl

🎯 Ready for machine learning and XAI analysis!
