In [2]:
# 🧹 Alzheimer's Preprocessing Notebook - Cross-sectional Clinical Data

# ====================
# 1. Imports
# ====================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import os

# ====================
# 2. Load Raw Dataset
# ====================
# Read the raw OASIS cross-sectional CSV into a DataFrame.
raw_csv_path = "../data/raw/clinical/oasis_cross-sectional.csv"
cross_sec = pd.read_csv(raw_csv_path)

# Trim leading/trailing whitespace from every header so later lookups by
# column name ('CDR', 'Group', 'ID') are not defeated by stray spaces.
cross_sec.columns = [header.strip() for header in cross_sec.columns]

# Derive a 'Group' label from the 'CDR' score, but only when the file has a
# 'CDR' column and does not already ship its own 'Group' column.
if 'CDR' in cross_sec.columns and 'Group' not in cross_sec.columns:
    # CDR score -> group label; any other non-null score maps to 'Unknown'.
    cdr_labels = {
        0.0: 'Nondemented',
        0.5: 'Very Mild Dementia',
        1.0: 'Mild Dementia',
        2.0: 'Moderate Dementia',
    }

    def map_group(cdr):
        """Return the group label for a single CDR value (null stays NaN)."""
        if pd.isnull(cdr):
            return np.nan
        return cdr_labels.get(cdr, 'Unknown')

    cross_sec['Group'] = cross_sec['CDR'].apply(map_group)

# ====================
# 3. Save Raw Merged
# ====================
# Snapshot the frame as loaded (plus the derived 'Group' column, if one was
# added) before any imputation, encoding or scaling touches it.
combined_dir = "../data/processed/clinical"
combined_csv = "../data/processed/clinical/1_combined.csv"

os.makedirs(combined_dir, exist_ok=True)  # no error if it already exists
cross_sec.to_csv(combined_csv, index=False)
print("✅ Saved: 1_combined.csv")

# ====================
# 4. Handle Missing Values
# ====================
# Separate numeric and categorical columns.
# These two lists are reused by the encoding and scaling steps further down.
num_cols = cross_sec.select_dtypes(include=np.number).columns.tolist()
cat_cols = cross_sec.select_dtypes(include='object').columns.tolist()

# NOTE(review): every numeric/object column is imputed here, which appears to
# include the 'CDR' score and the 'Group' label derived from it. Rows whose
# CDR was missing therefore end up with a filled-in label — confirm that is
# intended before using these columns as a target.

# Fill numeric nulls with the column mean. A column with no values at all has
# a NaN mean, so it is simply left unchanged.
cross_sec[num_cols] = cross_sec[num_cols].fillna(cross_sec[num_cols].mean())

# Fill categorical nulls with the most frequent value.
for col in cat_cols:
    modes = cross_sec[col].mode()
    if modes.empty:
        # Entirely-null column: mode() returns an empty Series, so there is
        # nothing to impute with. Skip it instead of failing on modes[0].
        continue
    cross_sec[col] = cross_sec[col].fillna(modes.iloc[0])

# Save cleaned version
cross_sec.to_csv("../data/processed/clinical/2_cleaned.csv", index=False)
print("✅ Saved: 2_cleaned.csv")

# ====================
# 5. Encode Categorical Columns
# ====================
# One LabelEncoder per categorical column, keyed by column name, so each
# fitted encoder stays available after the column has been replaced.
label_encoders = {col: LabelEncoder() for col in cat_cols}

# Overwrite each categorical column with the codes produced by its encoder.
for col, encoder in label_encoders.items():
    cross_sec[col] = encoder.fit_transform(cross_sec[col])

# ====================
# 6. Normalize Numerical Features
# ====================
# Min-max scale the numeric columns (the ones listed before encoding) using
# MinMaxScaler's default settings; the scaler is fitted on the same data it
# transforms.
scaler = MinMaxScaler()
scaler.fit(cross_sec[num_cols])
cross_sec[num_cols] = scaler.transform(cross_sec[num_cols])

# Save normalized data
normalized_csv = "../data/processed/clinical/3_normalized.csv"
cross_sec.to_csv(normalized_csv, index=False)
print("✅ Saved: 3_normalized.csv")

# ====================
# 7. Train-Test Split
# ====================
split_dir = "../data/processed/clinical/splits"
os.makedirs(split_dir, exist_ok=True)

# Use the 'ID' column if available, else fall back to the row index.
# The index must be used directly: passing it to cross_sec[...] would look
# its labels up as column names and raise KeyError.
# NOTE(review): if 'ID' was an object column it has already been
# label-encoded in step 5, so the values written below would be the encoded
# codes rather than the original identifiers — confirm this is intended.
id_col = 'ID'
if id_col in cross_sec.columns:
    ids = cross_sec[id_col]
else:
    ids = cross_sec.index.to_series()

# 80/20 split; the fixed random_state keeps the partition reproducible.
train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=42)

# Each split is written as a single-column CSV named 'ID'.
pd.DataFrame({'ID': train_ids}).to_csv(f"{split_dir}/train_ids.csv", index=False)
pd.DataFrame({'ID': test_ids}).to_csv(f"{split_dir}/test_ids.csv", index=False)

print("✅ Saved: train_ids.csv and test_ids.csv")


✅ Saved: 1_combined.csv
✅ Saved: 2_cleaned.csv
✅ Saved: 3_normalized.csv
✅ Saved: train_ids.csv and test_ids.csv
