# Sleep Disorder Classification v2 — Research-Grade Pipeline

**Key fixes from v1**: Deduplication, class weights (no SMOTE), Stratified 5-Fold CV, improved architectures, XGBoost added.

> Set runtime to **T4 GPU**: Runtime → Change runtime type

In [None]:
# 1. Setup
!pip install -q optuna xgboost seaborn

import torch, os, glob, joblib, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, confusion_matrix, classification_report)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
warnings.filterwarnings('ignore')

# Probe CUDA once and derive every hardware-dependent setting from that answer.
_cuda_ok = torch.cuda.is_available()

DEVICE = torch.device('cuda' if _cuda_ok else 'cpu')
USE_AMP = _cuda_ok                    # mixed precision is only enabled on GPU
BATCH_SIZE = 256 if _cuda_ok else 64  # larger batches when a GPU is present

print(f'Device: {DEVICE}')
if _cuda_ok:
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# 2. Mount Drive & Upload CSV
from google.colab import drive
drive.mount('/content/drive')

# All artifacts (label encoder, checkpoints, figures) are written under SAVE_DIR on Drive.
SAVE_DIR = '/content/drive/MyDrive/Sleep_Disorder_Project'
CKPT_DIR = os.path.join(SAVE_DIR, 'checkpoints_v2')
os.makedirs(CKPT_DIR, exist_ok=True)

# Resolve the dataset location in order: working directory → Drive → manual upload.
csv_path = 'sleep_dataset.csv'
if not os.path.exists(csv_path):
    # Check Drive first
    drive_csv = os.path.join(SAVE_DIR, 'sleep_dataset.csv')
    if os.path.exists(drive_csv):
        csv_path = drive_csv
    else:
        from google.colab import files
        print('Upload sleep_dataset.csv:')
        uploaded = files.upload()
        if not uploaded:
            # Fail here with a clear message instead of letting a later cell
            # trip over a path that does not exist.
            raise FileNotFoundError(
                'No file was uploaded; sleep_dataset.csv is required to continue.')
        # If several files were uploaded, the last one wins.
        csv_path = list(uploaded)[-1]

print(f'Using CSV: {csv_path}')

In [None]:
# 3. Load, DEDUPLICATE, & Preprocess
df = pd.read_csv(csv_path)
print(f'Raw dataset: {len(df)} rows')
print(f'Columns: {list(df.columns)}')

# CRITICAL FIX: Remove duplicates
# 'Person ID' is dropped first — presumably a per-row identifier that would
# otherwise make every row look distinct (TODO confirm against the dataset).
if 'Person ID' in df.columns:
    df = df.drop(columns=['Person ID'])
df = df.drop_duplicates()
print(f'After deduplication: {len(df)} unique rows')

# Handle target: a missing value becomes the explicit class 'None'.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('None')
print(f'\nClass distribution:')
print(df['Sleep Disorder'].value_counts())

# Feature engineering
if 'Blood Pressure' in df.columns:
    # Split the 'systolic/diastolic' string into two float columns.
    df[['Systolic_BP', 'Diastolic_BP']] = df['Blood Pressure'].str.split('/', expand=True).astype(float)
    df = df.drop(columns=['Blood Pressure'])
if 'BMI Category' in df.columns:
    # Merge the two spellings of the same category.
    df['BMI Category'] = df['BMI Category'].replace({'Normal Weight': 'Normal'})

# Encode target
le = LabelEncoder()
target = 'Sleep Disorder'
y = le.fit_transform(df[target])
print(f'\nEncoded classes: {dict(zip(le.classes_, range(len(le.classes_))))}')

# Preprocess features: every column that is neither categorical nor the target
# is treated as numeric and standardized; categoricals are one-hot encoded.
cat_cols = ['Gender', 'Occupation', 'BMI Category']
num_cols = [c for c in df.columns if c not in cat_cols + [target]]
preprocessor = ColumnTransformer([
    ('num', Pipeline([('scaler', StandardScaler())]), num_cols),
    ('cat', Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), cat_cols)
])
# NOTE(review): the scaler/encoder are fit on the full dataset here, before the
# cross-validation split, so each test fold contributes to the scaling
# statistics. Fitting per fold would remove that leakage.
X = preprocessor.fit_transform(df.drop(columns=[target]))
print(f'Feature matrix: {X.shape}')

# Save label encoder
joblib.dump(le, os.path.join(SAVE_DIR, 'label_encoder_v2.joblib'))
# Save the fitted preprocessor as well: the saved models expect inputs in this
# transformed feature space, so it is needed to run them on new rows.
joblib.dump(preprocessor, os.path.join(SAVE_DIR, 'preprocessor_v2.joblib'))
print('Done!')

In [None]:
# 4. Utility Functions

class SleepDataset(Dataset):
    """In-memory dataset pairing a float32 feature tensor with int64 labels."""

    def __init__(self, X, y):
        # Convert once up front so item access is a plain tensor index.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair at position ``i``."""
        features = self.X[i]
        label = self.y[i]
        return features, label

def plot_cm(model_name, y_true, y_pred, class_names, fold, split, ax):
    """Draw a confusion-matrix heatmap on ``ax`` and return ``(accuracy, weighted F1)``.

    Args:
        model_name: Unused here; kept so existing call sites keep working.
        y_true: True labels, integer-encoded as ``0 .. len(class_names) - 1``.
        y_pred: Predicted labels in the same encoding.
        class_names: Display names, ordered by encoded label.
        fold: Zero-based fold index (shown as ``fold + 1`` in the title).
        split: Split name shown in the title, e.g. ``'Train'`` or ``'Test'``.
        ax: Matplotlib axis to draw on.

    Returns:
        Tuple ``(acc, f1)`` computed from ``y_true`` and ``y_pred``.
    """
    # Pin the label set so the matrix is always len(class_names) square. Without
    # it, a class missing from both y_true and y_pred shrinks the matrix and the
    # tick labels stop matching the rows/columns.
    labels = np.arange(len(class_names))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Fold {fold+1} {split}\nAcc:{acc:.3f} F1:{f1:.3f}', fontsize=10)
    ax.set_ylabel('True'); ax.set_xlabel('Pred')
    return acc, f1

def get_ckpt_ver(name, d):
    """Return the next free checkpoint version number for ``name`` in directory ``d``.

    Looks for files named ``<name>_v<N>.<ext>`` and returns ``max(N) + 1``,
    or ``1`` when no such file exists (including when ``d`` does not exist).

    Args:
        name: Checkpoint base name (e.g. ``'ANN'``).
        d: Directory to scan.

    Returns:
        The next version number as an ``int`` (always >= 1).
    """
    import re  # local import keeps this helper self-contained

    # Anchor on the full name so a name that itself contains '_v' is parsed
    # correctly, and files such as '<name>_variant_v9.pth' are ignored.
    version_re = re.compile(rf'{re.escape(name)}_v(\d+)\.')

    # Escape glob metacharacters so brackets etc. in the path or name are literal.
    candidates = glob.glob(os.path.join(glob.escape(d), f'{glob.escape(name)}_v*.*'))

    versions = []
    for path in candidates:
        match = version_re.match(os.path.basename(path))
        if match:
            versions.append(int(match.group(1)))
    return max(versions) + 1 if versions else 1

def save_ckpt(model, name, ckpt_dir, pytorch=False, meta=None):
    """Write ``model`` to ``ckpt_dir`` under the next free version number.

    PyTorch models are stored via ``torch.save`` as a ``.pth`` file holding the
    ``state_dict``; everything else is stored via ``joblib.dump`` as a
    ``.joblib`` file holding the model object. Both payloads also carry the
    version (``'v'``) and the metadata dict (``'meta'``).

    Args:
        model: Model to persist.
        name: Base name used in the file name.
        ckpt_dir: Target directory.
        pytorch: Select the PyTorch format when true.
        meta: Optional metadata; a falsy value is stored as ``{}``.

    Returns:
        Path of the file that was written.
    """
    ver = get_ckpt_ver(name, ckpt_dir)
    info = meta or {}
    extension = 'pth' if pytorch else 'joblib'
    path = os.path.join(ckpt_dir, f'{name}_v{ver}.{extension}')

    if pytorch:
        payload = {'state_dict': model.state_dict(), 'v': ver, 'meta': info}
        torch.save(payload, path)
    else:
        payload = {'model': model, 'v': ver, 'meta': info}
        joblib.dump(payload, path)

    print(f'  Checkpoint: {os.path.basename(path)}')
    return path

def compute_weights(y_train):
    """Return 'balanced' class weights for the classes present in ``y_train``.

    The result has one entry per distinct label in ``y_train``, in the order
    given by ``np.unique``.
    """
    present_classes = np.unique(y_train)
    return compute_class_weight('balanced', classes=present_classes, y=y_train)

print('Utilities loaded.')

In [None]:
# 5. Model Definitions

class ANN(nn.Module):
    """MLP classifier: three hidden stages (128 → 64 → 32) followed by a linear head.

    Every hidden stage is Linear → BatchNorm1d → LeakyReLU(0.1) → Dropout.

    Args:
        dim: Number of input features.
        nc: Number of output classes (size of the logit vector).
    """

    def __init__(self, dim, nc):
        super().__init__()

        def stage(n_in, n_out, p_drop):
            # One hidden stage; layers are created in the order they are applied.
            return [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out),
                    nn.LeakyReLU(0.1), nn.Dropout(p_drop)]

        layers = stage(dim, 128, 0.3) + stage(128, 64, 0.3) + stage(64, 32, 0.2)
        layers.append(nn.Linear(32, nc))
        # Kept as a single flat Sequential named `net`, so parameter keys stay
        # 'net.<index>.*'.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        return self.net(x)

class CNN(nn.Module):
    """1D CNN over the feature vector: two conv stages, global average pool, linear head.

    Args:
        dim: Number of input features. Not used when building the layers; the
            adaptive pooling makes the network independent of input length.
        nc: Number of output classes.
    """

    def __init__(self, dim, nc):
        super().__init__()

        conv_layers = []
        for c_in, c_out in ((1, 32), (32, 64)):
            conv_layers += [nn.Conv1d(c_in, c_out, kernel_size=3, padding=1),
                            nn.BatchNorm1d(c_out), nn.ReLU()]
        conv_layers.append(nn.AdaptiveAvgPool1d(1))
        self.features = nn.Sequential(*conv_layers)

        self.classifier = nn.Sequential(nn.Dropout(0.3), nn.Linear(64, nc))

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        as_signal = x.unsqueeze(1)        # insert the channel axis at position 1
        pooled = self.features(as_signal)
        flat = pooled.squeeze(-1)         # drop the length-1 axis left by the pool
        return self.classifier(flat)

def train_torch(model, X_tr, y_tr, X_te, y_te, class_weights, epochs=100, lr=0.001):
    """Train a PyTorch classifier with class-weighted cross-entropy, AMP and LR scheduling.

    After every epoch the model is scored (weighted F1) on ``X_te``/``y_te``;
    that score drives ``ReduceLROnPlateau`` and selects which epoch's weights
    are restored at the end.

    NOTE(review): because ``X_te``/``y_te`` steer both the LR schedule and the
    epoch selection, metrics later computed on that same set are optimistic.
    Pass a separate validation split here for unbiased estimates.

    Args:
        model: ``nn.Module`` to train; moved to ``DEVICE`` and modified in place.
        X_tr, y_tr: Training features and integer labels.
        X_te, y_te: Evaluation features and integer labels used for monitoring.
        class_weights: Per-class weights passed to ``CrossEntropyLoss``.
        epochs: Number of passes over the training data.
        lr: Adam learning rate.

    Returns:
        ``model`` loaded with the weights of its best-scoring epoch. If no epoch
        ran (``epochs <= 0``) the model is returned as it was.
    """
    model.to(DEVICE)
    w = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=w)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)
    scaler = GradScaler(enabled=USE_AMP)

    train_dl = DataLoader(SleepDataset(X_tr, y_tr), batch_size=BATCH_SIZE,
                          shuffle=True, pin_memory=USE_AMP)

    # The evaluation inputs never change, so build the tensor once, not per epoch.
    Xt = torch.tensor(X_te, dtype=torch.float32).to(DEVICE)

    # Start below the lowest possible F1 (0.0) so the first epoch is always
    # recorded; starting at 0 left best_state as None when F1 never rose above 0.
    best_f1, best_state = -1.0, None

    for ep in range(epochs):
        model.train()
        for Xb, yb in train_dl:
            Xb, yb = Xb.to(DEVICE, non_blocking=True), yb.to(DEVICE, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with autocast(enabled=USE_AMP):
                loss = criterion(model(Xb), yb)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Validate
        model.eval()
        with torch.no_grad():
            preds = model(Xt).argmax(1).cpu().numpy()
        vf1 = f1_score(y_te, preds, average='weighted')
        # The scheduler minimizes its metric, so feed it 1 - F1.
        scheduler.step(1 - vf1)

        if vf1 > best_f1:
            best_f1 = vf1
            # Clone: state_dict() returns references that later steps overwrite.
            best_state = {k: v.clone() for k, v in model.state_dict().items()}

        if (ep+1) % 25 == 0:
            print(f'    Ep {ep+1}/{epochs} | F1: {vf1:.4f} | Best: {best_f1:.4f}')

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def predict_torch(model, X):
    """Return predicted class indices for ``X`` as a NumPy array.

    Puts ``model`` in eval mode and runs a single forward pass over all of
    ``X`` on ``DEVICE`` with gradients disabled (autocast when ``USE_AMP``).
    """
    model.eval()
    with torch.no_grad():
        with autocast(enabled=USE_AMP):
            inputs = torch.tensor(X, dtype=torch.float32).to(DEVICE)
            logits = model(inputs)
    return logits.argmax(1).cpu().numpy()

print('Models defined.')

In [None]:
# 6. Stratified 5-Fold CV Training Pipeline

# Fold layout: stratified and shuffled, with a fixed seed so splits are reproducible.
N_FOLDS = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=N_FOLDS)

# Sizes the network constructors need.
n_classes = len(le.classes_)
input_dim = X.shape[1]

# Define all models
def get_models(cw_dict, cw_array):
    """Build a fresh, untrained instance of every model.

    Args:
        cw_dict: Class-weight mapping. Not used here: SVM and RF use
            ``class_weight='balanced'``, and XGBoost receives sample weights
            at fit time.
        cw_array: Class-weight array. Not used here either.

    Returns:
        Dict mapping model name to ``(model, is_pytorch)``.
    """
    # Models are created in the same order they appear in the returned dict.
    knn = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='minkowski', n_jobs=-1)
    svm = SVC(C=10.0, kernel='rbf', gamma='scale', class_weight='balanced')
    forest = RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5,
                                    class_weight='balanced', random_state=42, n_jobs=-1)
    # XGBoost sample weights will be handled separately
    booster = XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1,
                            subsample=0.8, colsample_bytree=0.8,
                            scale_pos_weight=1, eval_metric='mlogloss',
                            random_state=42, n_jobs=-1)
    mlp = ANN(input_dim, n_classes)
    convnet = CNN(input_dim, n_classes)

    registry = {}
    registry['KNN'] = (knn, False)
    registry['SVM'] = (svm, False)
    registry['RF'] = (forest, False)
    registry['XGBoost'] = (booster, False)
    registry['ANN'] = (mlp, True)
    registry['CNN'] = (convnet, True)
    return registry

print(f'Running {N_FOLDS}-Fold CV on {len(X)} samples, {input_dim} features, {n_classes} classes')

In [None]:
# 7. TRAIN ALL MODELS

import copy  # imported once here rather than inside the fold loop under two names

model_names = ['KNN', 'SVM', 'RF', 'XGBoost', 'ANN', 'CNN']
# Per-model lists of per-fold test metrics.
all_results = {m: {'acc': [], 'f1': [], 'prec': [], 'rec': []} for m in model_names}
best_models = {}
# Start below the lowest possible F1 (0.0) so every model records a best fold,
# even if all of its folds score exactly 0.
best_f1s = {m: -1.0 for m in model_names}

for m_name in model_names:
    print(f"\n{'='*60}")
    print(f"  {m_name} — Stratified {N_FOLDS}-Fold CV")
    print(f"{'='*60}")

    # Row 0: train confusion matrices, row 1: test confusion matrices; one column per fold.
    fig, axes = plt.subplots(2, N_FOLDS, figsize=(4*N_FOLDS, 8))
    fig.suptitle(f'{m_name} — Per-Fold Confusion Matrices', fontsize=14, fontweight='bold')

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_tr, X_te = X[train_idx], X[test_idx]
        y_tr, y_te = y[train_idx], y[test_idx]

        # Class weights are computed from the training part of the fold only.
        cw = compute_weights(y_tr)
        cw_dict = dict(zip(np.unique(y_tr), cw))

        # get_models() builds new instances on every call, so this model is
        # already fresh for the fold and needs no extra copy.
        model, is_pytorch = get_models(cw_dict, cw)[m_name]

        print(f'  Fold {fold+1}/{N_FOLDS} (train={len(y_tr)}, test={len(y_te)})')

        if is_pytorch:
            model = train_torch(model, X_tr, y_tr, X_te, y_te, cw, epochs=100, lr=0.001)
            train_preds = predict_torch(model, X_tr)
            test_preds = predict_torch(model, X_te)
        else:
            if m_name == 'XGBoost':
                # XGBoost uses sample_weight
                sw = np.array([cw_dict[yi] for yi in y_tr])
                model.fit(X_tr, y_tr, sample_weight=sw)
            else:
                model.fit(X_tr, y_tr)
            train_preds = model.predict(X_tr)
            test_preds = model.predict(X_te)

        # Metrics
        acc = accuracy_score(y_te, test_preds)
        f1 = f1_score(y_te, test_preds, average='weighted')
        prec = precision_score(y_te, test_preds, average='weighted')
        rec = recall_score(y_te, test_preds, average='weighted')
        all_results[m_name]['acc'].append(acc)
        all_results[m_name]['f1'].append(f1)
        all_results[m_name]['prec'].append(prec)
        all_results[m_name]['rec'].append(rec)

        print(f'    Test Acc: {acc:.4f} | F1: {f1:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f}')

        # Plot confusion matrices
        plot_cm(m_name, y_tr, train_preds, le.classes_, fold, 'Train', axes[0, fold])
        plot_cm(m_name, y_te, test_preds, le.classes_, fold, 'Test', axes[1, fold])

        # Save best model (highest test-fold weighted F1 so far for this model)
        if f1 > best_f1s[m_name]:
            best_f1s[m_name] = f1
            best_models[m_name] = copy.deepcopy(model)

    plt.tight_layout()
    cm_path = os.path.join(CKPT_DIR, f'{m_name}_confusion_v2.png')
    plt.savefig(cm_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f'  Saved: {cm_path}')

    # Save best model as checkpoint; meta holds "mean±std" across folds per metric.
    best = best_models[m_name]
    meta = {k: f'{np.mean(v):.4f}±{np.std(v):.4f}' for k, v in all_results[m_name].items()}
    save_ckpt(best, m_name, CKPT_DIR, pytorch=(m_name in ['ANN', 'CNN']), meta=meta)

    # Classification report for last fold
    print(f'\n  {m_name} Last Fold Report:')
    print(classification_report(y_te, test_preds, target_names=le.classes_))

In [None]:
# 8. Final Summary
print('\n' + '='*70)
# Use N_FOLDS so the header stays correct if the fold count is changed.
print(f'  FINAL RESULTS — Stratified {N_FOLDS}-Fold CV  (mean ± std)')
print('='*70)

# One row per model, each metric rendered as "mean ± std" over the folds.
summary = []
for m in model_names:
    r = all_results[m]
    summary.append({
        'Model': m,
        'Accuracy': f"{np.mean(r['acc']):.4f} ± {np.std(r['acc']):.4f}",
        'Precision': f"{np.mean(r['prec']):.4f} ± {np.std(r['prec']):.4f}",
        'Recall': f"{np.mean(r['rec']):.4f} ± {np.std(r['rec']):.4f}",
        'F1-Score': f"{np.mean(r['f1']):.4f} ± {np.std(r['f1']):.4f}",
    })

results_df = pd.DataFrame(summary).set_index('Model')
print(results_df.to_string())

# Paper comparison (baseline accuracies in percent)
print('\n--- Paper Baselines (GA-optimized) ---')
paper = {'KNN': 83.19, 'SVM': 92.04, 'RF': 91.15, 'ANN': 92.92}
for m, baseline in paper.items():
    fold_accs = all_results.get(m, {}).get('acc')
    if not fold_accs:
        # Model was not trained in this run; nothing to compare.
        continue
    our = np.mean(fold_accs) * 100
    diff = our - baseline
    status = '✅ BEAT' if diff > 0 else '❌ BELOW'
    print(f'  {m:8s}: Paper={baseline:.2f}%  Ours={our:.2f}%  ({diff:+.2f}%)  [{status}]')

In [None]:
# 9. Feature Importance (Random Forest)
if 'RF' in best_models:
    rf_model = best_models['RF']

    # Column order of X follows the ColumnTransformer: numeric columns first,
    # then the one-hot columns generated from the categorical features.
    ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
    ohe_names = list(ohe.get_feature_names_out(cat_cols))
    feat_names = num_cols + ohe_names

    importances = rf_model.feature_importances_
    # Indices of the (up to) 15 largest importances, in descending order.
    idx = np.argsort(importances)[::-1][:15]
    positions = range(len(idx))
    tick_labels = [feat_names[i] for i in idx]

    plt.figure(figsize=(10, 5))
    plt.bar(positions, importances[idx])
    plt.xticks(positions, tick_labels, rotation=45, ha='right')
    plt.title('Top 15 Feature Importances (Random Forest)')
    plt.tight_layout()
    plt.savefig(os.path.join(CKPT_DIR, 'feature_importance.png'), dpi=150)
    plt.show()

In [None]:
# 10. List Checkpoints
# Print every entry in the checkpoint directory with its size in KB, sorted by name.
print('\nSaved checkpoints:')
for f in sorted(os.listdir(CKPT_DIR)):
    full_path = os.path.join(CKPT_DIR, f)
    sz = os.path.getsize(full_path) / 1024
    print(f'  {f} ({sz:.1f} KB)')