In [14]:
# =========================
# Cell 1: Imports, config, load data
# =========================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

# Optional libraries (only used if installed)
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

TSNE_AVAILABLE = False
UMAP_AVAILABLE = False
try:
    from openTSNE import TSNE
    TSNE_AVAILABLE = True
except ImportError:
    pass

try:
    import umap
    UMAP_AVAILABLE = True
except ImportError:
    pass

# Seed NumPy and PyTorch so shuffles and weight initialisation are repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# -------- Load dataset --------
DATA_PATH = "MentalHealthSurvey.csv"  # change if your filename is different

# Fail early with an actionable message instead of pandas' generic error.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Could not find {DATA_PATH}. Put the CSV next to this notebook or update DATA_PATH."
    )

df = pd.read_csv(DATA_PATH)

# The CSV header contains names with stray whitespace (e.g. 'academic_workload '),
# which makes every later lookup by the clean name silently miss the column.
# Normalise the header once, right after loading.
df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

print("=== Basic dataset info ===")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nHead:")
print(df.head())

print("\n=== Missing values per column ===")
print(df.isna().sum())


=== Basic dataset info ===
Shape: (87, 21)
Columns: ['gender', 'age', 'university', 'degree_level', 'degree_major', 'academic_year', 'cgpa', 'residential_status', 'campus_discrimination', 'sports_engagement', 'average_sleep', 'study_satisfaction', 'academic_workload ', 'academic_pressure', 'financial_concerns', 'social_relationships', 'depression', 'anxiety', 'isolation', 'future_insecurity', 'stress_relief_activities']

Head:
   gender  age university   degree_level      degree_major academic_year  \
0    Male   20         PU  Undergraduate      Data Science      2nd year   
1    Male   20        UET   Postgraduate  Computer Science      3rd year   
2    Male   20       FAST  Undergraduate  Computer Science      3rd year   
3    Male   20        UET  Undergraduate  Computer Science      3rd year   
4  Female   20        UET  Undergraduate  Computer Science      3rd year   

      cgpa residential_status campus_discrimination sports_engagement  ...  \
0  3.0-3.5         Off-Campus     

In [None]:
# =========================
# Cell 2: Basic EDA
# =========================

# Print DataFrame.describe() summaries, first restricted to numeric columns ...
print("=== Descriptive statistics (numeric) ===")
print(df.describe(include=[np.number]))

# ... then restricted to object-dtype (text) columns.
print("\n=== Descriptive statistics (categorical) ===")
print(df.describe(include=["object"]))

# Helper plot functions
def plot_hist(column, bins=10):
    """Plot the distribution of one column of the global ``df``.

    Parameters
    ----------
    column : str
        Column to plot. The call is a no-op when the column is absent.
    bins : int, default 10
        Number of histogram bins (used for numeric data only).

    Notes
    -----
    Values are converted with ``pd.to_numeric(errors="coerce")``. The previous
    ``astype(float, errors="ignore")`` handed text columns (for example the raw
    ``cgpa`` ranges such as "3.0-3.5") to ``hist`` unconverted. A column that
    has values but none of them numeric is drawn as a bar chart of category
    counts instead, so the figure is still meaningful.
    """
    if column not in df.columns:
        return

    raw = df[column].dropna()
    numeric = pd.to_numeric(raw, errors="coerce").dropna()

    plt.figure()
    if numeric.empty and not raw.empty:
        # Nothing numeric to bin: show how often each category occurs.
        raw.astype(str).value_counts().sort_index().plot(kind="bar")
    else:
        numeric.hist(bins=bins)
    plt.title(f"Histogram of {column}")
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.show()

def plot_bar(column):
    """Draw a bar chart of value frequencies for ``column`` of the global ``df``.

    Does nothing when ``column`` is not present in ``df``.
    """
    if column in df.columns:
        plt.figure()
        frequencies = df[column].value_counts()
        frequencies.plot(kind="bar")
        plt.title(f"Counts of {column}")
        plt.xlabel(column)
        plt.ylabel("Count")
        # Tilt the category labels so long names do not overlap.
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()

# Column headers may carry stray whitespace (the raw CSV has 'academic_workload '),
# so resolve the requested names against stripped headers instead of silently
# skipping columns that do exist.
_col_lookup = {str(name).strip(): name for name in df.columns}

# Choose numeric/ordinal-looking columns from your dataset
num_like_cols = [
    "age", "cgpa", "study_satisfaction", "academic_workload",
    "academic_pressure", "financial_concerns", "social_relationships",
    "depression", "anxiety"
]
for col in num_like_cols:
    if col in _col_lookup:
        plot_hist(_col_lookup[col])

cat_cols = [
    "gender", "university", "degree_level", "residential_status",
    "sports_engagement", "average_sleep", "stress_relief_activities"
]
for col in cat_cols:
    if col in _col_lookup:
        plot_bar(_col_lookup[col])

# Correlation matrix for numeric columns (needs at least two of them)
numeric_cols_raw = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols_raw) > 1:
    plt.figure(figsize=(8, 6))
    corr = df[numeric_cols_raw].corr()
    im = plt.imshow(corr, interpolation="nearest")
    plt.xticks(range(len(numeric_cols_raw)), numeric_cols_raw, rotation=45, ha="right")
    plt.yticks(range(len(numeric_cols_raw)), numeric_cols_raw)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.title("Correlation matrix (numeric features)")
    plt.tight_layout()
    plt.show()
    print("\n=== Correlation matrix ===")
    print(corr)


In [None]:
# =========================
# Cell 2.5: Data cleaning & quality checks
# =========================

import re

print("Shape BEFORE cleaning:", df.shape)


def _strip_text(value):
    """Strip surrounding whitespace from str values; return anything else (NaN, numbers) unchanged."""
    return value.strip() if isinstance(value, str) else value


# 1) Strip whitespace from string columns.
# Do NOT cast with astype(str): that turns missing values into the literal
# string "nan", which isna() no longer detects, so they would never be imputed.
obj_cols = df.select_dtypes(include="object").columns
for col in obj_cols:
    df[col] = df[col].map(_strip_text)

# 2) Check and remove duplicate rows
dup_mask = df.duplicated()
print("Number of duplicate rows:", dup_mask.sum())

if dup_mask.sum() > 0:
    df = df[~dup_mask].copy()
    print("Duplicates removed.")
else:
    print("No duplicate rows found.")

print("Shape AFTER removing duplicates:", df.shape)

# 3) Optional: normalize some categorical text (just stripping, not forcing lowercase globally)
# (You can add more columns here if needed)
clean_cats = [
    "gender",
    "university",
    "degree_level",
    "residential_status",
    "sports_engagement",
    "average_sleep",
    "campus_discrimination",
]
for col in clean_cats:
    if col in df.columns:
        # Same NaN-preserving strip; also safe if a listed column is not text.
        df[col] = df[col].map(_strip_text)

# 4) Convert CGPA ranges (e.g., "3.0-3.5") into numeric midpoint
def cgpa_to_numeric(val):
    """Convert a CGPA entry to a float.

    Parameters
    ----------
    val : str | int | float | None
        Raw value, e.g. ``"3.0-3.5"``, ``"3.2+"`` or ``3.4``.

    Returns
    -------
    float
        * missing input -> ``np.nan``
        * int/float input -> the value as ``float``
        * a range written with ``-`` or ``–`` containing exactly two numbers
          -> their midpoint
        * text containing exactly one number -> that number
        * anything else (no number, or several numbers that are not a
          dash-range) -> ``np.nan``

    Notes
    -----
    Numbers are extracted as separate tokens. The earlier fallback deleted all
    non-digit characters and parsed what was left, so "3 to 4" became 34.0.
    """
    if pd.isna(val):
        return np.nan
    if isinstance(val, (int, float)):
        return float(val)

    text = str(val)
    # One token per number: "3", "3.25", "3." or ".5".
    numbers = [float(tok) for tok in re.findall(r"\d+\.?\d*|\.\d+", text)]

    is_range = "-" in text or "–" in text
    if is_range and len(numbers) == 2:
        return (numbers[0] + numbers[1]) / 2.0
    if len(numbers) == 1:
        return numbers[0]
    # Ambiguous or non-numeric text: leave it for the imputation step.
    return np.nan

# Replace the raw cgpa column with its numeric conversion; skipped when the
# dataset has no such column.
if "cgpa" in df.columns:
    df["cgpa"] = df["cgpa"].apply(cgpa_to_numeric)

# 5) Convert average_sleep (e.g., "4-6 hrs") -> numeric hours
def sleep_to_hours(val):
    """Convert an ``average_sleep`` entry such as ``"4-6 hrs"`` to hours.

    Parameters
    ----------
    val : str | int | float | None
        Raw survey answer.

    Returns
    -------
    float
        * missing input -> ``np.nan``
        * a range written with ``-`` or ``–`` containing exactly two numbers
          -> their midpoint
        * text containing "less" and "4" -> ``3.0``
        * text containing "more" and "9" -> ``9.0``
        * text containing exactly one number -> that number
        * anything else -> ``np.nan``

    Notes
    -----
    Numbers are extracted as separate tokens. The earlier fallback deleted all
    non-digit characters and parsed the remainder, so "2 to 4 hrs" became 24.0.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).lower()
    numbers = [float(tok) for tok in re.findall(r"\d+\.?\d*|\.\d+", text)]

    # Case like "4-6 hrs"
    if ("-" in text or "–" in text) and len(numbers) == 2:
        return (numbers[0] + numbers[1]) / 2.0

    # Some custom phrases (you can tweak these if your dataset has them)
    if "less" in text and "4" in text:
        return 3.0
    if "more" in text and "9" in text:
        return 9.0

    # Fallback: a single unambiguous number
    if len(numbers) == 1:
        return numbers[0]

    return np.nan

# Keep the original text column and add a numeric companion column.
if "average_sleep" in df.columns:
    df["average_sleep_hours"] = df["average_sleep"].apply(sleep_to_hours)

# 6) Show basic value-counts for key categorical columns (for EDA & label checking)
print("\n=== Value counts for key categorical columns ===")
for col in ["gender", "university", "degree_level", "residential_status",
            "sports_engagement", "average_sleep"]:
    if col in df.columns:
        print(f"\nColumn: {col}")
        print(df[col].value_counts())

# 7) Missing values BEFORE imputation
print("\nMissing values BEFORE imputation:")
print(df.isna().sum())

# 8) Simple missing-value handling
# NOTE(review): this covers every numeric column, so a numeric label column
# would be imputed as well; confirm that is intended.
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# For numeric: fill with median.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: the
# in-place call on a column selection is chained assignment, which emits a
# FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
for col in num_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# For categorical: fill with mode, or "Unknown" if no mode
for col in cat_cols:
    mode = df[col].mode()
    fill_value = mode.iloc[0] if not mode.empty else "Unknown"
    df[col] = df[col].fillna(fill_value)

print("\nMissing values AFTER imputation:")
print(df.isna().sum())

print("\nShape AFTER cleaning + imputation:", df.shape)


In [None]:
# =========================
# Cell 3: Target & preprocessing (no sklearn)
# =========================

# Name of the column that is turned into the binary label below.
TARGET_COL = "depression"

# Stop with a clear message if the dataset does not contain the label column.
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found in dataset.")

# Drop rows with missing target (and take a copy so later edits do not touch a view)
df = df.dropna(subset=[TARGET_COL]).copy()

# Binary target: 0 = low/moderate (<=2) ; 1 = high (>=3)
def make_binary_target(x):
    """Map a raw target score to a binary label.

    Parameters
    ----------
    x : object
        Raw score; anything ``float()`` accepts.

    Returns
    -------
    int | float
        ``1`` when the score is >= 3, ``0`` when it is below 3, and ``np.nan``
        when the value is missing or cannot be parsed as a number.

    Notes
    -----
    NaN is handled explicitly: ``nan >= 3`` is False, so without the check a
    missing score would be labelled 0 instead of being dropped by the caller.
    """
    try:
        score = float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    if np.isnan(score):
        return np.nan
    return 1 if score >= 3 else 0

# Build the binary label, then keep only rows whose label could be computed.
# `y` is filtered with the same mask as `df`, so rows stay aligned.
y = df[TARGET_COL].apply(make_binary_target)
df = df[~y.isna()].copy()
y = y[~y.isna()].astype(int).values  # numpy array

print("Target value counts (0=low/moderate, 1=high):")
unique, counts = np.unique(y, return_counts=True)
for u, c in zip(unique, counts):
    print(f"{u}: {c}")

# Features: every column except the target
X = df.drop(columns=[TARGET_COL])

# numeric vs categorical (anything that is not a numeric dtype is treated as categorical)
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

print("\nNumeric features:", numeric_features)
print("Categorical features:", categorical_features)

# One-hot encode categoricals using pandas (all levels kept: drop_first=False)
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=False)

# Standardize all columns (including the one-hot indicators).
# The 1e-6 added to the std avoids division by zero for constant columns.
# NOTE(review): the mean/std are computed over ALL rows, i.e. before the
# train/test split performed in Cell 4, so test rows influence the scaling.
X_values = X_encoded.values.astype(float)
feature_means = X_values.mean(axis=0)
feature_stds = X_values.std(axis=0) + 1e-6
X_scaled = (X_values - feature_means) / feature_stds

# Column names of the encoded matrix, in the same order as X_scaled's columns.
feature_names = X_encoded.columns.tolist()
print("\nEncoded feature matrix shape:", X_scaled.shape)


In [None]:
# =========================
# Cell 4: Train/test split & metric functions
# =========================

def train_test_split_manual(X, y, test_size=0.2, random_state=42):
    """Shuffle rows and split them into train and test parts.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix; rows are samples.
    y : np.ndarray
        Labels, one per row of ``X``.
    test_size : float, default 0.2
        Fraction of rows for the test part, between 0 and 1. The test count is
        ``int(n_samples * test_size)`` (rounded down).
    random_state : int | None, default 42
        Seed for the shuffle.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` disagree on the number of samples, or ``test_size``
        is outside ``[0, 1]``.

    Notes
    -----
    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    calling this function does not reset the global NumPy random stream. The
    resulting permutation is the same as the seeded global one would give.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y have different numbers of samples: {n_samples} vs {len(y)}"
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    test_count = int(n_samples * test_size)
    test_idx = indices[:test_count]
    train_idx = indices[test_count:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Split the standardized feature matrix and labels 80/20 using the notebook-wide seed.
X_train, X_test, y_train, y_test = train_test_split_manual(
    X_scaled, y, test_size=0.2, random_state=RANDOM_STATE
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ---- metric functions (FIXED SHAPE ISSUE) ----
def classification_metrics(y_true, y_pred):
    """Compute binary classification metrics for labels in {0, 1}.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. Any shape is accepted; both are cast to int
        and flattened to 1-D, and must have the same number of elements.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, cm)`` where the first four are
        floats and ``cm`` is the 2x2 array ``[[tn, fp], [fn, tp]]``.
        A metric whose denominator is zero is reported as ``0.0``.

    Raises
    ------
    ValueError
        If the flattened inputs differ in length.

    Notes
    -----
    Denominators are guarded explicitly rather than padded with ``1e-8``; the
    padding biased every metric (a perfect prediction scored 0.99999999...).
    """
    # Convert everything to 1D int arrays
    y_true = np.asarray(y_true).astype(int).reshape(-1)
    y_pred = np.asarray(y_pred).astype(int).reshape(-1)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape} and {y_pred.shape}"
        )

    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    tn = int(np.sum((y_true == 0) & (y_pred == 0)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)

    cm = np.array([[tn, fp],
                   [fn, tp]])
    return accuracy, precision, recall, f1, cm

def plot_confusion_matrix(cm, title="Confusion Matrix"):
    """Render a confusion matrix as an annotated heat map.

    ``cm`` is a 2-D array indexed as ``cm[true, predicted]``; each cell is
    labelled with its count. ``title`` is used as the figure title.
    """
    plt.figure()
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    # Text coordinates are (x, y) = (column, row), hence the swapped order.
    for (row, col), count in np.ndenumerate(cm):
        plt.text(col, row, count, ha="center", va="center")
    plt.colorbar()
    plt.tight_layout()
    plt.show()

# Convert to torch tensors for models.
# Features become float32; labels become float32 column vectors of shape (n, 1).
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t  = torch.tensor(X_test,  dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_t  = torch.tensor(y_test,  dtype=torch.float32).view(-1, 1)

# Number of input features, used to size the first layer of each model.
input_dim = X_train.shape[1]


In [None]:
# =========================
# Cell 4 (check): split invariants
# =========================
# This cell used to be a verbatim second copy of the preceding "Cell 4"
# (same split, metric helpers and tensors). Defining the same functions twice
# means the later copy silently shadows the earlier one, so the copy was
# removed. The checks below only confirm that the objects built by Cell 4 are
# present and consistent with each other.

assert X_train.shape[0] + X_test.shape[0] == X_scaled.shape[0]
assert len(y_train) == X_train.shape[0] and len(y_test) == X_test.shape[0]
assert tuple(X_train_t.shape) == X_train.shape and tuple(X_test_t.shape) == X_test.shape
assert input_dim == X_scaled.shape[1]


In [None]:
# =========================
# Cell 5: Logistic Regression (PyTorch)
# =========================

class LogisticRegressionModel(nn.Module):
    """Logistic regression expressed as a single linear layer.

    The forward pass returns raw logits (one per sample); no sigmoid is
    applied here.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        logits = self.linear(x)
        return logits

def train_model(model, X_train, y_train, X_val, y_val,
                lr=1e-3, epochs=500, batch_size=16, weight_decay=0.0,
                verbose=False, model_name="model"):
    """Train a binary classifier with Adam and keep the best checkpoint.

    Parameters
    ----------
    model : nn.Module
        Network that maps a feature batch to logits.
    X_train, y_train : torch.Tensor
        Training features and float targets; ``y_train`` must match the shape
        of the model output for a batch.
    X_val, y_val : torch.Tensor
        Held-out data used only to choose the checkpoint. Pass a validation
        split here; passing the test set makes the reported test scores
        optimistic.
    lr, weight_decay : float
        Adam learning rate and L2 weight decay.
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Mini-batch size; rows are reshuffled every epoch.
    verbose : bool
        Print the best validation F1 every 50 epochs.
    model_name : str
        Label used in the progress messages.

    Returns
    -------
    nn.Module
        ``model`` with the weights of the best epoch loaded.

    Notes
    -----
    The best epoch is the one with the highest validation F1 at threshold 0.5;
    ties are broken by the lower validation loss. Without the tie-break the
    first epoch to reach a given F1 was kept, so on a plateau (common with a
    small validation set, or when F1 stays at 0) early, under-trained weights
    were restored. Metrics come from ``classification_metrics`` defined in
    this notebook.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    n_samples = X_train.shape[0]
    best_f1 = -1.0
    best_val_loss = float("inf")
    best_state = None

    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(n_samples)
        for i in range(0, n_samples, batch_size):
            idx = perm[i:i+batch_size]
            batch_x = X_train[idx]
            batch_y = y_train[idx]

            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()

        # validation on the held-out split
        model.eval()
        with torch.no_grad():
            val_logits = model(X_val)
            # Accept y_val in any shape/dtype that matches the logits element-wise.
            val_target = torch.as_tensor(
                y_val, dtype=val_logits.dtype, device=val_logits.device
            ).reshape_as(val_logits)
            val_loss = criterion(val_logits, val_target).item()
            val_probs = torch.sigmoid(val_logits).cpu().numpy().flatten()
            val_pred  = (val_probs >= 0.5).astype(int)
            _, _, _, f1, _ = classification_metrics(y_val, val_pred)

            improved = f1 > best_f1 or (f1 == best_f1 and val_loss < best_val_loss)
            if improved:
                best_f1 = f1
                best_val_loss = val_loss
                # Clone so later optimizer steps do not overwrite the snapshot.
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        if verbose and (epoch + 1) % 50 == 0:
            print(f"{model_name} Epoch {epoch+1}/{epochs}, best F1 so far: {best_f1:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

# Hold out part of the TRAINING data for checkpoint selection. Selecting the
# best epoch on the test set (as before) leaks test information into the model
# and inflates the reported test metrics.
X_fit, X_val, y_fit, y_val = train_test_split_manual(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE
)
X_fit_t = torch.tensor(X_fit, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_fit_t = torch.tensor(y_fit, dtype=torch.float32).view(-1, 1)
y_val_t = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

log_reg_model = LogisticRegressionModel(input_dim)
log_reg_model = train_model(
    log_reg_model,
    X_fit_t, y_fit_t,
    X_val_t, y_val_t,
    lr=1e-3,
    epochs=300,
    batch_size=8,
    weight_decay=1e-4,
    verbose=True,
    model_name="LogisticRegression"
)

# evaluation on the untouched test set (threshold 0.5 on the sigmoid output)
log_reg_model.eval()
with torch.no_grad():
    logits = log_reg_model(X_test_t)
    probs  = torch.sigmoid(logits).cpu().numpy().flatten()
    y_pred_lr = (probs >= 0.5).astype(int)

acc_lr, prec_lr, rec_lr, f1_lr, cm_lr = classification_metrics(y_test, y_pred_lr)

print("\n=== Logistic Regression performance ===")
print("Accuracy :", acc_lr)
print("Precision:", prec_lr)
print("Recall   :", rec_lr)
print("F1-score :", f1_lr)
print("Confusion matrix:\n", cm_lr)

plot_confusion_matrix(cm_lr, title="Logistic Regression - Confusion Matrix")


In [None]:
# =========================
# Cell 6: Neural Network (MLP)
# =========================

class MLPClassifier(nn.Module):
    """Feed-forward binary classifier that outputs one raw logit per sample.

    Each entry of ``hidden_dims`` adds a Linear -> ReLU block, followed by
    Dropout when ``dropout`` is positive; a final Linear layer maps to a
    single output.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), dropout=0.1):
        super().__init__()
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            if dropout > 0:
                stack.append(nn.Dropout(dropout))
        # Output head: last hidden width (or input_dim if no hidden layers) -> 1 logit.
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        return self.net(x)

mlp_model = MLPClassifier(input_dim=input_dim, hidden_dims=(64, 32), dropout=0.1)

# Hold out part of the TRAINING data for checkpoint selection. Selecting the
# best epoch on the test set (as before) leaks test information into the model
# and inflates the reported test metrics. The seed is fixed, so this is the
# same split every time the cell runs.
X_fit, X_val, y_fit, y_val = train_test_split_manual(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE
)
X_fit_t = torch.tensor(X_fit, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_fit_t = torch.tensor(y_fit, dtype=torch.float32).view(-1, 1)
y_val_t = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

mlp_model = train_model(
    mlp_model,
    X_fit_t, y_fit_t,
    X_val_t, y_val_t,
    lr=1e-3,
    epochs=500,
    batch_size=8,
    weight_decay=1e-4,
    verbose=True,
    model_name="NeuralNetwork"
)

# evaluation on the untouched test set (threshold 0.5 on the sigmoid output)
mlp_model.eval()
with torch.no_grad():
    logits = mlp_model(X_test_t)
    probs  = torch.sigmoid(logits).cpu().numpy().flatten()
    y_pred_mlp = (probs >= 0.5).astype(int)

acc_mlp, prec_mlp, rec_mlp, f1_mlp, cm_mlp = classification_metrics(y_test, y_pred_mlp)

print("\n=== Neural Network (MLP) performance ===")
print("Accuracy :", acc_mlp)
print("Precision:", prec_mlp)
print("Recall   :", rec_mlp)
print("F1-score :", f1_mlp)
print("Confusion matrix:\n", cm_mlp)

plot_confusion_matrix(cm_mlp, title="Neural Network (MLP) - Confusion Matrix")

print("\n=== Model comparison ===")
print(f"LogReg -> acc={acc_lr:.4f}, prec={prec_lr:.4f}, rec={rec_lr:.4f}, f1={f1_lr:.4f}")
print(f"MLP    -> acc={acc_mlp:.4f}, prec={prec_mlp:.4f}, rec={rec_mlp:.4f}, f1={f1_mlp:.4f}")
# Strict '>' so that a tie goes to the simpler model, matching the rule used
# in the detailed analysis cell (6.5).
best_model = "MLP" if f1_mlp > f1_lr else "Logistic Regression"
print("Best model based on F1:", best_model)


In [None]:
# =========================
# Cell 6.5: Model Comparison + Error Statistics + Deep Analysis
# =========================

print("\n====================")
print("DETAILED MODEL ANALYSIS")
print("====================\n")

# 1) Comparison Table
comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Neural Network (MLP)"],
    "Accuracy": [acc_lr, acc_mlp],
    "Precision": [prec_lr, prec_mlp],
    "Recall": [rec_lr, rec_mlp],
    "F1-score": [f1_lr, f1_mlp],
})

print("=== Model Performance Comparison ===")
print(comparison.to_string(index=False))

# Identify best and worst model.
# Equal F1 is reported as a tie: with a small test set the two models often
# score identically, and calling one of them "worst" would be misleading.
if f1_mlp > f1_lr:
    best_model = "Neural Network (MLP)"
    worst_model = "Logistic Regression"
elif f1_lr > f1_mlp:
    best_model = "Logistic Regression"
    worst_model = "Neural Network (MLP)"
else:
    best_model = worst_model = "Tie (equal F1-score)"

print(f"\nBest Model Based on F1-score: {best_model}")
print(f"Worst Model Based on F1-score: {worst_model}")

# 2) Error statistics for classification
# Convert predictions to numeric error indicator:
#   error = |true - predicted|
# This gives:
#   correct prediction → 0
#   wrong prediction   → 1

lr_errors = np.abs(y_test - y_pred_lr)
mlp_errors = np.abs(y_test - y_pred_mlp)

print("\n=== Error Statistics (Classification Equivalent) ===")
print("Logistic Regression Error Rate:", lr_errors.mean())
print("Neural Network Error Rate:", mlp_errors.mean())

print("\nStandard Deviation of Errors:")
print("Logistic Regression STD:", lr_errors.std())
print("Neural Network STD:", mlp_errors.std())

# 3) Confusion matrices already plotted earlier, but print numeric versions again
print("\n=== Confusion Matrices ===")
print("Logistic Regression Confusion Matrix:\n", cm_lr)
print("Neural Network Confusion Matrix:\n", cm_mlp)

# 4) Additional Insight: False Positives & False Negatives
def analyze_confusion(cm, name):
    """Print the four cells of a 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].

    ``name`` is the model label used in the heading.
    """
    tn, fp = cm[0]
    fn, tp = cm[1]
    print(f"\n{name} Error Breakdown:")
    print(f"True Positives:  {tp}")
    print(f"True Negatives:  {tn}")
    print(f"False Positives: {fp}  (predict high depression but actually low)")
    print(f"False Negatives: {fn}  (predict low depression but actually high)")

analyze_confusion(cm_lr, "Logistic Regression")
analyze_confusion(cm_mlp, "Neural Network")

# NOTE(review): the sentences below are fixed text chosen by F1 ordering only;
# they are not derived from any further analysis of the models.
print("\n=== Interpretation Summary ===")
if best_model == "Neural Network (MLP)":
    print("- The MLP generalizes better with higher F1-score.")
    print("- Logistic Regression is simpler but struggles with class boundaries.")
elif best_model == "Logistic Regression":
    print("- Logistic Regression performs slightly better in this dataset.")
    print("- The NN may be overfitting due to small dataset size.")
else:
    print("- Both models reach the same F1-score on this test set.")
    print("- Prefer the simpler Logistic Regression unless other metrics favour the NN.")


In [None]:
# =========================
# Cell X: Extra model comparison using scikit-learn
# =========================

# If sklearn is not installed:
#   pip install scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

# Use the same features and labels as before: X_scaled, y
# (Re-splitting here with stratification.)
# NOTE(review): this stratified split is not the same split as the manual one
# from Cell 4, so these scores are comparable among the sklearn models but not
# row-for-row with the PyTorch models above.
X_train_skl, X_test_skl, y_train_skl, y_test_skl = train_test_split(
    X_scaled, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Define sklearn models
sk_models = {
    "LogReg (sklearn)": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=RANDOM_STATE,
        class_weight="balanced"
    ),
    "SVM (RBF)": SVC(
        kernel="rbf",
        probability=True,
        class_weight="balanced",
        random_state=RANDOM_STATE
    ),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5)
}

results = []
cms_skl = {}

for name, model in sk_models.items():
    model.fit(X_train_skl, y_train_skl)
    y_pred = model.predict(X_test_skl)

    acc = accuracy_score(y_test_skl, y_pred)
    prec = precision_score(y_test_skl, y_pred, zero_division=0)
    rec = recall_score(y_test_skl, y_pred, zero_division=0)
    f1 = f1_score(y_test_skl, y_pred, zero_division=0)

    # Pin the label order so the matrix is always 2x2 ([[tn, fp], [fn, tp]]),
    # even if a class is absent from both y_test_skl and y_pred.
    cm = confusion_matrix(y_test_skl, y_pred, labels=[0, 1])
    cms_skl[name] = cm

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1
    })

# ---- Show table of metrics ----
results_df = pd.DataFrame(results)
print("=== Sklearn Model Performance Comparison ===")
print(results_df.to_string(index=False))

# ---- Bar chart of F1-scores ----
plt.figure(figsize=(8, 5))
plt.bar(results_df["Model"], results_df["F1-score"])
plt.title("F1-score comparison (sklearn models)")
plt.ylabel("F1-score")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()

# ---- Confusion matrices for each model ----
# Reuse the helper from Cell 4 instead of repeating its plotting code.
for name, cm in cms_skl.items():
    plot_confusion_matrix(cm, title=f"{name} - Confusion Matrix")

# Identify best sklearn model by F1.
# idxmax() returns an index LABEL, so look it up with .loc rather than .iloc.
best_row = results_df.loc[results_df["F1-score"].idxmax()]
print("\nBest sklearn model based on F1-score:")
print(best_row)


In [None]:
# =========================
# Cell 7: Dimensionality reduction (PCA, Random Projection, t-SNE/UMAP)
# =========================

# ----- PCA from scratch -----
def pca_from_scratch(X, n_components=2):
    """Project ``X`` onto its leading principal components.

    ``X`` is expected as (n_samples, n_features). The data are mean-centred,
    the sample covariance (divisor n_samples - 1) is eigendecomposed, and the
    centred data are projected onto the ``n_components`` eigenvectors with the
    largest eigenvalues.

    Returns ``(X_pca, var_ratio)``: the projected data and, per kept
    component, its eigenvalue divided by the sum of all eigenvalues.
    """
    centered = X - X.mean(axis=0)
    n_rows = centered.shape[0]
    covariance = np.dot(centered.T, centered) / (n_rows - 1)

    eigenvalues, eigenvectors = np.linalg.eigh(covariance)
    # Reorder to descending eigenvalue.
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues, eigenvectors = eigenvalues[order], eigenvectors[:, order]

    projected = np.dot(centered, eigenvectors[:, :n_components])
    explained = eigenvalues[:n_components] / eigenvalues.sum()
    return projected, explained

# Run PCA on the standardized feature matrix, keeping two components
X_pca, pca_var = pca_from_scratch(X_scaled, n_components=2)

# Scatter of the two components; points are coloured by the binary label y.
plt.figure()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=0.8)
plt.title("PCA (2 components)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()

print("PCA explained variance ratio:", pca_var)


# ----- Random Projection (simple linear DR) -----
def random_projection(X, n_components=2, random_state=42):
    """
    Baseline linear dimensionality reduction by random projection.

    Draws a (n_features, n_components) matrix W of normally distributed
    entries from a RandomState seeded with ``random_state`` and returns
    ``X @ W``.
    """
    generator = np.random.RandomState(random_state)
    weight_shape = (X.shape[1], n_components)  # (n_features, n_components)
    weights = generator.normal(size=weight_shape)
    return np.dot(X, weights)

# Project the standardized features to 2-D with the seeded random matrix.
X_rp = random_projection(X_scaled, n_components=2, random_state=RANDOM_STATE)

# Scatter of the projection; points are coloured by the binary label y.
plt.figure()
plt.scatter(X_rp[:, 0], X_rp[:, 1], c=y, alpha=0.8)
plt.title("Random Projection (2D)")
plt.xlabel("RP1")
plt.ylabel("RP2")
plt.tight_layout()
plt.show()


# ----- t-SNE or UMAP (if installed) -----
# Exactly one non-linear embedding is produced: openTSNE if it was importable
# in Cell 1, otherwise UMAP, otherwise only install hints are printed.
if TSNE_AVAILABLE:
    print("Running t-SNE (openTSNE)...")
    tsne = TSNE(
        n_components=2,
        learning_rate="auto",
        initialization="random",
        random_state=RANDOM_STATE,
        # Perplexity is a third of the sample count, clamped to the range [5, 30].
        perplexity=min(30, max(5, X_scaled.shape[0] // 3))
    )
    # Convert the fitted embedding object to a plain ndarray for plotting.
    X_tsne = np.asarray(tsne.fit(X_scaled))
    plt.figure()
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, alpha=0.8)
    plt.title("t-SNE (2D)")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.tight_layout()
    plt.show()

elif UMAP_AVAILABLE:
    print("Running UMAP...")
    reducer = umap.UMAP(
        n_components=2,
        random_state=RANDOM_STATE,
        n_neighbors=10,
        min_dist=0.1
    )
    X_umap = reducer.fit_transform(X_scaled)
    plt.figure()
    plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y, alpha=0.8)
    plt.title("UMAP (2D)")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.tight_layout()
    plt.show()

else:
    print("No openTSNE or UMAP installed. Install one for non-linear DR:")
    print("  pip install openTSNE")
    print("or")
    print("  pip install umap-learn")


In [None]:
# =========================
# Cell 8: Feature importance (XGBoost + SHAP)
# =========================

# Runs only when xgboost was importable in Cell 1; SHAP is an optional extra on top.
if XGBOOST_AVAILABLE:
    print("Training XGBoost classifier for feature importance...")
    xgb_model = xgb.XGBClassifier(
        max_depth=4,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="binary:logistic",
        random_state=RANDOM_STATE,
        eval_metric="logloss"
    )
    # Fitted on the training split only; the model is used here for
    # importances, not scored on the test split.
    xgb_model.fit(X_train, y_train)

    importances = xgb_model.feature_importances_
    sorted_idx = np.argsort(importances)[::-1]
    top_k = min(15, len(feature_names))
    top_idx = sorted_idx[:top_k]

    plt.figure(figsize=(8, 6))
    # Reverse both bars and labels so the most important feature is drawn at the top.
    plt.barh(range(len(top_idx)), importances[top_idx][::-1])
    plt.yticks(range(len(top_idx)), [feature_names[i] for i in top_idx][::-1])
    # Report the number of bars actually drawn (top_k can be below 15).
    plt.title(f"XGBoost Feature Importances (Top {top_k})")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.show()

    print("\nTop features by importance:")
    for i in top_idx:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

    if SHAP_AVAILABLE:
        print("\nRunning SHAP TreeExplainer...")
        explainer = shap.TreeExplainer(xgb_model)
        shap_values = explainer.shap_values(X_train)
        shap.summary_plot(shap_values, X_train, feature_names=feature_names, show=True)
    else:
        print("SHAP not installed. Install with: pip install shap")
else:
    print("XGBoost not installed. Install with: pip install xgboost")
