In [23]:
# ======================================================
# 0. Imports & basic setup
# ======================================================
import os
import time
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.kernel_approximation import Nystroem

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# ------------------------------------------------------
# Logging helper
# ------------------------------------------------------
# Wall-clock time of the most recent log() call (initialised when this cell runs).
last_time = time.time()


def log(msg):
    """Print ``msg`` with a HH:MM:SS prefix and the seconds elapsed since the previous call.

    Side effect: rebinds the module-level ``last_time`` to the time of this call.
    """
    global last_time
    current = time.time()
    stamp = time.strftime('%H:%M:%S')
    delta = current - last_time
    print(f"[{stamp}] {msg} | +{delta:.2f}s")
    last_time = current

# ------------------------------------------------------
# Reproducibility
# ------------------------------------------------------
def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch RNGs (plus all CUDA devices when available)."""
    import random

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG before any randomised work happens.
set_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
log(f"Using device: {device}")

# ======================================================
# 1. Load data
# ======================================================
log("Loading datasets...")
train_path = "/kaggle/input/start-up-founder-retention-prediction/train.csv"
test_path  = "/kaggle/input/start-up-founder-retention-prediction/test.csv"

# df: training CSV (contains the target column); df_ext_test: external test CSV.
df, df_ext_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

log(f"Train shape: {df.shape}, Test shape: {df_ext_test.shape}")

# ======================================================
# 2. Column groups & target
# ======================================================
log("Setting column groups...")

# Columns routed to the numeric pipeline.
# NOTE(review): founder_id looks like an identifier — confirm it is meant to be a feature.
numeric_cols = [
    "founder_id",
    "founder_age",
    "years_with_startup",
    "monthly_revenue_generated",
    "funding_rounds_led",
    "distance_from_investor_hub",
    "num_dependents",
]

# Columns with an ordered set of text levels (encoded to integer ranks further down).
ordinal_cols = [
    "founder_visibility",
    "startup_reputation",
    "team_size_category",
    "startup_stage",
    "startup_performance_rating",
    "venture_satisfaction",
    "work_life_balance_rating",
]

# Unordered categories, one-hot encoded by the preprocessor.
categorical_cols = [
    "founder_gender",
    "founder_role",
    "education_background",
    "personal_status",
    "innovation_support",
]

# Yes/no style flags, normalised to 0/1 below.
boolean_cols = [
    "working_overtime",
    "remote_operations",
    "leadership_scope",
]

# Name of the label column in the training frame.
target_col = "retention_status"

# ======================================================
# 3. Normalise boolean columns
# ======================================================
log("Normalizing boolean columns...")

def normalize_boolean(col):
    """Convert a boolean-like Series to nullable integers (1, 0 or <NA>).

    Values are stringified, stripped and lower-cased before lookup, so
    ``True``/``"Yes "``/``1`` all map to 1 and ``False``/``"no"``/``0`` to 0.
    Anything not in the lookup table (including missing values, which
    stringify as "nan") becomes <NA>.

    Parameters:
        col: pandas Series holding the raw flag values.

    Returns:
        pandas Series of dtype ``Int64`` with the same index as ``col``.
    """
    lookup = {
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        "1": 1, "0": 0,
        # A 0/1 column that contains missing values is loaded as float64 and
        # stringifies as "1.0"/"0.0"; without these keys every row became <NA>.
        "1.0": 1, "0.0": 0,
    }
    tokens = col.astype(str).str.strip().str.lower()
    return tokens.map(lookup).astype("Int64")

# Apply the same 0/1 normalisation to the training frame and the external test frame.
for frame in (df, df_ext_test):
    for col in boolean_cols:
        frame[col] = normalize_boolean(frame[col])

log("Boolean normalization done.")

# ======================================================
# 4. Ordinal encoding
# ======================================================
log("Applying ordinal encoding...")

# Ordered levels per ordinal column, lowest first; a level's position is its integer code.
ordinal_levels = {
    "founder_visibility":         ["low", "medium", "high", "very high"],
    "startup_reputation":         ["poor", "fair", "good", "excellent"],
    "team_size_category":         ["small", "medium", "large"],
    "startup_stage":              ["entry", "mid", "senior"],
    "startup_performance_rating": ["low", "below average", "average", "high"],
    "venture_satisfaction":       ["low", "medium", "high", "very high"],
    "work_life_balance_rating":   ["fair", "good", "excellent"],
}

# column name -> {level text -> integer rank}
ordinal_mappings = {
    column: {level: rank for rank, level in enumerate(levels)}
    for column, levels in ordinal_levels.items()
}

def apply_ordinal(df_local, col, mapping):
    """Replace ``df_local[col]`` in place with the integer codes given by ``mapping``.

    Values are stringified, stripped and lower-cased before lookup; anything
    not found in ``mapping`` (including missing values) becomes NaN.
    Returns None — the frame is modified directly.
    """
    cleaned = df_local[col].astype(str)
    cleaned = cleaned.str.strip()
    cleaned = cleaned.str.lower()
    df_local[col] = cleaned.map(mapping)

# Encode every ordinal column in both frames using its own level mapping.
for frame in (df, df_ext_test):
    for col in ordinal_cols:
        apply_ordinal(frame, col, ordinal_mappings[col])

log("Ordinal encoding done.")

# ======================================================
# 5. Convert target to {0, 1}
# ======================================================
log("Converting target variable...")

# "Stayed" is the positive class (1), "Left" the negative class (0);
# any other label would be mapped to NaN by Series.map.
label_codes = {"Stayed":1, "Left":0}
df[target_col] = df[target_col].map(label_codes)

# Split the training frame into targets (array) and features (frame).
y_full = df[target_col].values
X_full = df.drop(columns=[target_col])

# ======================================================
# 6. Preprocessing: ColumnTransformer
# ======================================================
log("Building preprocessing pipelines...")

# Highly skewed numeric columns → log1p
# Keep only the candidates that are actually present in numeric_cols.
skewed_numeric_cols = [
    c
    for c in ("monthly_revenue_generated", "distance_from_investor_hub")
    if c in numeric_cols
]
# Positions of those columns inside numeric_cols (used to index the numeric array).
skew_indices = list(map(numeric_cols.index, skewed_numeric_cols))

def log_transform_selected(X):
    """Return a float copy of ``X`` with log1p applied to the columns listed in ``skew_indices``.

    Negative entries in those columns are clipped to 0 before the transform.
    ``X`` is indexed as ``X[:, idx]``, i.e. it must be a 2-D array
    (presumably the imputer output inside the numeric pipeline — confirm).
    """
    out = X.copy().astype(float)
    for column in skew_indices:
        non_negative = np.clip(out[:, column], 0, None)
        out[:, column] = np.log1p(non_negative)
    return out

# Numeric features: median imputation → log1p on the skewed columns → standardisation.
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(log_transform_selected, validate=False)),
    ("scale", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

def make_ohe():
    """Build a dense-output ``OneHotEncoder`` that ignores unseen categories.

    The keyword selecting dense output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2. The previous implementation picked
    the keyword by comparing version *strings*, which is lexicographic
    ("1.10" < "1.2") and therefore picks the wrong branch on newer releases.
    Trying the new keyword first and falling back on ``TypeError`` works
    regardless of how the version is formatted.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn: the constructor does not know ``sparse_output``.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

# Ordinal codes: fill gaps with the most frequent level, then standardise.
ordinal_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("scale", StandardScaler()),
])

# Nominal categories: fill gaps with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", make_ohe()),
])

# 0/1 flags: only imputation, no scaling.
boolean_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
])

# Unfitted template; each experiment deep-copies it before fitting.
preprocessor_template = ColumnTransformer(transformers=[
    ("num",  numeric_pipeline,     numeric_cols),
    ("ord",  ordinal_pipeline,     ordinal_cols),
    ("cat",  categorical_pipeline, categorical_cols),
    ("bool", boolean_pipeline,     boolean_cols),
])

log("Preprocessing setup ready.")

# ======================================================
# 7. Torch Linear SVM (for Nystroem features)
# ======================================================
class TorchLinearSVM(nn.Module):
    """
    Linear decision function ``w·x + b`` as a PyTorch module.

    Holds one ``nn.Linear(input_dim, 1)`` layer (bias included); meant to be
    trained with ``hinge_loss`` so that it behaves like a linear SVM.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        # (batch, 1) -> (batch,): one raw score per row.
        scores = self.linear(x)
        return torch.squeeze(scores, dim=1)

def hinge_loss(outputs, targets):
    """
    Mean hinge loss ``mean(max(0, 1 - targets * outputs))``.

    outputs: raw scores, shape (batch,)
    targets: in {-1, +1}, shape (batch,)
    """
    slack = (1.0 - targets * outputs).clamp(min=0)
    return slack.mean()

def train_torch_svm(
    X_train_np,
    y_train_np,
    epochs=100,
    batch_size=512,
    lr=1e-3,
    weight_decay=1e-2,
    patience=50,
    verbose=True
):
    """
    Train a linear SVM on top of Nystroem features using PyTorch.
    Includes early stopping based on validation macro F1.

    Parameters:
      X_train_np: 2-D array of features (rows = samples).
      y_train_np: array of labels; entries equal to 1 are the positive class,
                  everything else is treated as the negative class.
      epochs: maximum number of passes over the training part.
      batch_size: mini-batch size for the training DataLoader.
      lr, weight_decay: AdamW hyper-parameters.
      patience: stop after this many consecutive epochs without a new best
                validation macro F1.
      verbose: print per-epoch metrics and the final summary.

    Returns:
      The trained ``TorchLinearSVM`` (on ``device``), with the weights of the
      best-scoring epoch restored.

    The last 10% of a random permutation of the data is held out as the
    validation set used for early stopping.
    """

    # -----------------------------------------------------------
    # Convert to torch tensors
    # -----------------------------------------------------------
    X = torch.tensor(X_train_np, dtype=torch.float32)
    # Hinge loss needs signed targets: label 1 -> +1, anything else -> -1.
    y_signed = np.where(y_train_np == 1, 1.0, -1.0)
    y = torch.tensor(y_signed, dtype=torch.float32)

    # -----------------------------------------------------------
    # Create small validation split (10% of training data)
    # -----------------------------------------------------------
    N = len(X)
    val_size = int(0.10 * N)
    train_size = N - val_size

    perm = torch.randperm(N)
    X = X[perm]
    y = y[perm]

    X_tr, X_val = X[:train_size], X[train_size:]
    y_tr, y_val = y[:train_size], y[train_size:]

    # Dataloaders
    train_ds = TensorDataset(X_tr.to(device), y_tr.to(device))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    # The validation data never changes: move it to the device and build the
    # {0, 1} reference labels once instead of on every epoch.
    X_val_dev = X_val.to(device)
    val_true = (y_val.numpy() == 1).astype(int)

    # -----------------------------------------------------------
    # Model + Optimizer
    # -----------------------------------------------------------
    model = TorchLinearSVM(input_dim=X_train_np.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # -----------------------------------------------------------
    # Early Stopping tracking
    # -----------------------------------------------------------
    best_f1 = -1
    best_state = None
    no_improve = 0

    # -----------------------------------------------------------
    # Training loop
    # -----------------------------------------------------------
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0

        for xb, yb in train_loader:
            optimizer.zero_grad()
            out = model(xb)
            loss = hinge_loss(out, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            total_loss += loss.item() * xb.size(0)

        avg_loss = total_loss / train_size

        # ---------------------------
        # Validation evaluation
        # ---------------------------
        model.eval()
        with torch.no_grad():
            val_scores = model(X_val_dev).cpu().numpy()
        val_pred = (val_scores >= 0).astype(int)
        val_f1 = f1_score(val_true, val_pred, average="macro")

        if verbose:
            print(f"Epoch {epoch:03d} | Loss={avg_loss:.4f} | Val Macro F1={val_f1:.4f}")

        # ---------------------------
        # Early stopping check
        # ---------------------------
        if val_f1 > best_f1:
            best_f1 = val_f1
            # state_dict() returns references to the live parameter tensors,
            # which keep changing as training continues. Clone them so the
            # snapshot really holds the weights of this epoch.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            no_improve = 0
        else:
            no_improve += 1

        if no_improve >= patience:
            if verbose:
                print(f"Early stopping triggered at epoch {epoch}.")
            break

    # Load best model
    if best_state is not None:
        model.load_state_dict(best_state)

    if verbose:
        print(f"Best Val Macro F1: {best_f1:.4f}")

    return model


def svm_predict(model, X_np):
    """
    Score ``X_np`` with the linear SVM and threshold at zero.

    Returns ``(y_pred, scores)``: class labels in {0, 1} and the raw decision
    scores as NumPy arrays. The caller uses the scores for AUC (via sigmoid).
    """
    model.eval()
    features = torch.tensor(X_np, dtype=torch.float32).to(device)
    with torch.no_grad():
        raw = model(features)
    scores = raw.cpu().numpy()
    # sign threshold at 0: score >= 0 → class 1 ("Stayed")
    y_pred = (scores >= 0.0).astype(int)
    return y_pred, scores

# ======================================================
# 8. Nystroem + Torch SVM experiment helper
# ======================================================
def run_experiment(
    experiment_name,
    X,
    y,
    train_fraction=0.8,
    n_components=512,
    gamma="scale",
    epochs=25
):
    """
    Run one preprocessing + Nystroem + Torch linear-SVM experiment.

    Steps:
      - stratified split of (X, y): train_fraction for training, rest for evaluation
      - a fresh deep copy of ``preprocessor_template`` is fitted on the training part only
      - an RBF Nystroem map is fitted on the preprocessed training features
      - ``train_torch_svm`` is trained on the Nystroem features
      - the eval split is scored as a whole, and again on each part of a
        5-way stratified partition of that split (the already-trained model
        is re-used; nothing is refitted per fold)

    Parameters:
      experiment_name: label used in log lines, printed output and the result dict.
      X, y: feature DataFrame and label array (labels 0 / 1).
      train_fraction: passed as ``train_size`` to ``train_test_split``.
      n_components: number of Nystroem components.
      gamma: RBF kernel coefficient. "scale" -> 1 / (n_features * variance of
             the preprocessed training matrix), "auto" -> 1 / n_features, or a
             number that is used as-is. (Previously this argument was ignored
             and "scale" was always applied.)
      epochs: maximum number of training epochs.

    Returns:
      dict with the metrics ("eval_F1", "eval_AUC", "kfold_F1", "kfold_AUC"),
      the confusion matrices, and the fitted "preprocessor", "nystroem" and "model".

    Raises:
      ValueError: if ``gamma`` is a string other than "scale" or "auto".
    """
    log(f"==== {experiment_name}: train_fraction={train_fraction:.2f} ====")

    X_train, X_eval, y_train, y_eval = train_test_split(
        X,
        y,
        train_size=train_fraction,
        stratify=y,
        random_state=42
    )

    log(f"{experiment_name}: Train size = {X_train.shape}, Eval size = {X_eval.shape}")

    # Clone the preprocessor template (fresh instance)
    import copy
    preprocessor = copy.deepcopy(preprocessor_template)

    # ------------------------------
    # Fit preprocessor on train only
    # ------------------------------
    log(f"{experiment_name}: Fitting preprocessor on train...")
    preprocessor.fit(X_train)
    X_train_proc = preprocessor.transform(X_train)
    X_eval_proc  = preprocessor.transform(X_eval)

    # Ensure dense
    X_train_proc = np.array(X_train_proc)
    X_eval_proc  = np.array(X_eval_proc)
    log(f"{experiment_name}: Preprocessed shapes: train={X_train_proc.shape}, eval={X_eval_proc.shape}")

    # ------------------------------
    # Nystroem kernel approximation
    # ------------------------------
    n_features = X_train_proc.shape[1]
    if isinstance(gamma, str):
        if gamma == "scale":
            log(f"{experiment_name}: Computing gamma='scale' numeric value...")
            # Same rule as SVC(gamma='scale'), including its fallback to 1.0
            # when the data has zero variance (avoids a division by zero).
            var = X_train_proc.var()
            gamma_value = 1.0 / (n_features * var) if var != 0 else 1.0
        elif gamma == "auto":
            gamma_value = 1.0 / n_features
        else:
            raise ValueError(
                f"gamma must be 'scale', 'auto' or a number, got {gamma!r}"
            )
    else:
        gamma_value = float(gamma)

    log(f"{experiment_name}: Using gamma={gamma_value:.6f}")

    log(f"{experiment_name}: Fitting Nystroem (approximate RBF kernel)...")
    nystroem = Nystroem(
        kernel="rbf",
        gamma=float(gamma_value),
        n_components=n_components,
        random_state=42
    )
    X_train_nys = nystroem.fit_transform(X_train_proc)
    X_eval_nys  = nystroem.transform(X_eval_proc)

    log(f"{experiment_name}: Nystroem shapes: train={X_train_nys.shape}, eval={X_eval_nys.shape}")

    # ------------------------------
    # Train Torch Linear SVM on Nystroem features
    # ------------------------------
    log(f"{experiment_name}: Training Torch Linear SVM on GPU (if available)...")
    model = train_torch_svm(
        X_train_np=X_train_nys,
        y_train_np=y_train,
        epochs=epochs,
        batch_size=512,
        lr=1e-3,
        weight_decay=1e-2,
        patience=50,
        verbose=True
    )

    # ------------------------------
    # Evaluation on eval split
    # ------------------------------
    log(f"{experiment_name}: Evaluating on eval split...")
    y_pred_eval, scores_eval = svm_predict(model, X_eval_nys)
    # Use sigmoid(scores) as pseudo-probabilities for class 1
    probs_eval = 1.0 / (1.0 + np.exp(-scores_eval))

    f1 = f1_score(y_eval, y_pred_eval, average="macro")
    auc = roc_auc_score(y_eval, probs_eval)
    cm = confusion_matrix(y_eval, y_pred_eval)

    print(f"\n[{experiment_name}] Eval F1 = {f1:.4f}, Eval AUC = {auc:.4f}")
    print(f"[{experiment_name}] Confusion matrix:\n{cm}")

    # ------------------------------
    # K-fold on eval split (like original script)
    # ------------------------------
    log(f"{experiment_name}: Starting K-fold evaluation on eval split...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_f1, fold_auc = [], []
    combined_cm = np.zeros((2, 2), dtype=int)

    # Positional (0..n-1) views so the fold indices line up with X_eval_nys rows.
    X_eval_df = X_eval.reset_index(drop=True)
    y_eval_arr = pd.Series(y_eval).reset_index(drop=True).to_numpy()

    # For each fold, we *re-use* the same trained model and precomputed Nystroem representation
    # just like the original SVM script reused the same classifier.
    # Only the held-out index set of each split is used; the other part is ignored.
    for fold, (_, idx) in enumerate(skf.split(X_eval_df, y_eval_arr), 1):
        log(f"{experiment_name}: K-Fold {fold}...")

        X_fold = X_eval_nys[idx]
        y_fold = y_eval_arr[idx]

        y_pred_fold, scores_fold = svm_predict(model, X_fold)
        probs_fold = 1.0 / (1.0 + np.exp(-scores_fold))

        f1_fold = f1_score(y_fold, y_pred_fold, average="macro")
        auc_fold = roc_auc_score(y_fold, probs_fold)
        cm_fold = confusion_matrix(y_fold, y_pred_fold)

        fold_f1.append(f1_fold)
        fold_auc.append(auc_fold)
        combined_cm += cm_fold

        print(f"  Fold {fold}: F1={f1_fold:.4f}, AUC={auc_fold:.4f}")
        print(f"  Confusion matrix:\n{cm_fold}")

    avg_f1 = np.mean(fold_f1)
    avg_auc = np.mean(fold_auc)

    print(f"\n[{experiment_name}] K-fold Avg F1 = {avg_f1:.4f}, Avg AUC = {avg_auc:.4f}")
    print(f"[{experiment_name}] K-fold combined confusion matrix:\n{combined_cm}")

    results = {
        "experiment": experiment_name,
        "train_fraction": train_fraction,
        "eval_F1": f1,
        "eval_AUC": auc,
        "kfold_F1": avg_f1,
        "kfold_AUC": avg_auc,
        "eval_confusion_matrix": cm,
        "kfold_confusion_matrix": combined_cm,
        "preprocessor": preprocessor,
        "nystroem": nystroem,
        "model": model
    }
    return results

# ======================================================
# 9. Run experiments:
#    A) 80/20 on FULL dataset
#    B) 80/20 on MINI dataset (20% of full)
# ======================================================

log("===== Creating MINI-DATASET (20% of full dataset) =====")

# Take 20% of full dataset to simulate "small dataset".
# The split is stratified on the target; the complementary 80% is discarded.
mini_split = train_test_split(
    X_full,
    y_full,
    train_size=0.20,     # mini dataset = 20% of full
    stratify=y_full,
    random_state=42
)
X_mini = mini_split[0]
y_mini = mini_split[2]

log(f"Mini dataset shape: {X_mini.shape}, y_mini shape: {y_mini.shape}")

# ======================================================
# A) Full dataset 80/20 experiment
# ======================================================
# Settings shared by both runs: always an 80/20 split, 512 Nystroem components.
shared_settings = dict(
    train_fraction=0.80,
    n_components=512,
    gamma="scale",
)

# A) Full dataset, 80/20 split (uses run_experiment's default epoch budget).
results_full_80_20 = run_experiment(
    experiment_name="Full_80_20",
    X=X_full,
    y=y_full,
    **shared_settings
)

# ======================================================
# B) Mini dataset 80/20 experiment
# ======================================================
# NOTE(review): this run gets epochs=100 while the full run keeps the default,
# so the two results are not trained with the same budget — confirm intended.
results_mini_80_20 = run_experiment(
    experiment_name="Mini_80_20",
    X=X_mini,
    y=y_mini,
    epochs=100,
    **shared_settings
)

log("===== Both experiments completed =====")

# Summary of both experiments. Sample counts are derived from the data:
# the previous hard-coded figures did not match the actual mini-dataset size.
full_rows = len(X_full)
mini_rows = len(X_mini)

print("\n================= COMPARISON =================")
print(f"Full_80_20: Train size=80% of {full_rows} → ~{int(0.80 * full_rows):,} samples")
print(f"  Eval F1 = {results_full_80_20['eval_F1']:.4f}  |  KFold F1 = {results_full_80_20['kfold_F1']:.4f}")

print(f"\nMini_80_20: Train size=80% of 20% mini dataset (~{mini_rows:,} samples → train ≈ {int(0.80 * mini_rows):,})")
print(f"  Eval F1 = {results_mini_80_20['eval_F1']:.4f}  |  KFold F1 = {results_mini_80_20['kfold_F1']:.4f}")





[16:57:35] Using device: cuda | +0.00s
[16:57:35] Loading datasets... | +0.00s
[16:57:35] Train shape: (59611, 24), Test shape: (14900, 23) | +0.18s
[16:57:35] Setting column groups... | +0.00s
[16:57:35] Normalizing boolean columns... | +0.00s
[16:57:35] Boolean normalization done. | +0.07s
[16:57:35] Applying ordinal encoding... | +0.00s
[16:57:35] Ordinal encoding done. | +0.18s
[16:57:35] Converting target variable... | +0.00s
[16:57:35] Building preprocessing pipelines... | +0.01s
[16:57:35] Preprocessing setup ready. | +0.00s
[16:57:35] ===== Creating MINI-DATASET (20% of full dataset) ===== | +0.00s
[16:57:35] Mini dataset shape: (11922, 23), y_mini shape: (11922,) | +0.03s
[16:57:35] ==== Full_80_20: train_fraction=0.80 ==== | +0.00s
[16:57:36] Full_80_20: Train size = (47688, 23), Eval size = (11923, 23) | +0.03s
[16:57:36] Full_80_20: Fitting preprocessor on train... | +0.00s
[16:57:36] Full_80_20: Preprocessed shapes: train=(47688, 34), eval=(11923, 34) | +0.30s
[16:57:36] F

In [13]:
# ======================================================
# 0. Imports & basic setup
# ======================================================
import os
import time
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.kernel_approximation import Nystroem

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader



# ------------------------------------------------------
# Logging helper
# ------------------------------------------------------
# Wall-clock time of the most recent log() call (initialised when this cell runs).
last_time = time.time()


def log(msg):
    """Print ``msg`` with a HH:MM:SS prefix and the seconds elapsed since the previous call.

    Side effect: rebinds the module-level ``last_time`` to the time of this call.
    """
    global last_time
    current = time.time()
    stamp = time.strftime('%H:%M:%S')
    delta = current - last_time
    print(f"[{stamp}] {msg} | +{delta:.2f}s")
    last_time = current


# ------------------------------------------------------
# Reproducibility
# ------------------------------------------------------
def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch RNGs (plus all CUDA devices when available)."""
    import random

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG before any randomised work happens.
set_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
log(f"Using device: {device}")


# ======================================================
# 1. Load data
# ======================================================
log("Loading datasets...")
train_path = "/kaggle/input/start-up-founder-retention-prediction/train.csv"
test_path  = "/kaggle/input/start-up-founder-retention-prediction/test.csv"

# df: training CSV (contains the target column); df_ext_test: external test CSV.
df, df_ext_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
log(f"Train shape: {df.shape}, Test shape: {df_ext_test.shape}")


# ======================================================
# 2. Column groups & target
# ======================================================
# Columns routed to the numeric pipeline.
# NOTE(review): founder_id looks like an identifier — confirm it is meant to be a feature.
numeric_cols = [
    "founder_id",
    "founder_age",
    "years_with_startup",
    "monthly_revenue_generated",
    "funding_rounds_led",
    "distance_from_investor_hub",
    "num_dependents",
]

# Columns with an ordered set of text levels (encoded to integer ranks further down).
ordinal_cols = [
    "founder_visibility",
    "startup_reputation",
    "team_size_category",
    "startup_stage",
    "startup_performance_rating",
    "venture_satisfaction",
    "work_life_balance_rating",
]

# Unordered categories, one-hot encoded by the preprocessor.
categorical_cols = [
    "founder_gender",
    "founder_role",
    "education_background",
    "personal_status",
    "innovation_support",
]

# Yes/no style flags, normalised to 0/1 below.
boolean_cols = [
    "working_overtime",
    "remote_operations",
    "leadership_scope",
]

# Name of the label column in the training frame.
target_col = "retention_status"



# ======================================================
# 3. Normalise boolean columns
# ======================================================
def normalize_boolean(col):
    """Convert a boolean-like Series to nullable integers (1, 0 or <NA>).

    Values are stringified, stripped and lower-cased before lookup, so
    ``True``/``"Yes "``/``1`` all map to 1 and ``False``/``"no"``/``0`` to 0.
    Anything not in the lookup table (including missing values, which
    stringify as "nan") becomes <NA>.

    Parameters:
        col: pandas Series holding the raw flag values.

    Returns:
        pandas Series of dtype ``Int64`` with the same index as ``col``.
    """
    lookup = {
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        "1": 1, "0": 0,
        # A 0/1 column that contains missing values is loaded as float64 and
        # stringifies as "1.0"/"0.0"; without these keys every row became <NA>.
        "1.0": 1, "0.0": 0,
    }
    tokens = col.astype(str).str.strip().str.lower()
    return tokens.map(lookup).astype("Int64")

# Apply the same 0/1 normalisation to the training frame and the external test frame.
for frame in (df, df_ext_test):
    for col in boolean_cols:
        frame[col] = normalize_boolean(frame[col])



# ======================================================
# 4. Ordinal encoding
# ======================================================
# Ordered levels per ordinal column, lowest first; a level's position is its integer code.
ordinal_levels = {
    "founder_visibility":         ["low", "medium", "high", "very high"],
    "startup_reputation":         ["poor", "fair", "good", "excellent"],
    "team_size_category":         ["small", "medium", "large"],
    "startup_stage":              ["entry", "mid", "senior"],
    "startup_performance_rating": ["low", "below average", "average", "high"],
    "venture_satisfaction":       ["low", "medium", "high", "very high"],
    "work_life_balance_rating":   ["fair", "good", "excellent"],
}

# column name -> {level text -> integer rank}
ordinal_mappings = {
    column: {level: rank for rank, level in enumerate(levels)}
    for column, levels in ordinal_levels.items()
}

def apply_ordinal(df_local, col, mapping):
    """Replace ``df_local[col]`` in place with the integer codes given by ``mapping``.

    Values are stringified, stripped and lower-cased before lookup; anything
    not found in ``mapping`` (including missing values) becomes NaN.
    Returns None — the frame is modified directly.
    """
    cleaned = df_local[col].astype(str)
    cleaned = cleaned.str.strip()
    cleaned = cleaned.str.lower()
    df_local[col] = cleaned.map(mapping)

# Encode every ordinal column in both frames using its own level mapping.
for frame in (df, df_ext_test):
    for col in ordinal_cols:
        apply_ordinal(frame, col, ordinal_mappings[col])



# ======================================================
# 5. Convert target to {0,1}
# ======================================================
# "Stayed" is the positive class (1), "Left" the negative class (0);
# any other label would be mapped to NaN by Series.map.
label_codes = {"Stayed":1, "Left":0}
df[target_col] = df[target_col].map(label_codes)

# Split the training frame into targets (array) and features (frame).
y_full = df[target_col].values
X_full = df.drop(columns=[target_col])



# ======================================================
# 6. Preprocessing: ColumnTransformer
# ======================================================
# Skewed numeric columns that get a log1p transform; keep only those present in numeric_cols.
skewed_numeric_cols = [
    c
    for c in ("monthly_revenue_generated", "distance_from_investor_hub")
    if c in numeric_cols
]
# Positions of those columns inside numeric_cols (used to index the numeric array).
skew_indices = list(map(numeric_cols.index, skewed_numeric_cols))

def log_transform_selected(X):
    """Return a float copy of ``X`` with log1p applied to the columns listed in ``skew_indices``.

    Negative entries in those columns are clipped to 0 before the transform.
    ``X`` is indexed as ``X[:, idx]``, i.e. it must be a 2-D array
    (presumably the imputer output inside the numeric pipeline — confirm).
    """
    out = X.copy().astype(float)
    for column in skew_indices:
        non_negative = np.clip(out[:, column], 0, None)
        out[:, column] = np.log1p(non_negative)
    return out

# Numeric features: median imputation → log1p on the skewed columns → standardisation.
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(log_transform_selected, validate=False)),
    ("scale", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

def make_ohe():
    """Build a dense-output ``OneHotEncoder`` that ignores unseen categories.

    The keyword selecting dense output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2. The previous implementation picked
    the keyword by comparing version *strings*, which is lexicographic
    ("1.10" < "1.2") and therefore picks the wrong branch on newer releases.
    Trying the new keyword first and falling back on ``TypeError`` works
    regardless of how the version is formatted.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn: the constructor does not know ``sparse_output``.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

# Ordinal codes: fill gaps with the most frequent level, then standardise.
ordinal_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("scale", StandardScaler()),
])

# Nominal categories: fill gaps with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", make_ohe()),
])

# 0/1 flags: only imputation, no scaling.
boolean_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
])

# Unfitted template; each experiment deep-copies it before fitting.
preprocessor_template = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_cols),
    ("ord", ordinal_pipeline, ordinal_cols),
    ("cat", categorical_pipeline, categorical_cols),
    ("bool", boolean_pipeline, boolean_cols),
])

log("Preprocessing setup ready.")



# ======================================================
# 7. Neural Network Equivalent to SVM (MLP classifier)
# ======================================================
class NystroemMLP(nn.Module):
    """Two-hidden-layer MLP classifier producing 2 logits (Left / Stayed).

    Layout: input_dim -> hidden_dim -> hidden_dim // 2 -> 2, where each hidden
    layer is Linear + BatchNorm1d + ReLU + Dropout(dropout).
    """

    def __init__(self, input_dim, hidden_dim=256, dropout=0.2):
        super().__init__()
        half_dim = hidden_dim // 2

        layers = []
        for fan_in, fan_out in ((input_dim, hidden_dim), (hidden_dim, half_dim)):
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head: 2 classes: Left / Stayed
        layers.append(nn.Linear(half_dim, 2))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

class HungryNystroemMLP(nn.Module):
    """Deeper MLP classifier producing 2 logits.

    Layout: input_dim -> 512 -> 512 -> 256 -> 128 -> 2, where each hidden
    layer is Linear + BatchNorm1d + GELU + Dropout(dropout).
    Note the default dropout probability is 0.8.
    """

    def __init__(self, input_dim, dropout=0.8):
        super().__init__()

        widths = (input_dim, 512, 512, 256, 128)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.GELU(),
                nn.Dropout(dropout),
            ]
        # Output head: one logit per class.
        layers.append(nn.Linear(128, 2))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.model(x)
        return logits


# ======================================================
# 8. Train NN (early stopping on val macro F1)
# ======================================================
def train_nystroem_nn(
    X_train_np,
    y_train_np,
    epochs=100,
    batch_size=512,
    lr=1e-3,
    weight_decay=1e-1,
    patience=50,
    verbose=True
):
    """
    Train a ``HungryNystroemMLP`` on Nystroem features with early stopping
    on validation macro F1.

    Parameters:
      X_train_np: 2-D array of features (rows = samples).
      y_train_np: integer class labels (used as CrossEntropyLoss targets).
      epochs: maximum number of passes over the training part.
      batch_size: mini-batch size for the training DataLoader.
      lr, weight_decay: AdamW hyper-parameters.
      patience: stop after this many consecutive epochs without a new best
                validation macro F1.
      verbose: print per-epoch metrics, the early-stopping notice and the
               final summary.

    Returns:
      The trained model (on ``device``) with the weights of the best-scoring
      epoch restored.

    The last 10% of a random permutation of the data is held out as the
    validation set used for early stopping.
    """

    # Convert
    X = torch.tensor(X_train_np, dtype=torch.float32)
    y = torch.tensor(y_train_np, dtype=torch.long)

    # Validation split (10%)
    N = len(X)
    val_size = int(0.10 * N)
    train_size = N - val_size

    perm = torch.randperm(N)
    X = X[perm]
    y = y[perm]

    X_tr, X_val = X[:train_size], X[train_size:]
    y_tr, y_val = y[:train_size], y[train_size:]

    train_ds = TensorDataset(X_tr.to(device), y_tr.to(device))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    # The validation data never changes: prepare it once, not every epoch.
    X_val_dev = X_val.to(device)
    true_val = y_val.numpy()

    input_dim = X_train_np.shape[1]
    model = HungryNystroemMLP(input_dim=input_dim).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    best_state = None
    best_f1 = -1
    no_improve = 0

    # Train loop
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0

        for xb, yb in train_loader:
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            running_loss += loss.item() * xb.size(0)

        train_loss = running_loss / train_size

        # Validation macro F1
        model.eval()
        with torch.no_grad():
            logits_val = model(X_val_dev).cpu()
        preds_val = logits_val.argmax(dim=1).numpy()
        val_f1 = f1_score(true_val, preds_val, average="macro")

        if verbose:
            print(f"Epoch {epoch:03d} | Loss={train_loss:.4f} | Val Macro F1={val_f1:.4f}")

        # Early stopping
        if val_f1 > best_f1:
            best_f1 = val_f1
            # state_dict() returns references to the live parameter/buffer
            # tensors, which keep changing as training continues. Clone them
            # so the snapshot really holds the weights of this epoch.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            no_improve = 0
        else:
            no_improve += 1

        if no_improve >= patience:
            if verbose:
                print(f"Early stopping at epoch {epoch}.")
            break

    # Restore best model (best_state stays None when no epoch ran, e.g. epochs=0)
    if best_state is not None:
        model.load_state_dict(best_state)
    if verbose:
        print(f"Best validation macro F1: {best_f1:.4f}")
    return model



def nn_predict(model, X_np):
    """Predict classes and a class-1 score for each row of ``X_np``.

    Parameters:
        model: module returning two logits per row (column 0 / column 1).
        X_np: 2-D array of features.

    Returns:
        ``(preds, scores)`` as NumPy arrays. ``preds`` is the argmax class.
        ``scores`` is ``logit_1 - logit_0``: for a two-logit softmax head,
        ``sigmoid(scores)`` equals the softmax probability of class 1, so the
        score ranks samples exactly like that probability. The raw
        ``logit_1`` returned previously does not have this property and is
        not consistent with the argmax decision.
    """
    # Run on whatever device the model's weights live on, so inputs and
    # weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    X_t = torch.tensor(X_np, dtype=torch.float32).to(target_device)
    model.eval()
    with torch.no_grad():
        logits = model(X_t).cpu().numpy()
    preds = np.argmax(logits, axis=1)
    scores = logits[:, 1] - logits[:, 0]  # log-odds of class 1 vs class 0
    return preds, scores



# ======================================================
# 9. Experiment function (same as SVM version)
# ======================================================
def run_experiment(
    experiment_name,
    X,
    y,
    train_fraction=0.8,
    n_components=512,
    gamma="scale",
    epochs=60
):
    """
    Run one preprocessing + Nystroem + neural-network experiment.

    Steps:
      - stratified split of (X, y): train_fraction for training, rest for evaluation
      - a fresh deep copy of ``preprocessor_template`` is fitted on the training part only
      - an RBF Nystroem map is fitted on the preprocessed training features
      - ``train_nystroem_nn`` is trained on the Nystroem features
      - macro F1, AUC and the confusion matrix are computed on the eval split

    Parameters:
      experiment_name: label used in log lines, printed output and the result dict.
      X, y: feature frame and label array.
      train_fraction: passed as ``train_size`` to ``train_test_split``.
      n_components: number of Nystroem components.
      gamma: RBF kernel coefficient. "scale" -> 1 / (n_features * variance of
             the preprocessed training matrix), "auto" -> 1 / n_features, or a
             number that is used as-is. (Previously this argument was ignored
             and "scale" was always applied.)
      epochs: maximum number of training epochs.

    Returns:
      dict with "experiment", "eval_F1", "eval_AUC", "confusion_matrix" and
      the fitted "preprocessor", "nystroem" and "model".

    Raises:
      ValueError: if ``gamma`` is a string other than "scale" or "auto".
    """
    log(f"==== {experiment_name}: train_fraction={train_fraction:.2f} ====")

    # Train/eval split
    X_train, X_eval, y_train, y_eval = train_test_split(
        X,
        y,
        train_size=train_fraction,
        stratify=y,
        random_state=42
    )
    log(f"{experiment_name}: Train={X_train.shape}, Eval={X_eval.shape}")

    # Preprocessing (fitted on the training part only)
    import copy
    preprocessor = copy.deepcopy(preprocessor_template)
    preprocessor.fit(X_train)

    X_train_proc = np.array(preprocessor.transform(X_train))
    X_eval_proc  = np.array(preprocessor.transform(X_eval))

    # Resolve the kernel coefficient
    n_features = X_train_proc.shape[1]
    if isinstance(gamma, str):
        if gamma == "scale":
            # Same rule as SVC(gamma='scale'), including its fallback to 1.0
            # when the data has zero variance (avoids a division by zero).
            var = X_train_proc.var()
            gamma_value = 1.0 / (n_features * var) if var != 0 else 1.0
        elif gamma == "auto":
            gamma_value = 1.0 / n_features
        else:
            raise ValueError(
                f"gamma must be 'scale', 'auto' or a number, got {gamma!r}"
            )
    else:
        gamma_value = float(gamma)

    # Nystroem
    nys = Nystroem(
        kernel="rbf",
        gamma=float(gamma_value),
        n_components=n_components,
        random_state=42
    )

    X_train_nys = nys.fit_transform(X_train_proc)
    X_eval_nys  = nys.transform(X_eval_proc)

    # Train NN
    model = train_nystroem_nn(
        X_train_np=X_train_nys,
        y_train_np=y_train,
        epochs=epochs,
        batch_size=512,
        lr=1e-3,
        weight_decay=1e-2,
        patience=50,
        verbose=True
    )

    # Eval: squash the returned scores into (0, 1) before computing AUC
    y_pred, scores = nn_predict(model, X_eval_nys)
    probs = 1 / (1 + np.exp(-scores))

    f1 = f1_score(y_eval, y_pred, average="macro")
    auc = roc_auc_score(y_eval, probs)
    cm = confusion_matrix(y_eval, y_pred)

    print(f"\n[{experiment_name}] Eval Macro F1 = {f1:.4f}, AUC={auc:.4f}")
    print(f"[{experiment_name}] Confusion Matrix:\n{cm}")

    return {
        "experiment": experiment_name,
        "eval_F1": f1,
        "eval_AUC": auc,
        "confusion_matrix": cm,
        "preprocessor": preprocessor,
        "nystroem": nys,
        "model": model
    }



# ======================================================
# 10. Build mini-dataset (20%)
# ======================================================
log("Creating MINI dataset (20%)")

# Stratified 20% sample of the full data; the complementary 80% is discarded.
mini_split = train_test_split(
    X_full,
    y_full,
    train_size=0.20,
    stratify=y_full,
    random_state=42
)
X_mini = mini_split[0]
y_mini = mini_split[2]



# ======================================================
# 11. Run experiments
# ======================================================
# Both runs use the same split ratio, Nystroem size and epoch budget.
shared_settings = dict(
    train_fraction=0.80,
    n_components=512,
    epochs=60,
)

# Full dataset, 80/20 split.
results_full = run_experiment(
    experiment_name="Full_80_20",
    X=X_full,
    y=y_full,
    **shared_settings
)

# 20% mini dataset, 80/20 split.
results_mini = run_experiment(
    experiment_name="Mini_80_20",
    X=X_mini,
    y=y_mini,
    **shared_settings
)



# ======================================================
# 12. Comparison
# ======================================================
# Side-by-side macro F1 of the two experiments on their eval splits.
full_f1 = results_full['eval_F1']
mini_f1 = results_mini['eval_F1']

print("\n======= COMPARISON RESULTS =======")
print(f"Full_80_20: Macro F1 = {full_f1:.4f}")
print(f"Mini_80_20: Macro F1 = {mini_f1:.4f}")



[16:37:17] Using device: cuda | +0.00s
[16:37:17] Loading datasets... | +0.00s
[16:37:17] Train shape: (59611, 24), Test shape: (14900, 23) | +0.19s
[16:37:18] Preprocessing setup ready. | +0.26s
[16:37:18] Creating MINI dataset (20%) | +0.00s
[16:37:18] ==== Full_80_20: train_fraction=0.80 ==== | +0.03s
[16:37:18] Full_80_20: Train=(47688, 23), Eval=(11923, 23) | +0.02s
Epoch 001 | Loss=0.7223 | Val Macro F1=0.6997
Epoch 002 | Loss=0.5936 | Val Macro F1=0.7349
Epoch 003 | Loss=0.5459 | Val Macro F1=0.7436
Epoch 004 | Loss=0.5305 | Val Macro F1=0.7453
Epoch 005 | Loss=0.5223 | Val Macro F1=0.7460
Epoch 006 | Loss=0.5202 | Val Macro F1=0.7457
Epoch 007 | Loss=0.5188 | Val Macro F1=0.7429
Epoch 008 | Loss=0.5144 | Val Macro F1=0.7460
Epoch 009 | Loss=0.5122 | Val Macro F1=0.7454
Epoch 010 | Loss=0.5116 | Val Macro F1=0.7462
Epoch 011 | Loss=0.5100 | Val Macro F1=0.7434
Epoch 012 | Loss=0.5083 | Val Macro F1=0.7489
Epoch 013 | Loss=0.5082 | Val Macro F1=0.7501
Epoch 014 | Loss=0.5089 | Va