In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from tabpfn import TabPFNClassifier

In [5]:
# Global seed, reused for NumPy's legacy RNG here and for the splits further down.
SEED = 42
np.random.seed(SEED)

# Directory layout, relative to the notebook's working directory.
DATA_DIR = "../Data"                       # raw train/test/submission CSVs
PREPROCESSED_DIR = "../Data_Preprocessed"  # pre-processed CoverType training CSVs

# Covertype: three pre-processed training variants; the test file is the raw one.
COV_TRAIN_RESAMPLED  = os.path.join(PREPROCESSED_DIR, "covtype_resampled.csv")
COV_TRAIN_NORMALIZED = os.path.join(PREPROCESSED_DIR, "covtype_normalized.csv")
COV_TRAIN_SCALED     = os.path.join(PREPROCESSED_DIR, "covtype_scaled.csv")
COV_TEST             = os.path.join(DATA_DIR, "covtype_test.csv")

# HELOC
HELOC_TRAIN = os.path.join(DATA_DIR, "heloc_train.csv")
HELOC_TEST  = os.path.join(DATA_DIR, "heloc_test.csv")

# HIGGS
HIGGS_TRAIN = os.path.join(DATA_DIR, "higgs_train.csv")
HIGGS_TEST  = os.path.join(DATA_DIR, "higgs_test.csv")

# Sample submissions (optional)
COV_SAMPLE_SUB   = os.path.join(DATA_DIR, "covtype_test_submission.csv")
HELOC_SAMPLE_SUB = os.path.join(DATA_DIR, "heloc_test_submission.csv")
HIGGS_SAMPLE_SUB = os.path.join(DATA_DIR, "higgs_test_submission.csv")

# Echo the training paths so a wrong working directory is obvious right away.
print("Covertype train:", COV_TRAIN_RESAMPLED)
print("HELOC train:", HELOC_TRAIN)
print("HIGGS train:", HIGGS_TRAIN)

Covertype train: ../Data_Preprocessed/covtype_resampled.csv
HELOC train: ../Data/heloc_train.csv
HIGGS train: ../Data/higgs_train.csv


In [6]:
def load_covertype():
    """Load the resampled CoverType training set and the CoverType test set.

    Reads the CSV files at ``COV_TRAIN_RESAMPLED`` and ``COV_TEST``. The
    training file must contain the label in a column named ``Cover_Type`` or
    ``CoverType``; every other training column is treated as a feature.

    Returns
    -------
    X : pandas.DataFrame
        Training features (the label column removed).
    y : numpy.ndarray
        Raw training labels, exactly as stored in the file.
    X_test : pandas.DataFrame
        Test features restricted to, and ordered like, the columns of ``X``.

    Raises
    ------
    ValueError
        If no label column is found in the training file, or if the test file
        lacks one of the training feature columns.
    """
    df_train = pd.read_csv(COV_TRAIN_RESAMPLED)
    df_test = pd.read_csv(COV_TEST)

    # Target column can be 'Cover_Type' or 'CoverType' (first match wins).
    y_col = next((c for c in ("Cover_Type", "CoverType") if c in df_train.columns), None)
    if y_col is None:
        raise ValueError("Could not find CoverType label column in covertype training.csv")

    y = df_train[y_col].values
    X = df_train.drop(columns=[y_col])

    # Downstream code copies these frames into a fixed-width block by position,
    # so the test columns must match the training features in name AND order.
    missing = [c for c in X.columns if c not in df_test.columns]
    if missing:
        raise ValueError(f"CoverType test file is missing feature columns: {missing}")
    X_test = df_test.loc[:, X.columns].copy()

    return X, y, X_test

# Run the CoverType loader once; the three results are reused by later cells.
X_cov, y_cov, X_cov_test = load_covertype()

print("CoverType:", X_cov.shape, y_cov.shape)

# A single display call renders both frames, train first and test second.
display(X_cov, X_cov_test)

CoverType: (130047, 54) (130047,)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3351,206,27,726,124,3813,192,252,180,2271,...,0,0,0,0,0,0,0,1,0,0
1,3232,111,10,541,78,1342,237,229,122,2270,...,0,0,0,0,0,0,0,1,0,0
2,3176,144,17,67,19,4191,242,234,115,3172,...,0,0,0,0,0,0,0,0,0,0
3,2962,21,14,150,20,5622,210,209,133,3768,...,0,0,0,0,0,0,0,0,0,0
4,3171,123,16,175,45,3108,246,226,103,3648,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130042,3379,300,7,285,29,1585,199,236,176,802,...,0,0,0,0,0,0,0,1,0,0
130043,3204,119,6,38,-4,4153,222,235,147,3325,...,0,0,0,0,0,0,0,0,0,0
130044,3308,100,14,85,7,3495,242,220,104,2591,...,0,0,0,0,0,0,0,0,0,0
130045,3500,85,28,346,103,2483,246,179,42,3730,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3247,339,12,525,39,570,193,222,168,1618,...,0,0,0,0,0,0,0,0,0,0
1,3065,186,19,85,7,1570,220,250,155,2315,...,0,0,1,0,0,0,0,0,0,0
2,3169,84,21,190,22,3004,244,199,74,2064,...,1,0,0,0,0,0,0,0,0,0
3,2893,294,29,175,89,5019,127,218,225,4763,...,0,0,0,0,0,0,0,0,0,0
4,2825,81,19,85,20,108,242,202,82,134,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3432,172,14,480,232,2471,228,246,146,721,...,0,0,0,0,0,0,0,1,0,0
3496,3358,218,6,30,-1,4611,214,246,168,4251,...,0,0,0,0,0,0,0,0,0,0
3497,3431,184,12,458,106,4201,223,248,156,3102,...,0,0,1,0,0,0,0,0,0,0
3498,3378,331,36,255,121,960,114,170,181,1517,...,0,0,0,0,0,0,0,0,1,0


In [7]:
def load_heloc(show=True):
    """Load, clean and standardise the HELOC dataset.

    Reads the CSV files at ``HELOC_TRAIN`` and ``HELOC_TEST``. The training
    file must contain the column ``RiskPerformance`` (Good/Bad); every other
    training column is treated as a numeric feature.

    Processing steps:
    - label is encoded as 1 for "Bad" and 0 for anything else;
    - sentinel codes -7, -8, -9 are treated as missing and imputed with the
      per-column medians of the training data;
    - features are standardised with a ``StandardScaler`` fitted on train.

    Parameters
    ----------
    show : bool, default True
        When true, the imputed (not yet scaled) train and test frames are
        rendered with the notebook's ``display``. Pass ``False`` to use the
        loader where ``display`` is not available.

    Returns
    -------
    X_scaled : numpy.ndarray
        Standardised training features.
    y : numpy.ndarray
        Integer labels (Bad = 1).
    X_test_scaled : numpy.ndarray
        Test features, transformed with the training medians and scaler.

    Raises
    ------
    ValueError
        If the test file lacks one of the training feature columns.
    """
    df_train = pd.read_csv(HELOC_TRAIN)
    df_test = pd.read_csv(HELOC_TEST)

    # Label: Good/Bad -> 0/1 (Bad = 1)
    y = (df_train["RiskPerformance"] == "Bad").astype(int).values
    X = df_train.drop(columns=["RiskPerformance"]).astype(np.float32)

    # Take the test features by name in training order, so the scaler and the
    # positional embedding downstream see the same layout as the training data.
    missing = [c for c in X.columns if c not in df_test.columns]
    if missing:
        raise ValueError(f"HELOC test file is missing feature columns: {missing}")
    X_test = df_test.loc[:, X.columns].astype(np.float32)

    # Replace sentinel values with NaN
    sentinel = [-7, -8, -9]
    X = X.replace(sentinel, np.nan)
    X_test = X_test.replace(sentinel, np.nan)

    # Impute NaNs with train medians (test reuses the training statistics)
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    if show:
        display(X)
        display(X_test)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    return X_scaled, y, X_test_scaled

# Run the HELOC loader once; the scaled arrays are reused by later cells.
X_heloc, y_heloc, X_heloc_test = load_heloc()

# Report the training feature/label shapes as a quick sanity check.
print("HELOC:", *(part.shape for part in (X_heloc, y_heloc)))

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,69.0,148.0,4.0,66.0,41.0,0.0,0.0,100.0,15.0,7.0,...,10.0,0.0,1.0,1.0,32.0,60.0,7.0,3.0,1.0,50.0
1,77.0,229.0,3.0,109.0,23.0,0.0,0.0,100.0,15.0,7.0,...,35.0,0.0,0.0,0.0,38.0,93.0,4.0,3.0,1.0,58.0
2,58.0,46.0,7.0,38.0,13.0,0.0,0.0,93.0,8.0,4.0,...,50.0,0.0,2.0,2.0,80.0,84.0,5.0,4.0,1.0,90.0
3,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
4,80.0,226.0,2.0,66.0,35.0,0.0,0.0,100.0,15.0,7.0,...,47.0,0.0,0.0,0.0,2.0,77.0,5.0,7.0,0.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9408,65.0,115.0,11.0,43.0,19.0,0.0,0.0,90.0,1.0,4.0,...,50.0,0.0,0.0,0.0,52.0,77.0,5.0,6.0,0.0,85.0
9409,77.0,437.0,8.0,115.0,35.0,0.0,0.0,100.0,15.0,7.0,...,26.0,1.0,3.0,3.0,23.0,74.0,6.0,2.0,0.0,50.0
9410,75.0,140.0,7.0,56.0,21.0,0.0,0.0,100.0,15.0,7.0,...,27.0,0.0,1.0,1.0,20.0,63.0,3.0,2.0,1.0,56.0
9411,64.0,92.0,3.0,35.0,21.0,2.0,2.0,91.0,33.0,6.0,...,23.0,0.0,1.0,1.0,9.0,58.0,3.0,4.0,0.0,53.0


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,81.0,333.0,27.0,132.0,12.0,0.0,0.0,100.0,15.0,7.0,...,25.0,0.0,1.0,1.0,51.0,89.0,3.0,1.0,0.0,80.0
1,59.0,137.0,11.0,78.0,31.0,0.0,0.0,91.0,1.0,4.0,...,47.0,0.0,0.0,0.0,62.0,93.0,12.0,4.0,3.0,94.0
2,61.0,79.0,4.0,36.0,19.0,0.0,0.0,95.0,5.0,4.0,...,26.0,0.0,6.0,6.0,31.0,86.0,5.0,3.0,1.0,62.0
3,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
4,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,87.0,296.0,26.0,120.0,12.0,0.0,0.0,100.0,15.0,7.0,...,42.0,0.0,0.0,0.0,17.0,33.0,2.0,1.0,0.0,75.0
1042,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
1043,75.0,162.0,8.0,77.0,40.0,0.0,0.0,100.0,15.0,7.0,...,20.0,7.0,0.0,0.0,27.0,73.0,5.0,2.0,1.0,64.0
1044,71.0,155.0,1.0,71.0,24.0,0.0,0.0,100.0,15.0,7.0,...,32.0,3.0,3.0,3.0,54.0,73.0,3.0,1.0,1.0,100.0


HELOC: (9413, 23) (9413,)


In [8]:

def load_higgs():
    """Load, clean and standardise the HIGGS dataset.

    Reads the CSV files at ``HIGGS_TRAIN`` and ``HIGGS_TEST``.

    - The target is taken from ``Label`` (encoded as 1 where the value is "s",
      0 otherwise) or, failing that, from ``label`` (cast to int).
    - ``Weight`` provides per-row sample weights; if absent, all weights are 1.
    - Every training column other than EventId / Weight / Label / label is a
      feature; the same columns are selected from the test file.
    - -999.0 is treated as missing and imputed with the training medians.
    - Features are standardised with a ``StandardScaler`` fitted on train.

    Returns
    -------
    tuple
        ``(X_scaled, y, w, X_test_scaled, event_id_train, event_id_test)``;
        each event-id entry is the ``EventId`` column of the respective file,
        or ``None`` when that file has no such column.

    Raises
    ------
    ValueError
        If the training file has neither a ``Label`` nor a ``label`` column.
    """
    train_df = pd.read_csv(HIGGS_TRAIN)
    test_df = pd.read_csv(HIGGS_TEST)
    train_cols = train_df.columns

    # Target: 'Label' holds b/s strings, 'label' holds 0/1 integers.
    if "Label" in train_cols:
        y = (train_df["Label"] == "s").astype(int).values
    elif "label" in train_cols:
        y = train_df["label"].astype(int).values
    else:
        raise ValueError("Could not find label column ('Label' or 'label') in HIGGS training.csv")

    # Sample weights fall back to uniform when the file carries none.
    if "Weight" in train_cols:
        w = train_df["Weight"].values.astype(np.float32)
    else:
        w = np.ones(len(train_df), dtype=np.float32)

    # Everything that is not an id, weight or label column is a feature.
    non_features = {"EventId", "Weight", "Label", "label"}
    feature_cols = [col for col in train_cols if col not in non_features]

    def _as_features(frame):
        # Select the feature columns and turn the -999.0 sentinel into NaN.
        return frame[feature_cols].replace(-999.0, np.nan).astype(np.float32)

    X = _as_features(train_df)
    X_test = _as_features(test_df)

    # Impute with training medians so the test set reuses training statistics.
    train_medians = X.median()
    X, X_test = X.fillna(train_medians), X_test.fillna(train_medians)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    # Event IDs (optional, useful for submissions); .get yields None if absent.
    event_id_train = train_df.get("EventId")
    event_id_test = test_df.get("EventId")

    return X_scaled, y, w, X_test_scaled, event_id_train, event_id_test

# Run the HIGGS loader once; weights and event ids are kept for later cells.
(X_higgs, y_higgs, w_higgs,
 X_higgs_test, eid_tr, eid_te) = load_higgs()

# Report the training feature/label shapes as a quick sanity check.
print("HIGGS:", *(part.shape for part in (X_higgs, y_higgs)))

HIGGS: (175000, 30) (175000,)


In [9]:
def _print_shapes(header, prefix, named_arrays):
    """Print a header, one aligned ``<prefix>_<name> shape:`` line per array, and the summed row count."""
    print(header)
    for name, arr in named_arrays:
        label = f"{prefix}_{name} shape:"
        # Labels are left-padded to 15 characters so the shapes line up.
        print(f"{label:<15}{arr.shape}")
    row_total = sum(arr.shape[0] for _, arr in named_arrays)
    print(f"Rows total {prefix}:   {row_total}")


# 1. Shapes of the feature matrices, plus the combined number of rows
_print_shapes(
    "--- Shapes of X ---", "X",
    [("cov", X_cov), ("heloc", X_heloc), ("higgs", X_higgs)],
)
print("\n" + "="*30 + "\n")

# 2. Shapes of the target vectors, plus the combined number of rows
_print_shapes(
    "--- Shapes of y ---", "y",
    [("cov", y_cov), ("heloc", y_heloc), ("higgs", y_higgs)],
)

# Feature block sizes: number of columns contributed by each dataset
d_cov, d_heloc, d_higgs = (block.shape[1] for block in (X_cov, X_heloc, X_higgs))

# Total unified feature length + 3 dataset-indicator features
D_total = d_cov + d_heloc + d_higgs + 3
print(D_total)

--- Shapes of X ---
X_cov shape:   (130047, 54)
X_heloc shape: (9413, 23)
X_higgs shape: (175000, 30)
Rows total X:   314460


--- Shapes of y ---
y_cov shape:   (130047,)
y_heloc shape: (9413,)
y_higgs shape: (175000,)
Rows total y:   314460
110


In [10]:
def embed_block(X, dataset_idx):
    """Embed one dataset's features into the unified feature space.

    The unified row layout is ``[CoverType block | HELOC block | HIGGS block |
    3 dataset-indicator columns]``; block widths come from the module-level
    ``d_cov``, ``d_heloc``, ``d_higgs`` and the total width from ``D_total``.
    Columns outside the selected block stay zero and the indicator column for
    ``dataset_idx`` is set to 1.

    Parameters
    ----------
    X : array-like of shape (n_samples, d_dataset)
        Features of a single dataset (NumPy array or DataFrame).
    dataset_idx : int
        0 = CoverType, 1 = HELOC, 2 = HIGGS.

    Returns
    -------
    numpy.ndarray of shape (n_samples, D_total), dtype float32

    Raises
    ------
    ValueError
        If ``dataset_idx`` is not 0, 1 or 2, or if ``X`` is not 2-D with
        exactly the width of the selected block.
    """
    block_widths = (d_cov, d_heloc, d_higgs)

    # Reject bad indices up front: a negative index would otherwise leave the
    # feature block empty and flip the wrong column instead of failing.
    if dataset_idx not in (0, 1, 2):
        raise ValueError(
            f"dataset_idx must be 0 (CoverType), 1 (HELOC) or 2 (HIGGS), got {dataset_idx!r}"
        )
    dataset_idx = int(dataset_idx)

    width = block_widths[dataset_idx]
    start = sum(block_widths[:dataset_idx])

    # Check the width explicitly: NumPy would silently broadcast a one-column
    # input across the whole block.
    X_arr = np.asarray(X)
    if X_arr.ndim != 2 or X_arr.shape[1] != width:
        raise ValueError(
            f"X must have shape (n_samples, {width}) for dataset_idx={dataset_idx}, "
            f"got {X_arr.shape}"
        )

    Z = np.zeros((X_arr.shape[0], D_total), dtype=np.float32)
    Z[:, start:start + width] = X_arr

    # Dataset one-hot in last 3 positions
    Z[:, D_total - 3 + dataset_idx] = 1.0

    return Z

# --- Labels: put all three tasks into one shared label space ---------------
# CoverType: original labels (1..7) -> 0..6, by rank among the sorted unique
# values (np.unique already returns them sorted).
cov_unique = np.unique(y_cov)
cov_map = dict(zip(cov_unique, range(len(cov_unique))))
# Every label occurs in cov_unique, so its insertion point is its mapped index.
y_cov_int = np.searchsorted(cov_unique, y_cov).astype(np.int64)

# HELOC: 0/1 -> 7/8, HIGGS: 0/1 -> 9/10
y_heloc_int = 7 + y_heloc.astype(np.int64)
y_higgs_int = 9 + y_higgs.astype(np.int64)

# --- Features: embed each dataset into its own block ------------------------
# The position in each tuple is the dataset index expected by embed_block.
X_cov_emb, X_heloc_emb, X_higgs_emb = (
    embed_block(block, dataset_idx=idx)
    for idx, block in enumerate((X_cov, X_heloc, X_higgs))
)

# Same embedding for the test sets – needed for the submission step
X_cov_test_emb, X_heloc_test_emb, X_higgs_test_emb = (
    embed_block(block, dataset_idx=idx)
    for idx, block in enumerate((X_cov_test, X_heloc_test, X_higgs_test))
)

# --- Stack the training data in CoverType, HELOC, HIGGS order ---------------
X_all = np.concatenate((X_cov_emb, X_heloc_emb, X_higgs_emb), axis=0)
y_all = np.concatenate((y_cov_int, y_heloc_int, y_higgs_int))

# --- Sample weights ----------------------------------------------------------
# CoverType & HELOC rows weigh 1; HIGGS weights are divided by their mean.
w_cov = np.ones(y_cov_int.shape, dtype=np.float32)
w_heloc = np.ones(y_heloc_int.shape, dtype=np.float32)
w_h_norm = w_higgs.astype(np.float32) / np.mean(w_higgs)

sample_weight_all = np.concatenate((w_cov, w_heloc, w_h_norm))

print("Unified X_all:", X_all.shape, "y_all:", y_all.shape)


Unified X_all: (314460, 110) y_all: (314460,)


In [None]:
# Hold out 20% of the unified data for validation. Stratifying on the unified
# label keeps every class of every dataset represented in both parts, and the
# sample weights are split alongside the rows they belong to.
(X_train, X_val,
 y_train, y_val,
 w_train, w_val) = train_test_split(
    X_all, y_all, sample_weight_all,
    test_size=0.2,
    stratify=y_all,
    random_state=SEED,
)

print("Train:", X_train.shape, "Val:", X_val.shape)