# Imports

In [9]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Configuration

In [10]:
SEED = 42
np.random.seed(SEED)

DATA_DIR = "../Data"  # Folder

# Covertype
COV_TRAIN = os.path.join(DATA_DIR, "covtype_train.csv")
COV_TEST  = os.path.join(DATA_DIR, "covtype_test.csv")

# HELOC
HELOC_TRAIN = os.path.join(DATA_DIR, "heloc_train.csv")
HELOC_TEST  = os.path.join(DATA_DIR, "heloc_test.csv")

# HIGGS
HIGGS_TRAIN = os.path.join(DATA_DIR, "higgs_train.csv")
HIGGS_TEST  = os.path.join(DATA_DIR, "higgs_test.csv")

# Sample submissions (optional)
COV_SAMPLE_SUB   = os.path.join(DATA_DIR, "covtype_test_submission.csv")
HELOC_SAMPLE_SUB = os.path.join(DATA_DIR, "heloc_test_submission.csv")
HIGGS_SAMPLE_SUB = os.path.join(DATA_DIR, "higgs_test_submission.csv")

print("Data directory:", os.path.abspath(DATA_DIR))
print("Covertype train:", COV_TRAIN)
print("HELOC train:", HELOC_TRAIN)
print("HIGGS train:", HIGGS_TRAIN)


Data directory: /home/ronlakeman/Applied Machine Learning/UvA-AML-2025/AppliedMachineLearning/Data
Covertype train: ../Data/covtype_train.csv
HELOC train: ../Data/heloc_train.csv
HIGGS train: ../Data/higgs_train.csv


# Load and preprocess

In [11]:
def load_covertype():
    """Load and preprocess the CoverType dataset.

    Expects:
    - training.csv with a column Cover_Type or CoverType as target.
    - test.csv with the same feature columns (without target).
    """
    df_train = pd.read_csv(COV_TRAIN)
    df_test  = pd.read_csv(COV_TEST)

    # Target column can be 'Cover_Type' or 'CoverType'
    if "Cover_Type" in df_train.columns:
        y_col = "Cover_Type"
    elif "CoverType" in df_train.columns:
        y_col = "CoverType"
    else:
        raise ValueError("Could not find CoverType label column in covertype training.csv")

    y = df_train[y_col].values
    X = df_train.drop(columns=[y_col])
    X_test = df_test.copy()

    # All numeric; scale with StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X.astype(np.float32))
    X_test_scaled = scaler.transform(X_test.astype(np.float32))

    return X_scaled, y, X_test_scaled


def load_heloc():
    """Load and preprocess the HELOC dataset.

    - training.csv with column 'RiskPerformance' (Good/Bad).
    - test.csv with the same feature columns.
    - Sentinel codes -7, -8, -9 are treated as missing and imputed.
    """
    df_train = pd.read_csv(HELOC_TRAIN)
    df_test  = pd.read_csv(HELOC_TEST)

    # Label: Good/Bad -> 0/1 (Bad = 1)
    y = (df_train["RiskPerformance"] == "Bad").astype(int).values
    X = df_train.drop(columns=["RiskPerformance"])
    X_test = df_test.copy()

    # Replace sentinel values with NaN
    sentinel = [-7, -8, -9]
    X = X.replace(sentinel, np.nan).astype(np.float32)
    X_test = X_test.replace(sentinel, np.nan).astype(np.float32)

    # Impute NaNs with train medians
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    return X_scaled, y, X_test_scaled


def load_higgs():
    """Load and preprocess the HIGGS dataset.

    - training.csv with columns: EventId, 30 features, Weight, Label (b/s or 0/1).
    - test.csv with EventId and the 30 feature columns.

    We treat -999.0 as missing and impute with medians.
    """
    df_train = pd.read_csv(HIGGS_TRAIN)
    df_test  = pd.read_csv(HIGGS_TEST)

    # Label may be given as 'Label' (b/s) or 'label' (0/1)
    if "Label" in df_train.columns:
        y_raw = df_train["Label"]
        y = (y_raw == "s").astype(int).values
    elif "label" in df_train.columns:
        y = df_train["label"].astype(int).values
    else:
        raise ValueError("Could not find label column ('Label' or 'label') in HIGGS training.csv")

    # Sample weights (if available)
    if "Weight" in df_train.columns:
        w = df_train["Weight"].values.astype(np.float32)
    else:
        w = np.ones(len(df_train), dtype=np.float32)

    # Features: drop ID, Weight, label columns
    drop_cols = [c for c in ["EventId", "Weight", "Label", "label"] if c in df_train.columns]
    feature_cols = [c for c in df_train.columns if c not in drop_cols]
    X = df_train[feature_cols].copy()
    X_test = df_test[feature_cols].copy()

    # Replace sentinel -999.0 with NaN
    X = X.replace(-999.0, np.nan).astype(np.float32)
    X_test = X_test.replace(-999.0, np.nan).astype(np.float32)

    # Impute with medians
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    # Event IDs (optional, useful for submissions)
    event_id_train = df_train["EventId"] if "EventId" in df_train.columns else None
    event_id_test  = df_test["EventId"] if "EventId" in df_test.columns else None

    return X_scaled, y, w, X_test_scaled, event_id_train, event_id_test

# load datasets

In [12]:
X_cov,   y_cov,   X_cov_test   = load_covertype()
X_heloc, y_heloc, X_heloc_test = load_heloc()
X_higgs, y_higgs, w_higgs, X_higgs_test, eid_tr, eid_te = load_higgs()

print("CoverType:", X_cov.shape, y_cov.shape)
print("HELOC:", X_heloc.shape, y_heloc.shape)
print("HIGGS:", X_higgs.shape, y_higgs.shape)

CoverType: (58101, 54) (58101,)
HELOC: (9413, 23) (9413,)
HIGGS: (175000, 30) (175000,)


# Build a unified feature and label


In [25]:
# Feature block sizes
d_cov   = X_cov.shape[1]
d_heloc = X_heloc.shape[1]
d_higgs = X_higgs.shape[1]

# Total unified feature length + 3 dataset-indicator features
D_total = d_cov + d_heloc + d_higgs + 3

def embed_block(X, dataset_idx):
    """
    Embed X (n_samples, d_dataset) into unified feature space.

    dataset_idx: 0 = CoverType, 1 = HELOC, 2 = HIGGS
    """
    n = X.shape[0]
    Z = np.zeros((n, D_total), dtype=np.float32)

    if dataset_idx == 0:          # CoverType block
        Z[:, :d_cov] = X
    elif dataset_idx == 1:        # HELOC block
        Z[:, d_cov:d_cov + d_heloc] = X
    elif dataset_idx == 2:        # HIGGS block
        Z[:, d_cov + d_heloc:d_cov + d_heloc + d_higgs] = X
    else:
        raise ValueError("Invalid dataset_idx, expected 0,1,2")

    # Dataset one-hot in last 3 positions
    Z[:, D_total - 3 + dataset_idx] = 1.0

    return Z


# ----- Map labels into unified label space -----

# CoverType: original labels (1..7) -> map to 0..6
cov_unique = np.sort(np.unique(y_cov))
cov_map = {v: i for i, v in enumerate(cov_unique)}
y_cov_int = np.array([cov_map[v] for v in y_cov], dtype=np.int64)

# HELOC: 0/1 -> shift by +7  (7, 8)
y_heloc_int = y_heloc.astype(np.int64) + 7

# HIGGS: 0/1 -> shift by +9  (9, 10)
y_higgs_int = y_higgs.astype(np.int64) + 9

# Embed features (train)
X_cov_emb   = embed_block(X_cov,   dataset_idx=0)
X_heloc_emb = embed_block(X_heloc, dataset_idx=1)
X_higgs_emb = embed_block(X_higgs, dataset_idx=2)

# Embed features (test) – needed for the submission step
X_cov_test_emb   = embed_block(X_cov_test,   dataset_idx=0)
X_heloc_test_emb = embed_block(X_heloc_test, dataset_idx=1)
X_higgs_test_emb = embed_block(X_higgs_test, dataset_idx=2)

# Concatenate everything (train)
X_all = np.vstack([X_cov_emb, X_heloc_emb, X_higgs_emb])
y_all = np.concatenate([y_cov_int, y_heloc_int, y_higgs_int])

# Sample weights: CoverType & HELOC = 1, HIGGS uses its normalised weight
w_cov   = np.ones_like(y_cov_int,   dtype=np.float32)
w_heloc = np.ones_like(y_heloc_int, dtype=np.float32)
w_higgs = np.ones_like(y_heloc_int, dtype=np.float32)

sample_weight_all = np.concatenate([w_cov, w_heloc, w_higgs])

print("Unified X_all:", X_all.shape, "y_all:", y_all.shape)


Unified X_all: (242514, 110) y_all: (242514,)


# Train/validation split

In [14]:
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X_all,
    y_all,
    sample_weight_all,
    test_size=0.2,
    random_state=SEED,
    stratify=y_all,
)

print("Train:", X_train.shape, "Val:", X_val.shape)

Train: (194011, 110) Val: (48503, 110)


# Train foundationmodel

In [None]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [6, 8],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

xgb_base = XGBClassifier(
    random_state=SEED,
    use_label_encoder=False,
    eval_metric="mlogloss",
    n_jobs=-1,
    verbosity=1,
)

grid_search = GridSearchCV(
    xgb_base,
    param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

print("Starting grid search...")
grid_search.fit(X_train, y_train, sample_weight=w_train)

clf = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV accuracy: {grid_search.best_score_:.4f}")

# Evaluation

In [28]:
def eval_dataset(X_emb, y_int, name):
    y_pred_int = clf.predict(X_emb)
    acc = accuracy_score(y_int, y_pred_int)
    print(f"{name} accuracy (on provided data): {acc:.4f}")
    return acc

print("CoverType:", eval_dataset(X_cov_emb,   y_cov_int,   "CoverType"))
print("HELOC:   ", eval_dataset(X_heloc_emb, y_heloc_int, "HELOC"))
print("HIGGS:   ", eval_dataset(X_higgs_emb, y_higgs_int, "HIGGS"))


CoverType accuracy (on provided data): 0.9281
CoverType: 0.9281423727646684
HELOC accuracy (on provided data): 0.9194
HELOC:    0.9193668331031553
HIGGS accuracy (on provided data): 0.6872
HIGGS:    0.6872057142857143
HIGGS accuracy (on provided data): 0.6872
HIGGS:    0.6872057142857143


# Generate submition file

In [27]:
print("Generating unified submission file...\n")

# -----------------------------
# 1. Embed test datasets
# -----------------------------
X_cov_test_emb  = embed_block(X_cov_test,  dataset_idx=0)
X_heloc_test_emb = embed_block(X_heloc_test, dataset_idx=1)
X_higgs_test_emb = embed_block(X_higgs_test, dataset_idx=2)

# -----------------------------
# 2. Predict each dataset
# -----------------------------
cov_pred   = clf.predict(X_cov_test_emb)
heloc_pred = clf.predict(X_heloc_test_emb)
higgs_pred = clf.predict(X_higgs_test_emb)

# -----------------------------
# 3. Assign correct ID ranges
# -----------------------------
# CoverType: IDs start at 1
cov_df = pd.DataFrame({
    "ID": np.arange(1, 1 + len(cov_pred)),
    "Prediction": cov_pred
})

# HELOC: IDs start at 3501
heloc_start = 3501
heloc_df = pd.DataFrame({
    "ID": np.arange(heloc_start, heloc_start + len(heloc_pred)),
    "Prediction": heloc_pred
})

# HIGGS: IDs start at 4547
higgs_start = 4547
higgs_df = pd.DataFrame({
    "ID": np.arange(higgs_start, higgs_start + len(higgs_pred)),
    "Prediction": higgs_pred
})

# -----------------------------
# 4. Merge all into one CSV
# -----------------------------
submission = pd.concat([cov_df, heloc_df, higgs_df], ignore_index=True)

print("\nSubmission preview:")
print(submission.head())
print(submission.tail())
print("\nTotal rows:", len(submission))

# Save
submission_path = "combined_submission.csv"
submission.to_csv(submission_path, index=False)

print(f"\n✔ Saved unified submission to: {submission_path}")


Generating unified submission file...


Submission preview:
   ID  Prediction
0   1           0
1   2           0
2   3           0
3   4           0
4   5           0
          ID  Prediction
79541  79542           9
79542  79543           9
79543  79544           9
79544  79545           9
79545  79546           9

Total rows: 79546

✔ Saved unified submission to: combined_submission.csv

Submission preview:
   ID  Prediction
0   1           0
1   2           0
2   3           0
3   4           0
4   5           0
          ID  Prediction
79541  79542           9
79542  79543           9
79543  79544           9
79544  79545           9
79545  79546           9

Total rows: 79546

✔ Saved unified submission to: combined_submission.csv
