In [1]:
!pip install -q tabpfn pandas scikit-learn

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from tabpfn import TabPFNClassifier
from tabpfn.constants import ModelVersion

In [3]:
from google.colab import files
import zipfile, os

# Ask for the competition archive
# (expected upload: aml-2025-benchmarking-tabular-ml-datasets.zip).
uploaded = files.upload()

# The upload result is keyed by file name; only the first entry is used.
zip_name = list(uploaded)[0]
print("Zip:", zip_name)

# Unpack the whole archive below ./data.
with zipfile.ZipFile(zip_name, mode="r") as archive:
    archive.extractall(path="data")

# Folder that the CSV-loading cells read from.
data_dir = "data/aml-2025-benchmarking-tabular-ml-datasets"
print(os.listdir(data_dir))

Saving aml-2025-benchmarking-tabular-ml-datasets.zip to aml-2025-benchmarking-tabular-ml-datasets (2).zip
Zip: aml-2025-benchmarking-tabular-ml-datasets (2).zip
['heloc_test.csv', 'higgs_train.csv', 'covtype_test_submission.csv', 'heloc_train.csv', 'higgs_test.csv', 'heloc_test_submission.csv', 'combined_test_sample_submission.csv', 'higgs_test_submission.csv', 'covtype_train.csv', 'covtype_test.csv']


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_subsample(X, y, n_samples=10_000, random_state=42):
    """Return a class-stratified subsample of at most ``n_samples`` rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally via ``.iloc``.
    y : pandas.Series
        Labels aligned row-for-row with ``X``; used as the stratification
        target.
    n_samples : int, default 10_000
        Number of rows to keep. When it is greater than or equal to
        ``len(X)`` nothing is sampled and every row is returned.
    random_state : int, default 42
        Seed forwarded to ``StratifiedShuffleSplit``.

    Returns
    -------
    tuple
        ``(X_sub, y_sub)`` as new objects (the inputs are not modified), each
        carrying a fresh ``0..n-1`` index in both the sampled and the
        pass-through case.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    if n_samples >= len(X):
        # Nothing to sample. reset_index returns new objects, so callers still
        # receive copies, and the index now matches the sampled branch below.
        return X.reset_index(drop=True), y.reset_index(drop=True)

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=n_samples,
        random_state=random_state,
    )
    # Only the "train" side of the single split is needed.
    keep_idx, _ = next(splitter.split(X, y))
    X_sub = X.iloc[keep_idx].reset_index(drop=True)
    y_sub = y.iloc[keep_idx].reset_index(drop=True)
    return X_sub, y_sub

In [5]:
from tabpfn import TabPFNClassifier
from tabpfn.constants import ModelVersion

def make_tabpfn_v2():
    """Build a new TabPFNClassifier from the library defaults for ModelVersion.V2."""
    version = ModelVersion.V2
    return TabPFNClassifier.create_default_for_version(version)

In [6]:
# HELOC: read the training table, the test table and the submission template.
heloc_train = pd.read_csv(f"{data_dir}/heloc_train.csv")
heloc_test = pd.read_csv(f"{data_dir}/heloc_test.csv")
heloc_sub = pd.read_csv(f"{data_dir}/heloc_test_submission.csv")

# Encode the string target as integers (Good -> 1, Bad -> 0).
heloc_train = heloc_train.assign(
    RiskPerformance=heloc_train["RiskPerformance"].map({"Good": 1, "Bad": 0})
)

# Target column first, then every remaining column as a feature.
y_heloc = heloc_train["RiskPerformance"]
X_heloc = heloc_train.drop(columns="RiskPerformance")
X_heloc_test = heloc_test.copy()

print("HELOC geladen:", X_heloc.shape, X_heloc_test.shape)

HELOC geladen: (9413, 23) (1046, 23)


In [7]:
# Cap the training data at 10,000 stratified rows. The recorded load reported
# 9,413 HELOC rows, i.e. fewer than the cap, in which case the helper returns
# every row rather than a sample.
X_heloc_sub, y_heloc_sub = stratified_subsample(X_heloc, y_heloc, n_samples=10_000)

# Scale the features, then classify with TabPFN v2.
heloc_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", make_tabpfn_v2()),
])

# Hold out a stratified 20% of the (sub)sample for validation.
Xtr, Xval, ytr, yval = train_test_split(
    X_heloc_sub, y_heloc_sub,
    test_size=0.2,
    random_state=42,
    stratify=y_heloc_sub
)

heloc_model.fit(Xtr, ytr)
val_pred = heloc_model.predict(Xval)

# Report the real number of rows used instead of a hard-coded "10.000",
# which is wrong whenever the dataset is smaller than the cap.
print(f"HELOC validation accuracy ({len(X_heloc_sub)} rows):", accuracy_score(yval, val_pred))

HELOC validation accuracy (10.000 rows): 0.7371216144450345


In [8]:
# HIGGS: read the training table, the test table and the submission template.
higgs_train = pd.read_csv(f"{data_dir}/higgs_train.csv")
higgs_test  = pd.read_csv(f"{data_dir}/higgs_test.csv")
higgs_sub   = pd.read_csv(f"{data_dir}/higgs_test_submission.csv")

# Encode the string target as integers (s -> 1, b -> 0).
higgs_train["Label"] = higgs_train["Label"].map({"s": 1, "b": 0})

# Drop Weight if present
drop_cols = [c for c in ["Weight"] if c in higgs_train.columns]

X_higgs = higgs_train.drop(columns=["Label"] + drop_cols)
y_higgs = higgs_train["Label"]

# drop_cols was derived from the *training* columns, so a column found there
# may be missing from the test file; errors="ignore" avoids a KeyError then.
X_higgs_test = higgs_test.drop(columns=drop_cols, errors="ignore")

# NOTE(review): every remaining column is used as a feature. If the files
# carry a row-identifier column (e.g. an event id), it is included too - confirm.
print("HIGGS geladen:", X_higgs.shape, X_higgs_test.shape)


HIGGS geladen: (175000, 31) (75000, 31)


In [9]:
# Cap HIGGS at 10,000 stratified rows before fitting.
X_higgs_sub, y_higgs_sub = stratified_subsample(X_higgs, y_higgs, n_samples=10_000)

# Scale the features, then classify with TabPFN v2.
higgs_steps = [
    ("scaler", StandardScaler()),
    ("clf", make_tabpfn_v2()),
]
higgs_model = Pipeline(higgs_steps)

# Hold out a stratified 20% of the subsample for validation.
Xtr, Xval, ytr, yval = train_test_split(
    X_higgs_sub,
    y_higgs_sub,
    test_size=0.2,
    stratify=y_higgs_sub,
    random_state=42,
)

higgs_model.fit(Xtr, ytr)
h_val_pred = higgs_model.predict(Xval)

higgs_val_acc = accuracy_score(yval, h_val_pred)
print("HIGGS validation accuracy (10.000 rows):", higgs_val_acc)

HIGGS validation accuracy (10.000 rows): 0.821


In [10]:
# COVTYPE: read the training table, the test table and the submission template.
cov_train = pd.read_csv(f"{data_dir}/covtype_train.csv")
cov_test = pd.read_csv(f"{data_dir}/covtype_test.csv")
cov_sub = pd.read_csv(f"{data_dir}/covtype_test_submission.csv")

# Cover_Type is the target; every other column is a feature.
y_cov = cov_train["Cover_Type"]
X_cov = cov_train.drop(columns="Cover_Type")
X_cov_test = cov_test.copy()

print("COVTYPE geladen:", X_cov.shape, X_cov_test.shape)

COVTYPE geladen: (58101, 54) (3500, 54)


In [11]:
# Cap COVTYPE at 10,000 stratified rows before fitting.
X_cov_sub, y_cov_sub = stratified_subsample(X_cov, y_cov, n_samples=10_000)

# Scale the features, then classify with TabPFN v2.
cov_steps = [
    ("scaler", StandardScaler()),
    ("clf", make_tabpfn_v2()),
]
cov_model = Pipeline(cov_steps)

# Hold out a stratified 20% of the subsample for validation.
Xtr, Xval, ytr, yval = train_test_split(
    X_cov_sub,
    y_cov_sub,
    test_size=0.2,
    stratify=y_cov_sub,
    random_state=42,
)

cov_model.fit(Xtr, ytr)
c_val_pred = cov_model.predict(Xval)

cov_val_acc = accuracy_score(yval, c_val_pred)
print("COVTYPE validation accuracy (10.000 rows):", cov_val_acc)

COVTYPE validation accuracy (10.000 rows): 0.8535


In [12]:
# Refit on the whole HELOC (sub)sample - validation rows included - and
# predict the test table.
heloc_model.fit(X_heloc_sub, y_heloc_sub)
heloc_test_pred = heloc_model.predict(X_heloc_test)

# Fill the submission template with integer predictions and write it out.
heloc_out = heloc_sub.assign(Prediction=heloc_test_pred.astype(int))
heloc_submission_path = "heloc_test_submission.csv"
heloc_out.to_csv(heloc_submission_path, index=False)

print("Saved:", heloc_submission_path)

Saved: heloc_test_submission.csv


In [13]:
# Refit on the whole HIGGS subsample - validation rows included - and
# predict the test table.
higgs_model.fit(X_higgs_sub, y_higgs_sub)
higg_test_pred = higgs_model.predict(X_higgs_test)

# Fill the submission template with integer predictions and write it out.
higgs_out = higgs_sub.assign(Prediction=higg_test_pred.astype(int))
higgs_submission_path = "higgs_test_submission.csv"
higgs_out.to_csv(higgs_submission_path, index=False)

print("Saved:", higgs_submission_path)

Saved: higgs_test_submission.csv


In [14]:
# Refit on the whole COVTYPE subsample - validation rows included - and
# predict the test table.
cov_model.fit(X_cov_sub, y_cov_sub)
cov_test_pred = cov_model.predict(X_cov_test)

# Fill the submission template with integer predictions and write it out.
cov_out = cov_sub.assign(Prediction=cov_test_pred.astype(int))
cov_submission_path = "covtype_test_submission.csv"
cov_out.to_csv(cov_submission_path, index=False)

print("Saved:", cov_submission_path)

Saved: covtype_test_submission.csv


In [15]:
# Bundle the three per-dataset submission CSVs into baseline_submissions.zip.
# The recorded output reads "updating:", i.e. the archive already existed when
# this cell ran and its entries were refreshed rather than created.
!zip baseline_submissions.zip heloc_test_submission.csv higgs_test_submission.csv covtype_test_submission.csv


updating: heloc_test_submission.csv (deflated 71%)
updating: higgs_test_submission.csv (deflated 70%)
updating: covtype_test_submission.csv (deflated 66%)


In [16]:
# Stack the three per-dataset submissions, order the rows by ID and renumber
# the index so the combined file is written in ID order.
submission_parts = [cov_out, heloc_out, higgs_out]
combined_out = (
    pd.concat(submission_parts, ignore_index=True)
    .sort_values("ID")
    .reset_index(drop=True)
)

print(combined_out.shape)

combined_out.to_csv("combined_test_submission.csv", index=False)
print("Saved combined file: combined_test_submission.csv")

(79546, 2)
Saved combined file: combined_test_submission.csv
