In [95]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

In [None]:
SEED = 42
np.random.seed(SEED)

DATA_DIR = "../Data"

# Covertype
COV_TRAIN = os.path.join("../Data", "covtype_train.csv")
COV_TEST  = os.path.join(DATA_DIR, "covtype_test.csv")

# HELOC
HELOC_TRAIN = os.path.join(DATA_DIR, "heloc_train.csv")
HELOC_TEST  = os.path.join(DATA_DIR, "heloc_test.csv")

# HIGGS
HIGGS_TRAIN = os.path.join(DATA_DIR, "higgs_train.csv")
HIGGS_TEST  = os.path.join(DATA_DIR, "higgs_test.csv")

# Sample submissions (optional)
COV_SAMPLE_SUB   = os.path.join(DATA_DIR, "covtype_test_submission.csv")
HELOC_SAMPLE_SUB = os.path.join(DATA_DIR, "heloc_test_submission.csv")
HIGGS_SAMPLE_SUB = os.path.join(DATA_DIR, "higgs_test_submission.csv")

# print("Data directory:", os.path.abspath(DATA_DIR))
print("Covertype train:", COV_TRAIN)
print("HELOC train:", HELOC_TRAIN)
print("HIGGS train:", HIGGS_TRAIN)

Covertype train: ../Data/covtype_train.csv
HELOC train: ../Data/heloc_train.csv
HIGGS train: ../Data/higgs_train.csv


In [85]:
def preprocess_covertype(scale_strategy, smote_ratio):
    """
    scale_strategy: "none", "standard", "minmax", or "robust".
    smote_ratio: target minority size as fraction of majority (None to skip).
    undersample_ratio: keep this fraction of majority after SMOTE (None to skip).
    Returns (X, y, X_test, scaler).
    """
    df_train = pd.read_csv(COV_TRAIN)
    df_test = pd.read_csv(COV_TEST)
    df_train.describe()


    y = df_train["Cover_Type"].values
    y_series = pd.Series(y)
    print(y_series.value_counts().sort_index())

    X = df_train.drop(columns=["Cover_Type"])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y)

    if smote_ratio is not None:
        counts = pd.Series(y_train).value_counts()
        max_class = counts.idxmax()
        max_count = counts.max()
        smote_strategy = {
            cls: int(max_count * smote_ratio)
            for cls in counts.index
            if counts[cls] < max_count * smote_ratio
        }
        if smote_strategy:
            smote = SMOTE(sampling_strategy=smote_strategy, random_state=42, k_neighbors=5)
            X_train, y_train = smote.fit_resample(X_train, y_train)

    # if undersample_ratio is not None:
    #     counts = pd.Series(y_train).value_counts()
    #     max_class = counts.idxmax()
    #     max_count = counts.max()
    #     target_majority = int(max_count * undersample_ratio)
    #     if 0 < target_majority < max_count:
    #         undersample = RandomUnderSampler(
    #             sampling_strategy={max_class: target_majority}, random_state=42
    #         )
    #         X_train, y_train = undersample.fit_resample(X_train, y_train)

    # Handle the outliers
    scaler = None
    if scale_strategy == "standard":
        scaler = StandardScaler()
    elif scale_strategy == "minmax":
        scaler = MinMaxScaler()
    elif scale_strategy == "robust":
        scaler = RobustScaler()


    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
    X_val = pd.DataFrame(scaler.transform(X_val), columns=X.columns, index=X_val.index)
    X_test = pd.DataFrame(scaler.transform(df_test), columns=X.columns)

    return X_train, y_train, X_val, y_val, X_test, scaler


# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("standard", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())

# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("minmax", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())

# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("robust", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())


# Final scaling method to be used for modeling
scaling_method = "standard"
smote_ratio = 0.80
# undersample_ratio = 1

X_train_cov, y_train_cov, X_cov_val, y_cov_val, X_cov_test, cov_scaler = preprocess_covertype(scaling_method, smote_ratio)

print("CoverType:", X_train_cov.shape, y_train_cov.shape)
print("CoverType Test:", X_cov_val.shape, y_cov_val.shape)
print("Scaler used:", type(cov_scaler).__name__)

display(X_train_cov)
y_cov_series = pd.Series(y_train_cov)
print(y_cov_series.value_counts().sort_index())

1    21297
2    28248
3     3607
4      259
5      932
6     1706
7     2052
Name: count, dtype: int64
CoverType: (131066, 54) (131066,)
CoverType Test: (11621, 54) (11621,)
Scaler used: StandardScaler


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,1.078408,-0.457876,0.072867,1.040496,0.277928,-0.798351,1.121997,0.014204,-0.827653,3.682353,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,-0.144315
1,1.449522,0.015707,0.195042,2.973501,2.433447,-0.371441,0.784828,0.944556,-0.154569,-0.161305,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,6.929283
2,1.146324,0.442860,-1.515404,0.120018,-0.463031,2.828897,0.144205,1.077463,0.605365,-0.321835,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,-0.144315
3,-0.163489,0.312857,0.195042,-0.102835,-0.985071,0.503204,0.144205,1.431883,0.605365,0.264962,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,-0.144315
4,0.826147,1.863608,1.294614,2.372767,-0.176752,0.765746,-1.507926,-1.713592,0.149405,0.863550,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,-0.144315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131061,1.886126,-1.219323,-0.904530,0.042504,-0.193592,-0.199636,0.110488,0.014204,0.084267,-0.426134,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,6.929283
131062,1.607184,1.436455,0.072867,1.306950,0.193728,0.051007,-1.474209,0.102809,1.213312,-0.878703,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,-0.204739,-0.186991,-0.144315
131063,1.202112,-0.513592,-1.026705,-0.805305,-0.749311,0.471967,0.717394,0.412926,-0.197994,1.141077,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,4.884266,-0.186991,-0.144315
131064,1.612036,-0.708596,-1.026705,0.701373,-1.237671,-0.566302,0.515092,0.280019,-0.089432,-0.681894,...,-0.140805,-0.207014,-0.199442,-0.043364,-0.06293,-0.017252,-0.034298,4.884266,-0.186991,-0.144315


1    18078
2    22598
3    18078
4    18078
5    18078
6    18078
7    18078
Name: count, dtype: int64


In [37]:
def load_heloc():
    """Load and preprocess the HELOC dataset.

    - training.csv with column 'RiskPerformance' (Good/Bad).
    - test.csv with the same feature columns.
    - Sentinel codes -7, -8, -9 are treated as missing and imputed.
    """
    df_train = pd.read_csv(HELOC_TRAIN)
    df_test  = pd.read_csv(HELOC_TEST)

    # Label: Good/Bad -> 0/1 (Bad = 1) 
    y = (df_train["RiskPerformance"] == "Bad").astype(int).values
    X = df_train.drop(columns=["RiskPerformance"]).astype(np.float32)
    X_test = df_test.copy().astype(np.float32)

    # Replace sentinel values with NaN
    sentinel = [-7, -8, -9]
    X = X.replace(sentinel, np.nan)
    X_test = X_test.replace(sentinel, np.nan)

    # Impute NaNs with train medians
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    display(X)
    display(X_test)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    return X_scaled, y, X_test_scaled

X_heloc, y_heloc, X_heloc_test = load_heloc()

print("HELOC:", X_heloc.shape, y_heloc.shape)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,69.0,148.0,4.0,66.0,41.0,0.0,0.0,100.0,15.0,7.0,...,10.0,0.0,1.0,1.0,32.0,60.0,7.0,3.0,1.0,50.0
1,77.0,229.0,3.0,109.0,23.0,0.0,0.0,100.0,15.0,7.0,...,35.0,0.0,0.0,0.0,38.0,93.0,4.0,3.0,1.0,58.0
2,58.0,46.0,7.0,38.0,13.0,0.0,0.0,93.0,8.0,4.0,...,50.0,0.0,2.0,2.0,80.0,84.0,5.0,4.0,1.0,90.0
3,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
4,80.0,226.0,2.0,66.0,35.0,0.0,0.0,100.0,15.0,7.0,...,47.0,0.0,0.0,0.0,2.0,77.0,5.0,7.0,0.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9408,65.0,115.0,11.0,43.0,19.0,0.0,0.0,90.0,1.0,4.0,...,50.0,0.0,0.0,0.0,52.0,77.0,5.0,6.0,0.0,85.0
9409,77.0,437.0,8.0,115.0,35.0,0.0,0.0,100.0,15.0,7.0,...,26.0,1.0,3.0,3.0,23.0,74.0,6.0,2.0,0.0,50.0
9410,75.0,140.0,7.0,56.0,21.0,0.0,0.0,100.0,15.0,7.0,...,27.0,0.0,1.0,1.0,20.0,63.0,3.0,2.0,1.0,56.0
9411,64.0,92.0,3.0,35.0,21.0,2.0,2.0,91.0,33.0,6.0,...,23.0,0.0,1.0,1.0,9.0,58.0,3.0,4.0,0.0,53.0


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,81.0,333.0,27.0,132.0,12.0,0.0,0.0,100.0,15.0,7.0,...,25.0,0.0,1.0,1.0,51.0,89.0,3.0,1.0,0.0,80.0
1,59.0,137.0,11.0,78.0,31.0,0.0,0.0,91.0,1.0,4.0,...,47.0,0.0,0.0,0.0,62.0,93.0,12.0,4.0,3.0,94.0
2,61.0,79.0,4.0,36.0,19.0,0.0,0.0,95.0,5.0,4.0,...,26.0,0.0,6.0,6.0,31.0,86.0,5.0,3.0,1.0,62.0
3,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
4,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,87.0,296.0,26.0,120.0,12.0,0.0,0.0,100.0,15.0,7.0,...,42.0,0.0,0.0,0.0,17.0,33.0,2.0,1.0,0.0,75.0
1042,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
1043,75.0,162.0,8.0,77.0,40.0,0.0,0.0,100.0,15.0,7.0,...,20.0,7.0,0.0,0.0,27.0,73.0,5.0,2.0,1.0,64.0
1044,71.0,155.0,1.0,71.0,24.0,0.0,0.0,100.0,15.0,7.0,...,32.0,3.0,3.0,3.0,54.0,73.0,3.0,1.0,1.0,100.0


HELOC: (9413, 23) (9413,)


In [38]:
def load_higgs():
    """Load and preprocess the HIGGS dataset.

    - training.csv with columns: EventId, 30 features, Weight, Label (b/s or 0/1).
    - test.csv with EventId and the 30 feature columns.

    We treat -999.0 as missing and impute with medians.
    """
    df_train = pd.read_csv(HIGGS_TRAIN)
    df_test  = pd.read_csv(HIGGS_TEST)

    # Label may be given as 'Label' (b/s) or 'label' (0/1)
    if "Label" in df_train.columns:
        y_raw = df_train["Label"]
        y = (y_raw == "s").astype(int).values
    elif "label" in df_train.columns:
        y = df_train["label"].astype(int).values
    else:
        raise ValueError("Could not find label column ('Label' or 'label') in HIGGS training.csv")

    # Sample weights (if available)
    if "Weight" in df_train.columns:
        w = df_train["Weight"].values.astype(np.float32)
    else:
        w = np.ones(len(df_train), dtype=np.float32)

    # Features: drop ID, Weight, label columns
    drop_cols = [c for c in ["EventId", "Weight", "Label", "label"] if c in df_train.columns]
    feature_cols = [c for c in df_train.columns if c not in drop_cols]
    X = df_train[feature_cols].copy()
    X_test = df_test[feature_cols].copy()

    # Replace sentinel -999.0 with NaN
    X = X.replace(-999.0, np.nan).astype(np.float32)
    X_test = X_test.replace(-999.0, np.nan).astype(np.float32)

    # Impute with medians
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    # Event IDs (optional, useful for submissions)
    event_id_train = df_train["EventId"] if "EventId" in df_train.columns else None
    event_id_test  = df_test["EventId"] if "EventId" in df_test.columns else None

    return X_scaled, y, w, X_test_scaled, event_id_train, event_id_test

X_higgs, y_higgs, w_higgs, X_higgs_test, eid_tr, eid_te = load_higgs()

print("HIGGS:", X_higgs.shape, y_higgs.shape)

HIGGS: (175000, 30) (175000,)


In [None]:
# Statistics of the dataset
print("--- Shapes of X ---")
print(f"X_cov shape:   {X_cov.shape}")
print(f"X_heloc shape: {X_heloc.shape}")
print(f"X_higgs shape: {X_higgs.shape}")

print(f"Rows total X:   {X_cov.shape[0] + X_heloc.shape[0] + X_higgs.shape[0]}")
print("\n" + "="*30 + "\n")

# Check the shape of all the y
print("--- Shapes of y ---")
print(f"y_cov shape:   {y_cov.shape}")
print(f"y_heloc shape: {y_heloc.shape}")
print(f"y_higgs shape: {y_higgs.shape}")

print(f"Rows total y:   {y_cov.shape[0] + y_heloc.shape[0] + y_higgs.shape[0]}")

# Feature block sizes
d_cov   = X_cov.shape[1]
d_heloc = X_heloc.shape[1]
d_higgs = X_higgs.shape[1]

# Total unified feature length + 3 dataset-indicator features
D_total = d_cov + d_heloc + d_higgs + 3
print(D_total)

--- Shapes of X ---
X_cov shape:   (153949, 54)
X_heloc shape: (9413, 23)
X_higgs shape: (175000, 30)
Rows total X:   338362


--- Shapes of y ---
y_cov shape:   (153949,)
y_heloc shape: (9413,)
y_higgs shape: (175000,)
Rows total y:   338362
110


In [None]:
def embed_block(X, index):
    """Place a dataset's features into its slice of the unified space.

    dataset_idx: 0 = CoverType, 1 = HELOC, 2 = HIGGS
    """
    # Number of rows in de X trainingset:
    n_samples = X.shape[0]
    # print(n_samples)

    # Number of dimensions in the total dataset
    # print(D_total)

    embedded = np.zeros((n_samples, D_total), dtype=np.float32)

    def set_index(index):
        if index == 0:
            return 0, d_cov
        elif index == 1:
            return d_cov, d_cov + d_heloc
        elif index == 2: 
            return d_cov + d_heloc, d_cov + d_heloc + d_higgs

    start, end = set_index(index)
    # print(start)
    # print(end)

    # fill in the dataset with the 110 dimensions which will later be placed on top of each other to create the full dataset
    embedded[:, start:end] = X

    # Dataset indicator in the final 3 positions
    embedded[:, D_total - 3 + index] = 1.0
    return embedded

# Label the target columns to avoid collisions across datasets
cov_unique = np.sort(np.unique(y_cov))
cov_map = {v: i for i, v in enumerate(cov_unique)}
y_cov_int = np.array([cov_map[v] for v in y_cov], dtype=np.int64)     # 0..6

y_heloc_int = y_heloc.astype(np.int64) + 7                            # 7,8
y_higgs_int = y_higgs.astype(np.int64) + 9                            # 9,10

# Embed features (train)
X_cov_emb   = embed_block(X_cov, 0)
X_heloc_emb = embed_block(X_heloc, 1)
X_higgs_emb = embed_block(X_higgs, 2)

# Embed features (test) – needed for the submission step
X_cov_test_emb   = embed_block(X_cov_test, 0)
X_heloc_test_emb = embed_block(X_heloc_test, 1)
X_higgs_test_emb = embed_block(X_higgs_test, 2)

# Stack the datasets on top of each other
X_all = np.vstack([X_cov_emb, X_heloc_emb, X_higgs_emb])
y_all = np.concatenate([y_cov_int, y_heloc_int, y_higgs_int])

# Sample weights: CoverType & HELOC = 1, HIGGS uses its normalised weight
w_cov   = np.ones_like(y_cov_int,   dtype=np.float32)
w_heloc = np.ones_like(y_heloc_int, dtype=np.float32)
w_h_norm = w_higgs.astype(np.float32) / np.mean(w_higgs)

sample_weight_all = np.concatenate([w_cov, w_heloc, w_h_norm])
print(len(sample_weight_all))

print("Unified X_all:", X_all.shape, "y_all:", y_all.shape)


338362
Unified X_all: (338362, 110) y_all: (338362,)


In [None]:
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X_all,
    y_all,
    sample_weight_all,
    test_size=0.2,
    random_state=SEED,
    stratify=y_all,
)

print("Train:", X_train.shape, "Val:", X_val.shape)

Train: (270689, 110) Val: (67673, 110)


In [103]:
# (Keep the calculate_smote_strategy function as is)
def calculate_smote_strategy(y_data, smote_ratio):
    # ... (function body as defined previously) ...
    if smote_ratio is None: return 'auto'
    counts = pd.Series(y_data).value_counts()
    max_class = counts.idxmax()
    max_count = counts.max()
    smote_strategy = {
        cls: int(max_count * smote_ratio)
        for cls in counts.index
        if counts[cls] < int(max_count * smote_ratio) 
    }
    if not smote_strategy: return 'auto'
    return smote_strategy

def get_scaler_options():
    """
    Returns a dictionary mapping of scaler names to their objects.
    """
    return {
        "standard": StandardScaler(), 
        "minmax": MinMaxScaler(), 
        "robust": RobustScaler(), 
        None: None,
        "none": None,
    }

In [105]:
print(f"Available CPU cores: {os.cpu_count()}")

# (Keep the calculate_smote_strategy function as is)


def train_model(model_name, use_grid, smote_ratio=None, scaler_type="standard"):
    # smote_ratio is the *target ratio* for calculate_smote_strategy
    # scaler_type is a string: "standard", "minmax", "robust", or "none"

    # Get the specific scaler instance based on input
    scaler_map = get_scaler_options()
    selected_scaler = scaler_map.get(scaler_type.lower())

    smote_strategy_config = calculate_smote_strategy(y_train, smote_ratio)
    
    if model_name == "mlp":
        classifier = MLPClassifier(
            # ... (MLP params) ...
            random_state=SEED, verbose=False,
        )
    elif model_name == "xgb":
        classifier = XGBClassifier(
            # ... (XGB params) ...
            random_state=SEED, n_jobs=-1,
        )
    else:
        raise ValueError(f"Unknown model name: {model_name}")

    # 1. Define the base Pipeline template
    pipeline = ImbPipeline([
        ('smote', SMOTE(sampling_strategy=smote_strategy_config, random_state=SEED)),
        ('scaler', selected_scaler), # Use the dynamically selected scaler
        ('classifier', classifier)
    ])

    if use_grid:
        # 2. Grid Search Path: Use the template but override 'scaler' with *all* options
        param_grid = {}
        if model_name == "mlp":
            param_grid = {
                "classifier__hidden_layer_sizes": [(128, 128), (256, 256)],
                "classifier__alpha": [1e-4, 1e-3],
                "classifier__learning_rate_init": [1e-3, 5e-4],
                # "smote__k_neighbors":,
                # In grid search mode, we test all options
                "scaler": list(scaler_map.values()), 
            }
        elif model_name == "xgb":
            param_grid = {
                # ... (xgb params) ...
                # "smote__k_neighbors":,
                "scaler": [None], # XGBoost usually works best without explicit scaling
            }

        cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
        
        model = GridSearchCV(
            estimator=pipeline, # GridSearchCV takes the pipeline as estimator
            param_grid=param_grid,
            cv=cv_strategy, 
            scoring="f1_weighted", 
            n_jobs=-1, 
            verbose=1,
            refit=True, 
        )
    else:
        # 3. Non-Grid Search Path: The pipeline already has the specific 
        #    smote_ratio and scaler_type you passed in the function call.
        model = pipeline

    # ... (Rest of the fitting and evaluation code remains the same) ...
    fit_params = {}
    if w_train is not None:
        fit_params['classifier__sample_weight'] = w_train
    
    model.fit(X_train, y_train, **fit_params)

    predictions = model.predict(X_val)
    val_score = f1_score(y_val, predictions, average='weighted') 
    print(f"[{model_name}, Scaler: {scaler_type}, Ratio: {smote_ratio}] validation F1-weighted score:", val_score)

    return model, val_score

# Example usage (assuming global variables defined):
# mlp_model, mlp_score = train_model("mlp", use_grid=True) 

Available CPU cores: 8


In [106]:
mlp_model, mlp_score = train_model("mlp", use_grid=False, smote_ratio=0.65, scaler_type="standard")

ValueError: sample_weight.shape == (270689,), expected (691281,)!

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline # Import imblearn's pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import os
print(f"Available CPU cores: {os.cpu_count()}")


def train_model(model_name, use_grid):

    # 1. Determine the SMOTE strategy once based on the full X_train/y_train data
    # This configuration is used internally by the pipeline within each CV fold
    smote_strategy_config = calculate_smote_strategy(y_train, smote_ratio)
    
    # 2. Define the base classifier instance
    if model_name == "mlp":
        classifier = MLPClassifier(
            hidden_layer_sizes=(256, 256),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            batch_size=256,
            learning_rate_init=1e-3,
            max_iter=80,
            early_stopping=True,
            n_iter_no_change=5,
            random_state=SEED,
            verbose=False, # Set verbose to False here to avoid massive printouts during CV
        )
    elif model_name == "xgb":
        classifier = XGBClassifier(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softprob",
            tree_method="hist",
            random_state=SEED,
            n_jobs=-1,
        )
    else:
        raise ValueError(f"Unknown model name: {model_name}")


    # 3. Define the Imblearn Pipeline
    pipeline = ImbPipeline([
        ('smote', SMOTE(sampling_strategy=smote_strategy_config, random_state=SEED)),
        ('classifier', classifier) # The classifier instance is added here
    ])

    # 4. Handle Grid Search logic
    if use_grid:
        # Define param grids with the correct "classifier__" prefix
        if model_name == "mlp":
            param_grid = {
                "classifier__hidden_layer_sizes": [(128, 128), (256, 256)],
                "classifier__alpha": [1e-4, 1e-3],
                "classifier__learning_rate_init": [1e-3, 5e-4],
                # Example of tuning SMOTE k_neighbors
                "smote__k_neighbors": [3, 5, 7]
            }
        elif model_name == "xgb":
            param_grid = {
                "classifier__n_estimators": [300, 500],
                "classifier__max_depth": [5, 7],
                "classifier__learning_rate": [0.05, 0.1],
                "classifier__subsample": [0.8, 1.0],
                "classifier__colsample_bytree": [0.8, 1.0],
                "smote__k_neighbors": [3, 5, 7]
            }

        cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
        
        # 'model' becomes the GridSearchCV object which wraps the pipeline
        model = GridSearchCV(
            estimator=pipeline, 
            param_grid=param_grid,
            cv=cv_strategy, 
            scoring="f1_weighted", # F1-weighted is recommended for imbalance
            n_jobs=-1, 
            verbose=1,
            refit=True, 
        )
    else:
        # 'model' becomes the simple ImbPipeline object
        model = pipeline

    fit_params = {}
    if w_train is not None:
        fit_params['classifier__sample_weight'] = w_train


    model.fit(X_train, y_train, classifier__sample_weight=w_train if w_train is not None else None)
    predictions = model.predict(X_val)
    val_acc = accuracy_score(y_val, predictions)
    print(f"{model_name} val accuracy:", val_acc)

    return model, val_acc

# Example usages:
mlp_model, mlp_acc = train_model("mlp", use_grid=False)
# xgb_model, xgb_acc = train_model("xgb", use_grid=False)

Available CPU cores: 8


ValueError: sample_weight.shape == (270689,), expected (829531,)!

In [63]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def eval_dataset(X_emb, y_int, name):
    y_pred_int = clf_MLP.predict(X_emb)
    acc = accuracy_score(y_int, y_pred_int)
    print(f"{name} accuracy (on provided data): {acc:.4f}")
    return acc

print("CoverType:", eval_dataset(X_cov_emb,   y_cov_int, "CoverType"))
print("HELOC:   ", eval_dataset(X_heloc_emb, y_heloc_int, "HELOC"))
print("HIGGS:   ", eval_dataset(X_higgs_emb, y_higgs_int, "HIGGS"))

def evaluate_model(model, X, y_true, average_type='weighted'):
    """
    Compute and print evaluation metrics for a trained model.
    
    Parameters:
        model: Trained model (must support predict method)
        X: Features (numpy array or dataframe)
        y_true: True labels (numpy array or series)
        average_type: Averaging type for multiclass ('micro', 'macro', 'weighted')
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average=average_type, zero_division=0)
    rec = recall_score(y_true, y_pred, average=average_type, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average_type, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)


    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Example usage for validation set (weighted average for multiclass):
print("=== MLP Model Validation Metrics ===")
evaluate_model(mlp_model, X_val, y_val, average_type='weighted')
# If xgb_model un-commented above, you can also evaluate it:
# print("=== XGB Model Validation Metrics ===")
# evaluate_model(xgb_model, X_val, y_val, average_type='weighted')


CoverType accuracy (on provided data): 0.9512
CoverType: 0.9512176110270284
HELOC accuracy (on provided data): 0.8178
HELOC:    0.8178051630723467
HIGGS accuracy (on provided data): 0.8541
HIGGS:    0.8541028571428572
=== MLP Model Validation Metrics ===
Accuracy: 0.8076780990941734
Precision: 0.8539069797561797
Recall: 0.8076780990941734
F1 Score: 0.7567110236649941
Confusion Matrix:
 [[ 3961   482     5     1    12     1    57     0     0     0     0]
 [  444  3119    34     0    45    19    11     0     0     0     0]
 [    1    11  4387     9     4   108     0     0     0     0     0]
 [    0     0     2  4500     0    18     0     0     0     0     0]
 [   22    35     3     0  4460     0     0     0     0     0     0]
 [    1    11    54     3     4  4447     0     0     0     0     0]
 [   80     3     0     0     0     0  4437     0     0     0     0]
 [    0     0     0     0     0     0     0   554   343     0     0]
 [    0     0     1     0     0     0     0   318   666    

In [64]:
# Reverse mapping for predictions
# CoverType: 0-6 -> 1-7
inverse_cov_map = {v: k for k, v in cov_map.items()}

# For MLP
y_cov_test_pred = clf_MLP.predict(X_cov_test_emb)
y_heloc_test_pred = clf_MLP.predict(X_heloc_test_emb)
y_higgs_test_pred = clf_MLP.predict(X_higgs_test_emb)

# Predictions on test sets
# y_cov_test_pred = clf.predict(X_cov_test_emb)
# y_heloc_test_pred = clf.predict(X_heloc_test_emb)
# y_higgs_test_pred = clf.predict(X_higgs_test_emb)


# Convert predictions back to original label space
# CoverType: 0-6 -> 1-7
y_cov_test_pred_orig = np.array([inverse_cov_map[pred] for pred in y_cov_test_pred])

# HELOC: 7, 8 -> 0, 1
y_heloc_test_pred_orig = (y_heloc_test_pred - 7).astype(int)

# HIGGS: 9, 10 -> 0, 1
y_higgs_test_pred_orig = (y_higgs_test_pred - 9).astype(int)

print("CoverType predictions (original labels 1-7):", np.unique(y_cov_test_pred_orig))
print("HELOC predictions (original labels 0-1):", np.unique(y_heloc_test_pred_orig))
print("HIGGS predictions (original labels 0-1):", np.unique(y_higgs_test_pred_orig))


CoverType predictions (original labels 1-7): [1 2 3 4 5 6 7]
HELOC predictions (original labels 0-1): [-6 -5  0  1]
HIGGS predictions (original labels 0-1): [-8  0  1]


In [66]:

# Generate submission file with original label encoding

# CoverType: IDs start at 1
cov_df = pd.DataFrame({
    "ID": np.arange(1, 1 + len(y_cov_test_pred_orig)),
    "Prediction": y_cov_test_pred_orig
})

# HELOC: IDs start at 3501
heloc_start = 3501
heloc_df = pd.DataFrame({
    "ID": np.arange(heloc_start, heloc_start + len(y_heloc_test_pred_orig)),
    "Prediction": y_heloc_test_pred_orig
})

# HIGGS: IDs start at 4547
higgs_start = 4547
higgs_df = pd.DataFrame({
    "ID": np.arange(higgs_start, higgs_start + len(y_higgs_test_pred_orig)),
    "Prediction": y_higgs_test_pred_orig
})

# Merge all into one CSV
submission = pd.concat([cov_df, heloc_df, higgs_df], ignore_index=True)

print("\nSubmission preview:")
print(submission.head(10))
print("...")
print(submission.tail(10))
print(f"\nTotal rows: {len(submission)}")

# Save
submission_path = "combined_submission.csv"
submission.to_csv(submission_path, index=False)

print(f"\n✓ Saved unified submission to: {submission_path}")



Submission preview:
   ID  Prediction
0   1           1
1   2           1
2   3           1
3   4           1
4   5           1
5   6           1
6   7           1
7   8           2
8   9           1
9  10           1
...
          ID  Prediction
79536  79537           1
79537  79538           1
79538  79539           0
79539  79540           0
79540  79541           1
79541  79542           1
79542  79543           0
79543  79544           0
79544  79545           0
79545  79546           0

Total rows: 79546

✓ Saved unified submission to: combined_submission.csv
