In [4]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

In [5]:
SEED = 42
np.random.seed(SEED)

DATA_DIR = "../Data"

# Covertype
COV_TRAIN = os.path.join("../Data", "covtype_train.csv")
COV_TEST  = os.path.join(DATA_DIR, "covtype_test.csv")

# HELOC
HELOC_TRAIN = os.path.join(DATA_DIR, "heloc_train.csv")
HELOC_TEST  = os.path.join(DATA_DIR, "heloc_test.csv")

# HIGGS
HIGGS_TRAIN = os.path.join(DATA_DIR, "higgs_train.csv")
HIGGS_TEST  = os.path.join(DATA_DIR, "higgs_test.csv")

# Sample submissions (optional)
COV_SAMPLE_SUB   = os.path.join(DATA_DIR, "covtype_test_submission.csv")
HELOC_SAMPLE_SUB = os.path.join(DATA_DIR, "heloc_test_submission.csv")
HIGGS_SAMPLE_SUB = os.path.join(DATA_DIR, "higgs_test_submission.csv")

# print("Data directory:", os.path.abspath(DATA_DIR))
print("Covertype train:", COV_TRAIN)
print("HELOC train:", HELOC_TRAIN)
print("HIGGS train:", HIGGS_TRAIN)

Covertype train: ../Data/covtype_train.csv
HELOC train: ../Data/heloc_train.csv
HIGGS train: ../Data/higgs_train.csv


In [6]:
def load_covertype():
    """
    scale_strategy: "none", "standard", "minmax", or "robust".
    smote_ratio: target minority size as fraction of majority (None to skip).
    undersample_ratio: keep this fraction of majority after SMOTE (None to skip).
    Returns (X, y, X_test, scaler).
    """
    df_train = pd.read_csv(COV_TRAIN)
    df_test = pd.read_csv(COV_TEST)

    df_train.describe()

    X_train = df_train.drop(columns=["Cover_Type"])
    X_test = df_test
    y_train = df_train["Cover_Type"].values

    y_series = pd.Series(y_train)
    print(y_series.value_counts().sort_index())

    return X_train, y_train, X_test


X_cov_train, y_cov_train,  X_cov_test = load_covertype()

print("CoverType:", X_cov_train.shape, y_cov_train.shape)
print("CoverType test:", X_cov_test.shape)


display(X_cov_train)


    # if smote_ratio is not None:
    #     counts = pd.Series(y_train).value_counts()
    #     max_class = counts.idxmax()
    #     max_count = counts.max()
    #     smote_strategy = {
    #         cls: int(max_count * smote_ratio)
    #         for cls in counts.index
    #         if counts[cls] < max_count * smote_ratio
    #     }
    #     if smote_strategy:
    #         smote = SMOTE(sampling_strategy=smote_strategy, random_state=42, k_neighbors=5)
    #         X_train, y_train = smote.fit_resample(X_train, y_train)

    # if undersample_ratio is not None:
    #     counts = pd.Series(y_train).value_counts()
    #     max_class = counts.idxmax()
    #     max_count = counts.max()
    #     target_majority = int(max_count * undersample_ratio)
    #     if 0 < target_majority < max_count:
    #         undersample = RandomUnderSampler(
    #             sampling_strategy={max_class: target_majority}, random_state=42
    #         )
    #         X_train, y_train = undersample.fit_resample(X_train, y_train)

    # # Handle the outliers
    # scaler = None
    # if scale_strategy == "standard":
    #     scaler = StandardScaler()
    # elif scale_strategy == "minmax":
    #     scaler = MinMaxScaler()
    # elif scale_strategy == "robust":
    #     scaler = RobustScaler()


    # X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
    # X_val = pd.DataFrame(scaler.transform(X_val), columns=X.columns, index=X_val.index)
    # X_test = pd.DataFrame(scaler.transform(df_test), columns=X.columns)

    


# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("standard", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())

# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("minmax", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())

# X_cov, y_cov, X_cov_test, cov_scaler = preprocess_covertype("robust", 0.8, 0.65)
# print("mean:", X_cov.mean().mean(), "std:", X_cov.std().mean())


# Final scaling method to be used for modeling
# scaling_method = "standard"
# smote_ratio = 0.80
# undersample_ratio = 1



1    21297
2    28248
3     3607
4      259
5      932
6     1706
7     2052
Name: count, dtype: int64
CoverType: (58101, 54) (58101,)
CoverType test: (3500, 54)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3351,206,27,726,124,3813,192,252,180,2271,...,0,0,0,0,0,0,0,1,0,0
1,2732,129,7,212,1,1082,231,236,137,912,...,0,0,0,0,0,0,0,0,0,0
2,2572,24,9,201,25,957,216,222,142,2191,...,0,0,0,0,0,0,0,0,0,0
3,2824,69,13,417,39,3223,233,214,110,6478,...,0,0,0,0,0,0,0,0,0,0
4,2529,84,5,120,9,1092,227,231,139,4983,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58096,3160,315,8,0,0,1366,199,234,174,1129,...,0,0,0,0,0,0,0,0,0,0
58097,2607,45,12,242,52,977,223,214,123,1342,...,0,0,0,0,0,0,0,0,0,0
58098,2317,280,25,190,64,433,144,233,225,582,...,0,0,0,0,0,0,0,0,0,0
58099,3183,89,17,60,8,3443,243,211,91,443,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def load_heloc():
    """Load and preprocess the HELOC dataset.

    - training.csv with column 'RiskPerformance' (Good/Bad).
    - test.csv with the same feature columns.
    - Sentinel codes -7, -8, -9 are treated as missing and imputed.
    """
    df_train = pd.read_csv(HELOC_TRAIN)
    df_test  = pd.read_csv(HELOC_TEST)

    # Label: Good/Bad -> 0/1 (Bad = 1) 
    y = (df_train["RiskPerformance"] == "Bad").astype(int).values
    X = df_train.drop(columns=["RiskPerformance"]).astype(np.float32)
    X_test = df_test.copy().astype(np.float32)

    # Replace sentinel values with NaN
    sentinel = [-7, -8, -9]
    X = X.replace(sentinel, np.nan)
    X_test = X_test.replace(sentinel, np.nan)

    # Impute NaNs with train medians
    medians = X.median()
    X = X.fillna(medians)
    X_test = X_test.fillna(medians)

    # display(X)
    # display(X_test)

    # scaler = StandardScaler()
    # X_scaled = scaler.fit_transform(X)
    # X_test_scaled = scaler.transform(X_test)

    return X, y, X_test

X_heloc_train, y_heloc_train, X_heloc_test = load_heloc()

print("HELOC Train:", X_heloc_train.shape, y_heloc_train.shape)
print("HELOC Test:", X_heloc_test.shape)

y_heloc_series = pd.Series(y_heloc_train)
print(y_heloc_series.value_counts().sort_index())

X_heloc_train

HELOC Train: (9413, 23) (9413,)
HELOC Test: (1046, 23)
0    4488
1    4925
Name: count, dtype: int64


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,69.0,148.0,4.0,66.0,41.0,0.0,0.0,100.0,15.0,7.0,...,10.0,0.0,1.0,1.0,32.0,60.0,7.0,3.0,1.0,50.0
1,77.0,229.0,3.0,109.0,23.0,0.0,0.0,100.0,15.0,7.0,...,35.0,0.0,0.0,0.0,38.0,93.0,4.0,3.0,1.0,58.0
2,58.0,46.0,7.0,38.0,13.0,0.0,0.0,93.0,8.0,4.0,...,50.0,0.0,2.0,2.0,80.0,84.0,5.0,4.0,1.0,90.0
3,72.0,186.0,6.0,76.0,20.0,0.0,0.0,97.0,15.0,6.0,...,33.0,0.0,1.0,1.0,30.0,73.0,3.0,2.0,1.0,67.0
4,80.0,226.0,2.0,66.0,35.0,0.0,0.0,100.0,15.0,7.0,...,47.0,0.0,0.0,0.0,2.0,77.0,5.0,7.0,0.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9408,65.0,115.0,11.0,43.0,19.0,0.0,0.0,90.0,1.0,4.0,...,50.0,0.0,0.0,0.0,52.0,77.0,5.0,6.0,0.0,85.0
9409,77.0,437.0,8.0,115.0,35.0,0.0,0.0,100.0,15.0,7.0,...,26.0,1.0,3.0,3.0,23.0,74.0,6.0,2.0,0.0,50.0
9410,75.0,140.0,7.0,56.0,21.0,0.0,0.0,100.0,15.0,7.0,...,27.0,0.0,1.0,1.0,20.0,63.0,3.0,2.0,1.0,56.0
9411,64.0,92.0,3.0,35.0,21.0,2.0,2.0,91.0,33.0,6.0,...,23.0,0.0,1.0,1.0,9.0,58.0,3.0,4.0,0.0,53.0


In [8]:
# from EDA.eda_covtype import X_train_enc


def load_higgs():
    """Load and preprocess the HIGGS dataset.

    - training.csv with columns: EventId, 30 features, Weight, Label (b/s or 0/1).
    - test.csv with EventId and the 30 feature columns.

    We treat -999.0 as missing and impute with medians.
    """
    df_train = pd.read_csv(HIGGS_TRAIN)
    df_test  = pd.read_csv(HIGGS_TEST)


    y_train = df_train["Label"]
    y_train = (y_train == "s").astype(int).values

    w_train = df_train["Weight"].values.astype(np.float32)
 
    # Features: drop ID, Weight, label columns
    drop_cols = [c for c in ["EventId", "Weight", "Label", "label"] if c in df_train.columns]
    feature_cols = [c for c in df_train.columns if c not in drop_cols]

    X_train = df_train[feature_cols].copy()
    X_test = df_test[feature_cols].copy()

    return X_train, y_train, w_train, X_test 


    # # Replace sentinel -999.0 with NaN
    # X_train = X_train.replace(-999.0, np.nan).astype(np.float32)
    # X_test = X_test.replace(-999.0, np.nan).astype(np.float32)

    # # Impute with medians
    # medians = X_train.median()
    # X_train = X_train.fillna(medians)
    # X_test_train = X_test.fillna(medians)

    # # scaler = StandardScaler()
    # # X_scaled = scaler.fit_transform(X)
    # # X_test_scaled = scaler.transform(X_test)

    # # Event IDs (optional, useful for submissions)
    # event_id_train = df_train["EventId"] if "EventId" in df_train.columns else None
    # event_id_test  = df_test["EventId"] if "EventId" in df_test.columns else None

    


X_higgs_train, y_higgs_train, w_higgs_train, X_higgs_test = load_higgs()


print("HIGGS:", X_higgs_train.shape, y_higgs_train.shape)
print(len(w_higgs_train))

HIGGS: (175000, 30) (175000,)
175000


In [9]:
def embed_block(X, index, dimensions):
    """Place a dataset's features into its slice of the unified space.

    dataset_idx: 0 = CoverType, 1 = HELOC, 2 = HIGGS
    """
    # Number of rows in de X trainingset:
    n_samples = X.shape[0]


    embedded = np.zeros((n_samples, dimensions), dtype=np.float32)

    def set_index(index):
        if index == 0:
            return 0, d_cov
        elif index == 1:
            return d_cov, d_cov + d_heloc
        elif index == 2: 
            return d_cov + d_heloc, d_cov + d_heloc + d_higgs

    start, end = set_index(index)
    # print(start)
    # print(end)

    # fill in the dataset with the 110 dimensions which will later be placed on top of each other to create the full dataset
    embedded[:, start:end] = X

    # Dataset indicator in the final 3 positions
    embedded[:, D_total - 3 + index] = 1
    return embedded

# Label the target columns to avoid collisions across datasets
cov_unique = np.sort(np.unique(y_cov_train))
cov_map = {v: i for i, v in enumerate(cov_unique)}
y_cov_int = np.array([cov_map[v] for v in y_cov_train], dtype=np.int64)     # 0..6

y_heloc_int = y_heloc_train.astype(np.int64) + 7                            # 7,8
y_higgs_int = y_higgs_train.astype(np.int64) + 9                            # 9,10

d_cov = X_cov_train.shape[1]
d_heloc = X_heloc_train.shape[1]
d_higgs = X_higgs_train.shape[1]

D_total = d_cov + d_heloc + d_higgs + 3

# Embed features (train)
X_cov_emb   = embed_block(X_cov_train, 0, D_total)
X_heloc_emb = embed_block(X_heloc_train, 1, D_total)
X_higgs_emb = embed_block(X_higgs_train, 2, D_total)


# Embed features (test) – needed for the submission step
X_cov_test_emb   = embed_block(X_cov_test, 0, D_total)
X_heloc_test_emb = embed_block(X_heloc_test, 1, D_total)
X_higgs_test_emb = embed_block(X_higgs_test, 2, D_total)

# # Stack the datasets on top of each other
X_train = np.vstack([X_cov_emb, X_heloc_emb, X_higgs_emb])
y_train = np.concatenate([y_cov_int, y_heloc_int, y_higgs_int])

w_cov_train = np.ones_like(y_cov_train, dtype=np.float32)
w_heloc_train = np.ones_like(y_heloc_train, dtype=np.float32)
w_train = np.concatenate([w_cov_train, w_heloc_train, w_higgs_train])

# Statistics of the dataset
print("--- Shapes of X ---")
print(f"X_cov shape:   {X_cov_train.shape}")
print(f"X_heloc shape: {X_heloc_train.shape}")
print(f"X_higgs shape: {X_higgs_train.shape}")

print(f"Rows total X:   {X_cov_train.shape[0] + X_heloc_train.shape[0] + X_higgs_train.shape[0]}")
print("\n" + "="*30 + "\n")

# Check the shape of all the y
print("--- Shapes of y ---")
print(f"y_cov shape:   {y_cov_train.shape}")
print(f"y_heloc shape: {y_heloc_train.shape}")
print(f"y_higgs shape: {y_higgs_train.shape}")

# Feature block sizes
d_cov   = X_cov_train.shape[1]
d_heloc = X_heloc_train.shape[1]
d_higgs = X_higgs_train.shape[1]

print(f"Rows total y:   {y_cov_train.shape[0] + y_heloc_train.shape[0] + y_higgs_train.shape[0]}")
print("\n" + "="*30 + "\n")


print(f"X_total shape: {X_train.shape}")
print(f"y_total shape: {y_train.shape}")
print(f"w_total_shape: {len(w_train)}")

--- Shapes of X ---
X_cov shape:   (58101, 54)
X_heloc shape: (9413, 23)
X_higgs shape: (175000, 30)
Rows total X:   242514


--- Shapes of y ---
y_cov shape:   (58101,)
y_heloc shape: (9413,)
y_higgs shape: (175000,)
Rows total y:   242514


X_total shape: (242514, 110)
y_total shape: (242514,)
w_total_shape: 242514


In [10]:
def config_smote_strategy(y, smote_ratio):
    counts = pd.Series(y).value_counts()
    max_class = counts.idxmax()
    max_count = counts.max()
    smote_strategy = {
        cls: int(max_count * smote_ratio)
        for cls in counts.index
        if counts[cls] < int(max_count * smote_ratio) 
    }
    return smote_strategy, counts

def get_scaler_options():
    """
    Returns a dictionary mapping of scaler names to their objects.
    """
    return {
        "standard": StandardScaler(), 
        "minmax": MinMaxScaler(), 
        "robust": RobustScaler(), 
        "none": None,
    }

In [None]:
# print(f"Available CPU cores: {os.cpu_count()}")


# def train_model(model_name, X, y, imputer, smote_ratio, scaler_type, use_grid):
    
#     # 1. Select the scaler that we will be using for scaling the data
#     scaler_map = get_scaler_options()
#     selected_scaler = scaler_map.get(scaler_type.lower())
#     print("\n" + "="*30 + "\n")
#     print(f"Selected scalar: {selected_scaler}")


#     # 2. Configure the smote strategy that will be used for resampling
#     smote_strategy, counts = config_smote_strategy(y, smote_ratio)
#     print("\n" + "="*30 + "\n")
#     print(f"y count before SMOTE: {counts}")

#     # 3. Configure folding strategy
#     cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

#     # 4.Configure the model chosen by the user, if no grid search is chosen the following hyperparameters are chosen by default.
#     if model_name == "mlp":
#         classifier = MLPClassifier(
#             hidden_layer_sizes=(256, 256),
#             activation="relu",
#             solver="adam",
#             alpha=1e-4,
#             batch_size=256,
#             learning_rate_init=1e-3,
#             max_iter=80,
#             early_stopping=True,
#             n_iter_no_change=5,
#             random_state=SEED,
#             verbose=False
#         )
#     elif model_name == "xgb":
#         classifier = XGBClassifier(
#             n_estimators=400,
#             max_depth=6,
#             learning_rate=0.05,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             objective="multi:softprob",
#             tree_method="hist",
#             random_state=SEED,
#             n_jobs=-1
#         )
#     print("\n" + "="*30 + "\n")
#     print(f"Chosen model: {model_name}")

#     # 5. Define the Imblearn Pipeline
#     pipeline = ImbPipeline([
#         ('imputer', SimpleImputer(strategy='median')),
#         ('smote', SMOTE(sampling_strategy=smote_strategy, random_state=SEED)),
#         ('scaler', selected_scaler), 
#         ('classifier', classifier)
#     ])

#     # 6. Handle Grid Search logic if grid search is chosen
#     if use_grid:
#         if model_name == "mlp":
#             param_grid = {
#                 "classifier__hidden_layer_sizes": [(128, 128), (256, 256)],
#                 "classifier__alpha": [1e-4, 1e-3],
#                 "classifier__learning_rate_init": [1e-3, 5e-4],
#                 "smote__k_neighbors": [3, 5, 7]
#             }
#         elif model_name == "xgb":
#             param_grid = {
#                 "classifier__n_estimators": [300, 500],
#                 "classifier__max_depth": [5, 7],
#                 "classifier__learning_rate": [0.05, 0.1],
#                 "classifier__subsample": [0.8, 1.0],
#                 "classifier__colsample_bytree": [0.8, 1.0],
#                 "smote__k_neighbors": [3, 5, 7]
#             }
#         # 'model' becomes the GridSearchCV object which wraps the pipeline
#         model = GridSearchCV(
#             estimator=pipeline, 
#             param_grid=param_grid,
#             cv=cv_strategy, 
#             scoring="f1_weighted", # F1-weighted is recommended for imbalance
#             n_jobs=-1, 
#             verbose=1,
#             refit=True, 
#         )
#     else:
#         # 'model' becomes the simple ImbPipeline object
#         model = pipeline
        
#     print("\n" + "="*30 + "\n")
#     print(f"Grid search: {use_grid}")

#     # model.fit(X_train, y_train, classifier__sample_weight=w_train if w_train is not None else None)
#     model.fit(X_train, y_train)


#     return model, val_acc


# use_grid = False
# imputer = "median"
# smote_ratio = 0.8
# scalar_type = "Standard"

# mlp_model, mlp_score = train_model("mlp", X_train, y_train, imputer, smote_ratio, scalar_type, use_grid)

# print(mlp_model, mlp_score)

In [14]:
def train_model(model_name, X, y, imputer, smote_ratio, scaler_type, use_grid, w=None, test_size=0.2):
    """
    Train a model with preprocessing pipeline.
    
    Parameters:
    -----------
    model_name : str
        Either "mlp" or "xgb"
    X : array-like
        Feature matrix
    y : array-like
        Target labels
    imputer : str
        Imputation strategy ('median', 'mean', etc.)
    smote_ratio : float
        Target ratio for SMOTE resampling (0-1)
    scaler_type : str
        Type of scaler: "standard", "minmax", "robust", or "none"
    use_grid : bool
        Whether to use GridSearchCV
    w : array-like, optional
        Sample weights for training (e.g., for HIGGS dataset)
    test_size : float, optional
        Proportion of data to use for validation (default 0.2)
    
    Returns:
    --------
    model : fitted model
        Trained model (either ImbPipeline or GridSearchCV)
    val_score : float
        Validation F1-weighted score
    """
    # 1. Handle data splitting based on whether we're using GridSearchCV
    # GridSearchCV handles splits internally via CV, so we only split for non-grid case
    if use_grid:
        # For GridSearchCV: use all data, CV will handle splits
        X_train, y_train = X, y
        X_val, y_val = None, None
    else:
        # For non-grid: split for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=test_size, random_state=SEED, stratify=y
        )
    
    # 2. Select the scaler that we will be using for scaling the data
    scaler_map = get_scaler_options()
    selected_scaler = scaler_map.get(scaler_type.lower())
    
    # Handle None scaler - use 'passthrough' in pipeline
    if selected_scaler is None:
        scaler_step = 'passthrough'
    else:
        scaler_step = selected_scaler
    
    print("\n" + "="*30 + "\n")
    print(f"Selected scaler: {scaler_type} ({selected_scaler})")

    # 3. Configure the smote strategy that will be used for resampling
    smote_strategy, counts = config_smote_strategy(y_train, smote_ratio)
    print("\n" + "="*30 + "\n")
    print(f"y count before SMOTE:\n{counts}")

    # 4. Configure folding strategy for GridSearchCV
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    # 5. Configure the model chosen by the user
    if model_name == "mlp":
        classifier = MLPClassifier(
            hidden_layer_sizes=(256, 256),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            batch_size=256,
            learning_rate_init=1e-3,
            max_iter=5,
            early_stopping=True,
            n_iter_no_change=5,
            random_state=SEED,
            verbose=True  # Enable to see training iterations
        )
    elif model_name == "xgb":
        classifier = XGBClassifier(
            n_estimators=400,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softprob",
            tree_method="hist",
            random_state=SEED,
            n_jobs=-1
        )
    else:
        raise ValueError(f"Unknown model: {model_name}. Use 'mlp' or 'xgb'")
    
    print("\n" + "="*30 + "\n")
    print(f"Chosen model: {model_name}")

    # 6. Define the Imblearn Pipeline
    # Use the imputer parameter instead of hardcoding 'median'
    pipeline = ImbPipeline([
        ('imputer', SimpleImputer(strategy=imputer if isinstance(imputer, str) else 'median')),
        ('smote', SMOTE(sampling_strategy=smote_strategy, random_state=SEED, k_neighbors=5)),
        ('scaler', scaler_step), 
        ('classifier', classifier)
    ])

    # 7. Handle Grid Search logic if grid search is chosen
    if use_grid:
        if model_name == "mlp":
            param_grid = {
                "classifier__hidden_layer_sizes": [(128, 128), (256, 256)],
                "classifier__alpha": [1e-4, 1e-3],
                "classifier__learning_rate_init": [1e-3, 5e-4],
                "smote__k_neighbors": [3, 5, 7]
            }
        elif model_name == "xgb":
            param_grid = {
                "classifier__n_estimators": [300, 500],
                "classifier__max_depth": [5, 7],
                "classifier__learning_rate": [0.05, 0.1],
                "classifier__subsample": [0.8, 1.0],
                "classifier__colsample_bytree": [0.8, 1.0],
                "smote__k_neighbors": [3, 5, 7]
            }
        
        # Calculate total number of fits for progress tracking
        total_combinations = 1
        for param_values in param_grid.values():
            total_combinations *= len(param_values)
        total_fits = total_combinations * cv_strategy.n_splits
        print(f"\nGridSearchCV: Testing {total_combinations} parameter combinations")
        print(f"  with {cv_strategy.n_splits}-fold CV = {total_fits} total fits")
        
        # 'model' becomes the GridSearchCV object which wraps the pipeline
        model = GridSearchCV(
            estimator=pipeline, 
            param_grid=param_grid,
            cv=cv_strategy, 
            scoring="f1_weighted",  # F1-weighted is recommended for imbalance
            n_jobs=-1, 
            verbose=2,  # Increased verbosity to see CV progress
            refit=True, 
        )
    else:
        # 'model' becomes the simple ImbPipeline object
        model = pipeline
        
    print("\n" + "="*30 + "\n")
    print(f"Grid search: {use_grid}")

    # 8. Fit the model
    fit_params = {}
    if w is not None:
        if use_grid:
            # For GridSearchCV: use all weights
            fit_params['classifier__sample_weight'] = w
        else:
            # For non-grid: split weights to match train/val split
            _, _, w_train, _ = train_test_split(
                X, w, test_size=test_size, random_state=SEED
            )
            fit_params['classifier__sample_weight'] = w_train
    
    print("\n" + "="*30)
    print("FITTING MODEL")
    print("="*30)
    
    if use_grid:
        print("Starting GridSearchCV with cross-validation...")
        print("(Progress will show: [CV] fold scores for each parameter combination)\n")
    else:
        print("Training model on training set...")
        if model_name == "mlp":
            print("(MLP will show iteration progress below)\n")
        elif model_name == "xgb":
            print("(XGBoost training in progress...)\n")
    
    model.fit(X_train, y_train, **fit_params)

    # 9. Evaluate model
    if use_grid:
        # For GridSearchCV: use the best CV score and show detailed results
        val_score = model.best_score_
        print("\n" + "="*30)
        print("GRIDSEARCH RESULTS")
        print("="*30)
        print(f"Best CV F1-weighted score: {val_score:.4f}")
        print(f"\nBest parameters:")
        for param, value in model.best_params_.items():
            print(f"  {param}: {value}")
        
        # Show CV scores for all parameter combinations
        print(f"\nAll CV results (mean ± std across {cv_strategy.n_splits} folds):")
        results_df = pd.DataFrame(model.cv_results_)
        # Sort by mean test score descending
        results_df = results_df.sort_values('mean_test_score', ascending=False)
        
        # Show top 5 results
        print("\nTop 5 parameter combinations:")
        for idx, row in results_df.head(5).iterrows():
            mean_score = row['mean_test_score']
            std_score = row['std_test_score']
            params = {k.replace('param_', ''): v for k, v in row.items() if k.startswith('param_')}
            print(f"  Score: {mean_score:.4f} (±{std_score:.4f}) | Params: {params}")
        
        if len(results_df) > 5:
            print(f"  ... and {len(results_df) - 5} more combinations")
    else:
        # For non-grid: evaluate on validation set
        print("\n" + "="*30)
        print("EVALUATING ON VALIDATION SET")
        print("="*30)
        print(f"Validation set size: {len(X_val)} samples")
        
        y_val_pred = model.predict(X_val)
        val_score = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
        val_acc = accuracy_score(y_val, y_val_pred)
        
        print(f"\nValidation Metrics:")
        print(f"  F1-weighted score: {val_score:.4f}")
        print(f"  Accuracy: {val_acc:.4f}")
        
        # Show per-class metrics if not too many classes
        unique_classes = len(np.unique(y_val))
        if unique_classes <= 20:  # Only show if reasonable number of classes
            print(f"\nPer-class F1 scores:")
            f1_per_class = f1_score(y_val, y_val_pred, average=None, zero_division=0)
            for cls, f1 in enumerate(f1_per_class):
                print(f"  Class {cls}: {f1:.4f}")

    return model, val_score, X_val, y_val

In [15]:
use_grid = False
imputer = "median"
smote_ratio = 0.8
scalar_type = "Standard"

model, score, X_val, y_val = train_model("mlp", X_train, y_train, imputer, smote_ratio, scalar_type, use_grid)


print(model, score)



Selected scaler: Standard (StandardScaler())


y count before SMOTE:
9     92171
10    47829
1     22598
0     17037
8      3940
7      3590
2      2886
6      1642
5      1365
4       746
3       207
Name: count, dtype: int64


Chosen model: mlp


Grid search: False

FITTING MODEL
Training model on training set...
(MLP will show iteration progress below)

Iteration 1, loss = 0.48829260
Validation score: 0.834945
Iteration 2, loss = 0.33690255
Validation score: 0.865685
Iteration 3, loss = 0.28937803
Validation score: 0.881211
Iteration 4, loss = 0.25974384
Validation score: 0.887733
Iteration 5, loss = 0.23752651
Validation score: 0.901548

EVALUATING ON VALIDATION SET
Validation set size: 48503 samples





Validation Metrics:
  F1-weighted score: 0.8187
  Accuracy: 0.8175

Per-class F1 scores:
  Class 0: 0.8065
  Class 1: 0.8451
  Class 2: 0.8392
  Class 3: 0.6857
  Class 4: 0.5678
  Class 5: 0.7292
  Class 6: 0.8496
  Class 7: 0.6394
  Class 8: 0.7154
  Class 9: 0.8640
  Class 10: 0.7498
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('smote',
                 SMOTE(random_state=42,
                       sampling_strategy={0: 73736, 1: 73736, 2: 73736,
                                          3: 73736, 4: 73736, 5: 73736,
                                          6: 73736, 7: 73736, 8: 73736,
                                          10: 73736})),
                ('scaler', StandardScaler()),
                ('classifier',
                 MLPClassifier(batch_size=256, early_stopping=True,
                               hidden_layer_sizes=(256, 256), max_iter=5,
                               n_iter_no_change=5, random_state=42,
                       

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def eval_dataset(X_emb, y_int, name):
    y_pred_int = model.predict(X_emb)
    acc = accuracy_score(y_int, y_pred_int)
    print(f"{name} accuracy (on provided data): {acc:.4f}")
    return acc

print("CoverType:", eval_dataset(X_cov_emb,   y_cov_int, "CoverType"))
print("HELOC:   ", eval_dataset(X_heloc_emb, y_heloc_int, "HELOC"))
print("HIGGS:   ", eval_dataset(X_higgs_emb, y_higgs_int, "HIGGS"))

def evaluate_model(model, X, y_true, average_type='weighted'):
    """
    Compute and print evaluation metrics for a trained model.
    
    Parameters:
        model: Trained model (must support predict method)
        X: Features (numpy array or dataframe)
        y_true: True labels (numpy array or series)
        average_type: Averaging type for multiclass ('micro', 'macro', 'weighted')
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average=average_type, zero_division=0)
    rec = recall_score(y_true, y_pred, average=average_type, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average_type, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)


    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Example usage for validation set (weighted average for multiclass):
print("\n === MLP Model Validation Metrics ===")
evaluate_model(model, X_val, y_val, average_type='weighted')
# If xgb_model un-commented above, you can also evaluate it:
# print("=== XGB Model Validation Metrics ===")
# evaluate_model(xgb_model, X_val, y_val, average_type='weighted')


CoverType accuracy (on provided data): 0.8396
CoverType: 0.8395724686322095
HELOC accuracy (on provided data): 0.8014
HELOC:    0.8014448103686391
HIGGS accuracy (on provided data): 0.8269
HIGGS:    0.8269371428571428

 === MLP Model Validation Metrics ===
Accuracy: 0.817537059563326
Precision: 0.8223239732367363
Recall: 0.817537059563326
F1 Score: 0.8186767815737364
Confusion Matrix:
 [[ 3196   928     5     0    30     2    98     0     0     1     0]
 [  449  4845    92     0   174    75    15     0     0     0     0]
 [    0     9   634    11     4    63     0     0     0     0     0]
 [    0     0    10    36     0     6     0     0     0     0     0]
 [    0    24     4     0   157     1     0     0     0     0     0]
 [    0     8    45     6     2   280     0     0     0     0     0]
 [   21     2     0     0     0     0   387     0     0     0     0]
 [    0     0     0     0     0     0     0   531   367     0     0]
 [    0     0     0     0     0     0     0   232   753    

In [19]:
# Reverse mapping for predictions
# CoverType: 0-6 -> 1-7
inverse_cov_map = {v: k for k, v in cov_map.items()}

# For MLP
y_cov_test_pred = model.predict(X_cov_test_emb)
y_heloc_test_pred = model.predict(X_heloc_test_emb)
y_higgs_test_pred = model.predict(X_higgs_test_emb)

# Predictions on test sets
# y_cov_test_pred = clf.predict(X_cov_test_emb)
# y_heloc_test_pred = clf.predict(X_heloc_test_emb)
# y_higgs_test_pred = clf.predict(X_higgs_test_emb)


# Convert predictions back to original label space
# CoverType: 0-6 -> 1-7
y_cov_test_pred_orig = np.array([inverse_cov_map[pred] for pred in y_cov_test_pred])

# HELOC: 7, 8 -> 0, 1
y_heloc_test_pred_orig = (y_heloc_test_pred - 7).astype(int)

# HIGGS: 9, 10 -> 0, 1
y_higgs_test_pred_orig = (y_higgs_test_pred - 9).astype(int)

print("CoverType predictions (original labels 1-7):", np.unique(y_cov_test_pred_orig))
print("HELOC predictions (original labels 0-1):", np.unique(y_heloc_test_pred_orig))
print("HIGGS predictions (original labels 0-1):", np.unique(y_higgs_test_pred_orig))


CoverType predictions (original labels 1-7): [1 2 3 4 5 6 7]
HELOC predictions (original labels 0-1): [0 1]
HIGGS predictions (original labels 0-1): [0 1]


In [20]:

# Generate submission file with original label encoding

# CoverType: IDs start at 1
cov_df = pd.DataFrame({
    "ID": np.arange(1, 1 + len(y_cov_test_pred_orig)),
    "Prediction": y_cov_test_pred_orig
})

# HELOC: IDs start at 3501
heloc_start = 3501
heloc_df = pd.DataFrame({
    "ID": np.arange(heloc_start, heloc_start + len(y_heloc_test_pred_orig)),
    "Prediction": y_heloc_test_pred_orig
})

# HIGGS: IDs start at 4547
higgs_start = 4547
higgs_df = pd.DataFrame({
    "ID": np.arange(higgs_start, higgs_start + len(y_higgs_test_pred_orig)),
    "Prediction": y_higgs_test_pred_orig
})

# Merge all into one CSV
submission = pd.concat([cov_df, heloc_df, higgs_df], ignore_index=True)

print("\nSubmission preview:")
print(submission.head(10))
print("...")
print(submission.tail(10))
print(f"\nTotal rows: {len(submission)}")

# Save
submission_path = "combined_submission.csv"
submission.to_csv(submission_path, index=False)

print(f"\n✓ Saved unified submission to: {submission_path}")



Submission preview:
   ID  Prediction
0   1           1
1   2           1
2   3           1
3   4           1
4   5           1
5   6           2
6   7           1
7   8           2
8   9           2
9  10           2
...
          ID  Prediction
79536  79537           1
79537  79538           1
79538  79539           0
79539  79540           1
79540  79541           1
79541  79542           1
79542  79543           0
79543  79544           0
79544  79545           0
79545  79546           0

Total rows: 79546

✓ Saved unified submission to: combined_submission.csv
