In [14]:
import warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
# Keep long tuning runs readable by muting two noisy warning sources.
# 1) Every warning of category scikit-learn ConvergenceWarning.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

# 2) The feature-name mismatch warning that mentions LGBMRegressor; `message`
#    is a regular expression matched against the start of the warning text.
warnings.filterwarnings(
    action="ignore",
    message="X does not have valid feature names, but LGBMRegressor was fitted with feature names",
)

In [15]:
import optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer
from numpy.linalg import LinAlgError
from scipy.stats import skew, kurtosis

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    Matern,
    RationalQuadratic,
    ExpSineSquared,
    DotProduct,
    WhiteKernel,
    ConstantKernel
)

import os
from optuna.integration import LightGBMPruningCallback
from optuna.exceptions import TrialPruned
import json
import time

In [16]:
def compute_volume_weighted_component_features(X, n_components=5, n_properties=10):
    """
    Build the volume-weighted interaction features
    ``W{j}P{k} = Component{j}_fraction * Component{j}_Property{k}``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``Component{j}_fraction`` and
        ``Component{j}_Property{k}`` for every j in 1..n_components and
        k in 1..n_properties.
    n_components : int, default 5
        Number of blend components (j runs from 1 to n_components).
    n_properties : int, default 10
        Number of properties per component (k runs from 1 to n_properties).

    Returns
    -------
    pandas.DataFrame
        ``n_components * n_properties`` columns named ``W{j}P{k}``, ordered
        component-major (W1P1, W1P2, ..., W2P1, ...). With the defaults this
        is the original 50-feature layout.

    Raises
    ------
    KeyError
        If a required column is missing from ``X``.
    """
    features = {}
    for comp_idx in range(1, n_components + 1):
        # The fraction column is shared by all properties of this component,
        # so look it up once per component instead of once per feature.
        fraction = X[f'Component{comp_idx}_fraction']
        for prop_idx in range(1, n_properties + 1):
            prop_col = f'Component{comp_idx}_Property{prop_idx}'
            features[f'W{comp_idx}P{prop_idx}'] = fraction * X[prop_col]
    return pd.DataFrame(features)

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import QuantileTransformer
import os

def clip_outliers_iqr(df, factor=1.5):
    """
    Clip every numeric column to its Tukey fences ``[Q1 - factor*IQR, Q3 + factor*IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified in place.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range when computing the fences.

    Returns
    -------
    (pandas.DataFrame, dict)
        The clipped copy, and a ``{column: count}`` mapping listing only the
        columns in which at least one value was actually clipped.
    """
    clipped_df = df.copy()
    clip_info = {}

    for col in clipped_df.select_dtypes(include=np.number).columns:
        values = clipped_df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr

        # Count values outside the fences directly. Comparing the column
        # before/after clipping with `!=` would also count every NaN, because
        # NaN != NaN evaluates to True even though NaNs are never clipped.
        n_clipped = int(((values < lower) | (values > upper)).sum())

        clipped_df[col] = values.clip(lower, upper)

        if n_clipped > 0:
            clip_info[col] = n_clipped

    return clipped_df, clip_info

def apply_quantile_transform(df):
    """
    Return a copy of ``df`` in which each numeric column has been mapped to a
    normal output distribution by its own freshly fitted QuantileTransformer.

    The number of quantiles is ``min(1000, len(df))``. Non-numeric columns are
    left as they are, and the input frame is not modified.
    """
    result = df.copy()
    quantile_count = min(1000, len(df))
    numeric_columns = list(result.select_dtypes(include=np.number).columns)

    for name in numeric_columns:
        # One independent transformer per column, fitted on this frame only.
        transformer = QuantileTransformer(
            n_quantiles=quantile_count,
            output_distribution='normal',
        )
        column_frame = result[[name]]
        result[name] = transformer.fit_transform(column_frame).ravel()

    return result

def get_data(target, clipiqr=False, quantiletransform=False, threshold=0.1, get_test=False):
    """
    Load, preprocess and feature-select the matrices for one target.

    Parameters
    ----------
    target : str
        Target name; selects ``{BASE_PATH}/train|val/{target}_X.csv`` /
        ``_y.csv`` and the ``{target}.csv`` feature-importance file.
    clipiqr : bool, default False
        Apply ``clip_outliers_iqr`` to each feature frame.
    quantiletransform : bool, default False
        Apply ``apply_quantile_transform`` to each feature frame.
    threshold : float, default 0.1
        Keep only features whose ``importance`` is strictly greater than this.
    get_test : bool, default False
        Also load, preprocess and return the test features.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_val, y_val)``, with ``X_test`` appended as a
        fifth element when ``get_test`` is true. Targets are flattened to 1-D.
    """
    frames = {
        "train": pd.read_csv(f"{BASE_PATH}/train/{target}_X.csv"),
        "val": pd.read_csv(f"{BASE_PATH}/val/{target}_X.csv"),
    }
    y_train = pd.read_csv(f"{BASE_PATH}/train/{target}_y.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{target}_y.csv")
    if get_test:
        # Read the test file only on request; callers that do not need it
        # (e.g. every tuning trial) no longer pay for or depend on this file.
        frames["test"] = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")

    prepared = {}
    for split, frame in frames.items():
        # NOTE(review): clipping fences and quantile maps are fitted on each
        # split independently, so train/val/test are not transformed with the
        # same statistics. Confirm this is intended before changing it, since
        # the stored best parameters were tuned with this behaviour.
        if clipiqr:
            frame, _ = clip_outliers_iqr(frame)
        if quantiletransform:
            frame = apply_quantile_transform(frame)

        # Feature engineering: append the W{j}P{k} interaction columns.
        engineered = compute_volume_weighted_component_features(frame)
        prepared[split] = pd.concat([frame, engineered], axis=1)

    # Feature selection: names come from the first column of the importance file.
    importance = pd.read_csv(
        os.path.join("/pscratch/sd/r/ritesh11/temp_dir/feature_importance", f"{target}.csv")
    )
    cols = importance[importance["importance"] > threshold].iloc[:, 0].tolist()

    result = (
        prepared["train"][cols].values,
        y_train.values.ravel(),
        prepared["val"][cols].values,
        y_val.values.ravel(),
    )
    if get_test:
        result += (prepared["test"][cols].values,)
    return result


In [18]:
# The ten regression targets, BlendProperty1 .. BlendProperty10.
TARGETS = ["BlendProperty" + str(idx) for idx in range(1, 11)]
# Root directory of the per-target train/val CSV splits read by get_data().
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset/updated"
# Directory that holds the per-target best-parameter JSON files.
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/LGBM_models"
# Trial budget for the Optuna search.
N_TRIALS = 200

In [19]:
# GaussianProcessRegressor arguments that are held constant (never tuned).
gpr_fixed_params = dict(
    random_state=42,
    optimizer="fmin_l_bfgs_b",
)

In [20]:
# LGBMRegressor arguments that are held constant (never tuned).
lgbm_fixed_params = dict(
    objective="regression",
    n_estimators=2000,
    verbosity=-1,
    random_state=42,
    device_type="cpu",
    # gpu_use_dp=True
)

In [21]:
def get_kernel(trial):
    """
    Let the Optuna trial pick a Gaussian-process kernel and build it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Used to suggest the categorical ``"kernel"`` and, only when the Matern
        kernel is chosen, the categorical ``"matern_nu"``.

    Returns
    -------
    sklearn kernel instance
        RBF, Matern, RationalQuadratic or DotProduct, each with hyperparameter
        bounds of (1e-5, 1e5).

    Raises
    ------
    ValueError
        If the suggested kernel name is not one of the supported choices
        (previously this surfaced as an unbound-local error at ``return``).
    """
    kernel_choice = trial.suggest_categorical("kernel", [
        "RBF", "Matern", "RQ", "DotProduct"
    ])

    if kernel_choice == "RBF":
        return RBF(length_scale_bounds=(1e-5, 1e5))

    if kernel_choice == "Matern":
        # Smoothness is only part of the search space for the Matern kernel.
        nu = trial.suggest_categorical("matern_nu", [0.5, 1.5, 2.5])
        return Matern(nu=nu, length_scale_bounds=(1e-5, 1e5))

    if kernel_choice == "RQ":
        return RationalQuadratic(length_scale_bounds=(1e-5, 1e5), alpha_bounds=(1e-5, 1e5))

    if kernel_choice == "DotProduct":
        return DotProduct(sigma_0_bounds=(1e-5, 1e5))

    # Same failure mode and message as get_kernel_from_config.
    raise ValueError(f"Unsupported kernel choice: {kernel_choice}")

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.exceptions import NotFittedError
from optuna.exceptions import TrialPruned
from numpy.linalg import LinAlgError
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb

def train_gpr(trial, X_train, y_train):
    """
    Fit a GaussianProcessRegressor whose tunable settings come from ``trial``.

    The kernel is chosen via ``get_kernel``; ``alpha`` is sampled on a log
    scale in [1e-12, 1e-3]; ``n_restarts_optimizer`` is 0 or 1; ``normalize_y``
    is True or False. ``gpr_fixed_params`` supplies the remaining arguments.

    Returns the fitted model. A ``LinAlgError`` or ``ValueError`` raised while
    building or fitting the model is converted into ``TrialPruned``.
    """
    search_space = dict(
        kernel=get_kernel(trial),
        alpha=trial.suggest_float("gpr_alpha", 1e-12, 1e-3, log=True),
        n_restarts_optimizer=trial.suggest_categorical("gpr_n_restarts_optimizer", [0, 1]),
        normalize_y=trial.suggest_categorical("gpr_normalize_y", [True, False]),
    )

    try:
        regressor = GaussianProcessRegressor(**search_space, **gpr_fixed_params)
        regressor.fit(X_train, y_train)
    except (LinAlgError, ValueError) as e:
        # Numerically failed fits end the trial instead of crashing the study.
        raise TrialPruned(f"GPR fitting failed: {e}")
    return regressor

def train_lgb(trial, X_train, y_train, X_val, y_val):
    """
    Fit an LGBMRegressor whose tunable settings come from ``trial``.

    All searched parameters are registered under an ``lgb_`` prefix;
    ``lgbm_fixed_params`` supplies the remaining arguments. Training monitors
    MAPE on ``(X_val, y_val)`` and stops early after 50 rounds without
    improvement. Returns the fitted model.
    """
    tuned = dict(
        learning_rate=trial.suggest_float("lgb_lr", 1e-3, 0.3, log=True),
        # boosting_type=trial.suggest_categorical("lgb_boosting_type", ["gbdt", "dart"]),
        max_depth=trial.suggest_int("lgb_max_depth", 3, 12),
        num_leaves=trial.suggest_int("lgb_num_leaves", 16, 256),
        reg_lambda=trial.suggest_float("lgb_reg_lambda", 1e-4, 100.0, log=True),
        reg_alpha=trial.suggest_float("lgb_reg_alpha", 1e-4, 100.0, log=True),
        bagging_fraction=trial.suggest_float("lgb_bagging_fraction", 0.5, 1.0),
        bagging_freq=trial.suggest_int("lgb_bagging_freq", 1, 7),
        colsample_bytree=trial.suggest_float("lgb_colsample_bytree", 0.4, 1.0),
        min_child_weight=trial.suggest_float("lgb_min_child_weight", 1e-2, 10.0, log=True),
        min_child_samples=trial.suggest_int("lgb_min_child_samples", 5, 100),
        min_split_gain=trial.suggest_float("lgb_min_split_gain", 0.0, 1.0),
        max_bin=trial.suggest_categorical("lgb_max_bin", [128, 256, 512, 1024]),
    )

    # Early stopping is the only active callback; the Optuna pruning callback
    # (LightGBMPruningCallback(trial, "mape")) is intentionally left disabled.
    stop_early = lgb.early_stopping(stopping_rounds=50, verbose=False)

    regressor = lgb.LGBMRegressor(**tuned, **lgbm_fixed_params)
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mape",
        callbacks=[stop_early],
    )
    return regressor

def objective(trial, target, n_splits=5, random_state=42):
    """
    Optuna objective: mean K-fold MAPE of a two-stage GPR / LightGBM stack.

    The trial chooses the preprocessing mode, the feature-importance
    threshold, which model family is the base model (the other family becomes
    the meta model), whether raw features are passed through to the meta
    model (always on when the base model is GPR), all model hyperparameters,
    and the blend weights of base vs. meta predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
    target : str
        Target name forwarded to ``get_data``.
    n_splits : int, default 5
        Number of shuffled K-fold splits over train+val combined.
    random_state : int, default 42
        Seed for the K-fold shuffle.

    Returns
    -------
    float
        Mean MAPE of the blended prediction across the folds.

    Raises
    ------
    optuna.exceptions.TrialPruned
        Propagated from ``train_gpr`` when a GPR fit fails.

    Notes
    -----
    NOTE(review): the meta model is trained on the base model's *in-sample*
    predictions for the training fold (not out-of-fold predictions), and the
    LightGBM early stopping monitors the same fold that is scored afterwards.
    Both make the reported MAPE optimistic; left unchanged here so results
    stay comparable with previously stored studies.
    """
    # Preprocessing choices
    clipping = trial.suggest_categorical("clipping", ['ciqr', 'qtr', False])
    threshold = trial.suggest_categorical("threshold", [0.0001, 0.001, 0.1, 0])

    # Train and val are merged and re-split by K-fold below.
    X_train, y_train, X_val, y_val = get_data(
        target,
        clipiqr=clipping == 'ciqr',
        quantiletransform=clipping == 'qtr',
        threshold=threshold,
    )
    X_all = np.concatenate([X_train, X_val], axis=0)
    y_all = np.concatenate([y_train, y_val], axis=0)

    # Stacking configuration
    base_model_type = trial.suggest_categorical("base_model", ["gpr", "lgb"])
    meta_model_type = "gpr" if base_model_type == "lgb" else "lgb"

    if base_model_type == "gpr":
        passthrough = True  # enforced
    else:
        passthrough = trial.suggest_categorical("passthrough", [True, False])

    # Blend weights are per-trial values, so suggest and normalise them once
    # instead of repeating the same lookup inside every fold.
    w_base = trial.suggest_float("w_base", 0.0, 1.0)
    w_meta = trial.suggest_float("w_meta", 0.0, 1.0)
    total_weight = w_base + w_meta + 1e-8  # epsilon guards against 0 + 0
    w_base /= total_weight
    w_meta /= total_weight

    fold_mapes = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, val_idx in kf.split(X_all):
        X_train_cv, y_train_cv = X_all[train_idx], y_all[train_idx]
        X_val_cv, y_val_cv = X_all[val_idx], y_all[val_idx]

        # === Train base model
        if base_model_type == "gpr":
            base_model = train_gpr(trial, X_train_cv, y_train_cv)
        else:
            base_model = train_lgb(trial, X_train_cv, y_train_cv, X_val_cv, y_val_cv)

        base_preds_train = base_model.predict(X_train_cv)
        base_preds_val = base_model.predict(X_val_cv)

        # === Prepare meta model input
        if passthrough:
            X_meta_train = np.column_stack([X_train_cv, base_preds_train])
            X_meta_val = np.column_stack([X_val_cv, base_preds_val])
        else:
            X_meta_train = base_preds_train.reshape(-1, 1)
            X_meta_val = base_preds_val.reshape(-1, 1)

        # === Train meta model
        if meta_model_type == "gpr":
            meta_model = train_gpr(trial, X_meta_train, y_train_cv)
        else:
            meta_model = train_lgb(trial, X_meta_train, y_train_cv, X_meta_val, y_val_cv)
        meta_preds = meta_model.predict(X_meta_val)

        # === Weighted ensemble and fold metric
        final_preds = w_base * base_preds_val + w_meta * meta_preds
        fold_mapes.append(mean_absolute_percentage_error(y_val_cv, final_preds))

    return np.mean(fold_mapes)

In [23]:
# def objective(trial, target):
#     # Preprocessing hyperparams
#     clipping = trial.suggest_categorical("clipping", ['ciqr', 'qtr', False])
#     threshold = trial.suggest_categorical("threshold", [float('-inf'), 0.0001, 0.001, 0.1, 0])
    
#     clipiqr = False
#     quantiletransform = False

#     if clipping=='ciqr':
#         clipiqr = True
#     elif clipping=='qtr':
#         quantiletransform = True
        
#     # Load data with preprocessing
#     X_train, y_train, X_val, y_val = get_data(
#         target,
#         clipiqr=clipiqr,
#         quantiletransform=quantiletransform,
#         threshold=threshold,
#     )

#     params = {
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),

#         # Tree structure
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "num_leaves": trial.suggest_int("num_leaves", 16, 256),

#         # Regularization
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 100.0, log=True),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 100.0, log=True),

#         # Bagging
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),

#         # Feature sampling
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),

#         # Leaf constraints
#         "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 10.0, log=True),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#         "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),

#         # Binning
#         "max_bin": trial.suggest_categorical("max_bin", [128, 256, 512, 1024])
#     }

#     model = lgb.LGBMRegressor(**params, **fixed_params)

#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_val, y_val)],
#         eval_metric="mape",
#         callbacks=[
#             lgb.early_stopping(stopping_rounds=50, verbose=False),
#             LightGBMPruningCallback(trial, "mape"),
#         ]
#     )

#     preds = model.predict(X_val)
#     return mean_absolute_percentage_error(y_val, preds)


In [24]:
# def log_callback(study, trial):
#         if trial.number % 50 == 0 and trial.value is not None:
#             print(f"Trial {trial.number}: MAPE = {trial.value:.4f}: BEST: {study.best_trial.value:.4f}" )

In [25]:
# optuna.logging.set_verbosity(optuna.logging.WARNING)

In [26]:
# for target in TARGETS[7:]:
#     # pruner = optuna.pruners.MedianPruner(n_startup_trials=50, n_warmup_steps=10)
#     study = optuna.create_study(direction="minimize")

#     study.optimize(
#         lambda trial: objective(trial, target),
#         n_trials=N_TRIALS,
#         show_progress_bar=True,
#         n_jobs=12
#     )

#     print(f"\nBest MAPE for {target}: {study.best_value:.4f}")
#     print(f"Best params for {target}:\n{study.best_params}\n")

#     complete_params = {**study.best_params, **gpr_fixed_params, **lgbm_fixed_params}

#     os.makedirs(model_dir, exist_ok=True)
#     with open(os.path.join(model_dir, f"best_params_{target}_gprexperiment.json"), "w") as f:
#         json.dump(complete_params, f, indent=2)
        
#     with open(os.path.join(model_dir, f"best_mape_{target}.txt"), "w") as f:
#         f.write(f"{study.best_value:.6f}\n")


In [27]:
def get_fixed_data(target,config):
    """
    Load train, validation and test arrays for ``target`` using the
    preprocessing settings stored in ``config``.

    ``config['clipping']`` selects IQR clipping (``'ciqr'``) or the quantile
    transform (``'qtr'``); any other value disables both.
    ``config['threshold']`` is the feature-importance cut-off. The result is
    whatever ``get_data`` returns with ``get_test=True``.
    """
    clipping_mode = config['clipping']
    use_iqr_clipping = clipping_mode == 'ciqr'
    use_quantile_transform = clipping_mode == 'qtr'
    return get_data(
        target,
        clipiqr=use_iqr_clipping,
        quantiletransform=use_quantile_transform,
        threshold=config['threshold'],
        get_test=True,
    )

In [28]:
# === Train base LightGBM model
def train_lgb_model(X_train, y_train, X_val, y_val, config=None):
    """
    Fit an LGBMRegressor with the tuned ``lgb_*`` parameters from ``config``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training data.
    X_val, y_val : array-like
        Evaluation set monitored (metric ``mape``) for early stopping after
        50 rounds without improvement.
    config : dict, optional
        Tuned parameters (keys ``lgb_lr``, ``lgb_max_depth``, ...). When
        omitted, the module-level ``config`` variable is used, which is how
        this function originally obtained its settings.

    Returns
    -------
    lightgbm.LGBMRegressor
        The fitted model.

    Raises
    ------
    NameError
        If ``config`` is not passed and no module-level ``config`` exists.
    KeyError
        If a required ``lgb_*`` key is missing from the configuration.
    """
    if config is None:
        # Backward compatibility: existing callers rely on the notebook-level
        # `config` global. Prefer passing the dict explicitly.
        try:
            config = globals()["config"]
        except KeyError:
            raise NameError(
                "name 'config' is not defined; pass config=... explicitly"
            ) from None

    model = lgb.LGBMRegressor(
        learning_rate=config['lgb_lr'],
        max_depth=config['lgb_max_depth'],
        num_leaves=config['lgb_num_leaves'],
        reg_lambda=config['lgb_reg_lambda'],
        reg_alpha=config['lgb_reg_alpha'],
        bagging_fraction=config['lgb_bagging_fraction'],
        bagging_freq=config['lgb_bagging_freq'],
        colsample_bytree=config['lgb_colsample_bytree'],
        min_child_weight=config['lgb_min_child_weight'],
        min_child_samples=config['lgb_min_child_samples'],
        min_split_gain=config['lgb_min_split_gain'],
        max_bin=config['lgb_max_bin'],
        **lgbm_fixed_params
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mape',
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    return model

# === Train GPR model
def train_gpr_model(X_train, y_train, config=None):
    """
    Fit a GaussianProcessRegressor with the tuned ``gpr_*`` parameters.

    Parameters
    ----------
    X_train, y_train : array-like
        Training data.
    config : dict, optional
        Tuned parameters (``gpr_alpha``, ``gpr_n_restarts_optimizer``,
        ``gpr_normalize_y`` and the kernel settings read by
        ``get_kernel_from_config``). When omitted, the module-level ``config``
        variable is used, which is how this function originally obtained its
        settings.

    Returns
    -------
    sklearn.gaussian_process.GaussianProcessRegressor
        The fitted model.

    Raises
    ------
    NameError
        If ``config`` is not passed and no module-level ``config`` exists.
    KeyError
        If a required ``gpr_*`` key is missing from the configuration.
    """
    if config is None:
        # Backward compatibility: existing callers rely on the notebook-level
        # `config` global. Prefer passing the dict explicitly.
        try:
            config = globals()["config"]
        except KeyError:
            raise NameError(
                "name 'config' is not defined; pass config=... explicitly"
            ) from None

    kernel = get_kernel_from_config(config)
    model = GaussianProcessRegressor(
        kernel=kernel,
        alpha=config['gpr_alpha'],
        n_restarts_optimizer=config['gpr_n_restarts_optimizer'],
        normalize_y=config['gpr_normalize_y'],
        **gpr_fixed_params
    )
    model.fit(X_train, y_train)
    return model

In [29]:
def get_kernel_from_config(config):
    """
    Rebuild the Gaussian-process kernel described by a stored configuration.

    ``config["kernel"]`` names the kernel (defaulting to ``"RBF"`` when the
    key is absent); for ``"Matern"`` the smoothness is ``config["matern_nu"]``
    (defaulting to 1.5). Every kernel hyperparameter gets bounds of
    (1e-5, 1e5).

    Raises ``ValueError`` for any other kernel name.
    """
    choice = config.get("kernel", "RBF")
    bounds = (1e-5, 1e5)

    if choice == "DotProduct":
        return DotProduct(sigma_0_bounds=bounds)

    if choice == "RQ":
        return RationalQuadratic(length_scale_bounds=bounds, alpha_bounds=bounds)

    if choice == "Matern":
        smoothness = config.get("matern_nu", 1.5)  # 1.5 when not stored
        return Matern(nu=smoothness, length_scale_bounds=bounds)

    if choice == "RBF":
        return RBF(length_scale_bounds=bounds)

    raise ValueError(f"Unsupported kernel choice: {choice}")

In [30]:
def train_stacked_model(target,config):
    """
    Train the tuned two-stage stack for ``target`` and predict the test set.

    Parameters
    ----------
    target : str
        Target name forwarded to ``get_fixed_data``.
    config : dict
        Tuned configuration. Read here: ``base_model`` (``'lgb'`` or anything
        else, treated as GPR), optional ``passthrough`` (default True),
        ``w_base`` and ``w_meta``.

    Returns
    -------
    numpy.ndarray
        Weighted blend of base-model and meta-model test predictions.

    Notes
    -----
    NOTE(review): the ``train_lgb_model`` / ``train_gpr_model`` calls below do
    not receive ``config``; they resolve it from module scope. The caller must
    therefore keep the module-level ``config`` bound to the same dict that is
    passed here, otherwise the models are built from a different configuration.
    """
    X_train, y_train, X_val, y_val, X_test = get_fixed_data(target,config)
    X_all = np.concatenate([X_train, X_val], axis=0)
    y_all = np.concatenate([y_train, y_val], axis=0)

    base_model_type = config['base_model']
    meta_model_type = "gpr" if base_model_type == "lgb" else "lgb"
    passthrough = config.get('passthrough',True)

    def _meta_features(X, base_preds):
        # Meta input: raw features plus the base prediction, or the base
        # prediction alone as a single column.
        if passthrough:
            return np.column_stack([X, base_preds])
        return base_preds.reshape(-1, 1)

    # === Base Model
    if base_model_type == 'lgb':
        # LightGBM keeps val aside as its early-stopping set.
        base_model = train_lgb_model(X_train, y_train, X_val, y_val)
        try:
            print(f"[Base LGBM] used {base_model.best_iteration_} estimators")
        except AttributeError:
            print("[Base LGBM] did not use early stopping or no best_iteration_ found")
    else:
        # GPR needs no early-stopping set, so it is fitted on train + val.
        base_model = train_gpr_model(X_all, y_all)

    base_preds_train = base_model.predict(X_train)
    base_preds_val = base_model.predict(X_val)
    base_preds_test = base_model.predict(X_test)

    # === Meta Input
    X_meta_train = _meta_features(X_train, base_preds_train)
    X_meta_val = _meta_features(X_val, base_preds_val)
    X_meta_test = _meta_features(X_test, base_preds_test)

    # === Meta Model
    if meta_model_type == "gpr":
        # The stacked train+val matrix is only needed by the GPR meta model.
        X_meta_all = np.vstack([X_meta_train, X_meta_val])
        meta_model = train_gpr_model(X_meta_all, y_all)
    else:
        meta_model = train_lgb_model(X_meta_train, y_train, X_meta_val, y_val)
    meta_preds = meta_model.predict(X_meta_test)

    # === Weighted Ensemble
    w_base = config['w_base']
    w_meta = config['w_meta']
    total = w_base + w_meta + 1e-8  # epsilon guards against 0 + 0
    w_base /= total
    w_meta /= total

    final_preds = w_base * base_preds_test + w_meta * meta_preds

    return final_preds

In [31]:
# Accumulates the test-set predictions, keyed by target name.
res = dict()

In [32]:
# For every target: load its stored best-parameter JSON, train the stacked
# model and keep the resulting test predictions in `res`.
for target in TARGETS:
    with open(os.path.join(model_dir, f"best_params_{target}_gprexperiment.json"), "r") as f:
        # The name `config` is load-bearing: train_lgb_model and
        # train_gpr_model look up a module-level variable called `config`,
        # so do not rename this loop variable.
        config = json.load(f)
    res[target] = train_stacked_model(target,config)

[Base LGBM] used 1417 estimators
[Base LGBM] used 1223 estimators
[Base LGBM] used 572 estimators


In [33]:
# Show the predictions as a table, one column per target.
pd.DataFrame(data=res)

Unnamed: 0,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.154898,0.251252,0.721554,0.562056,0.340857,0.713312,0.686304,0.371897,-0.311870,0.326182
1,-0.810104,-0.588758,-1.149076,0.056294,-0.730921,-0.103821,-0.968339,-0.961511,-1.061895,0.007668
2,1.769542,1.139059,1.120918,1.047087,2.485193,1.861209,1.006254,2.002036,0.349134,2.223461
3,-0.453284,0.295343,0.924950,-0.684258,1.912963,-0.438880,0.917463,1.606631,0.802194,-0.936379
4,0.155276,-1.138684,1.149454,0.451078,2.135648,0.237306,0.982660,0.205062,-0.055396,1.075128
...,...,...,...,...,...,...,...,...,...,...
495,0.169084,-0.880250,1.106266,-0.274813,-0.269777,-0.742171,1.072742,-0.497242,-1.359244,-0.446446
496,-2.175267,-1.310912,-1.126352,-2.300229,-0.626259,-2.460529,-1.094580,-1.847499,-1.442878,-1.296665
497,1.977994,2.199151,0.301280,1.133344,0.001200,0.660378,0.276348,0.923305,0.264307,0.450129
498,-0.137799,0.822333,1.602543,-1.385970,-0.906719,0.177172,1.907439,0.561952,0.218511,1.277569


In [34]:
# Reload the raw test set; its row index is used to number the submission rows.
test_df = pd.read_csv(filepath_or_buffer="/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")

In [35]:
# Build the submission table: predictions per target, preceded by a 1-based
# ID column derived from the test set's row index.
df = pd.DataFrame(data=res)
df.insert(loc=0, column='ID', value=test_df.index + 1)

In [36]:
# Write the submission file without the DataFrame index column.
df.to_csv(path_or_buf="LGBM+GPR.csv", index=False)