In [103]:
import pandas as pd
import numpy as np
import os
import joblib
import json
import optuna
from tqdm import tqdm

from sklearn.metrics import mean_absolute_percentage_error
from autogluon.tabular import TabularPredictor
import xgboost as xgb
import lightgbm as lgb
from tabpfn_extensions.hpo import TunedTabPFNRegressor

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    Matern,
    RationalQuadratic,
    ExpSineSquared,
    DotProduct,
    WhiteKernel,
    ConstantKernel
)

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV

In [2]:
def compute_volume_weighted_component_features(X):
    """
    Computes individual volume-weighted features WjPk = Componentj_fraction * Componentj_Propertyk
    for j in 1..5 and k in 1..10 (total 50 features).
    """
    features = {}
    for comp_idx in range(1, 6):  # Components 1–5
        for prop_idx in range(1, 11):  # Properties 1–10
            vol_col = f'Component{comp_idx}_fraction'
            prop_col = f'Component{comp_idx}_Property{prop_idx}'
            feat_name = f'W{comp_idx}P{prop_idx}'
            features[feat_name] = X[vol_col] * X[prop_col]
    return pd.DataFrame(features)

In [3]:
def get_GPR_data(target):
    # Load train and val sets
    X_train = pd.read_csv(f"{BASE_PATH}/train/{target}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{target}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{target}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{target}_y.csv")

    # Feature engineering
    X_train = pd.concat([X_train, compute_volume_weighted_component_features(X_train)], axis=1)
    X_val = pd.concat([X_val, compute_volume_weighted_component_features(X_val)], axis=1)

    # Feature selection
    df = pd.read_csv(os.path.join(fi_path, f"{target}.csv"))
    cols = df[df["importance"] > 0.1].iloc[:, 0].tolist()

    X_train = X_train[cols]
    X_val = X_val[cols]

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    return X_train_scaled, y_train.values.ravel(), X_val_scaled, y_val.values.ravel()

In [4]:
def get_data(target):
    X_train = pd.read_csv(f"{BASE_PATH}/train/{target}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{target}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{target}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{target}_y.csv")

    X_train = pd.concat([X_train, compute_volume_weighted_component_features(X_train)], axis=1)
    X_val = pd.concat([X_val, compute_volume_weighted_component_features(X_val)], axis=1)

    return X_train, y_train.values.ravel(), X_val, y_val.values.ravel()

# MLP

In [5]:
MLP = {'val_loss':[], 'models':[]}

In [6]:
targets = [f"BlendProperty{i}" for i in range(1, 11)]
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/NN_models"

In [7]:
data = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/train.csv")
X_test = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")
X_train = data.iloc[:,:55]
y = data.iloc[:,55:]

In [8]:
scaler = StandardScaler()

In [10]:
val_loss = []
predictors = []

In [11]:
for t in targets:
    
    # Load data
    X_train = pd.read_csv(f"{BASE_PATH}/train/{t}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{t}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{t}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{t}_y.csv")
    # print("Data loaded.")

    # Feature engineering
    scaler = StandardScaler()
    blend_features_train = compute_volume_weighted_component_features(X_train)
    blend_features_val = compute_volume_weighted_component_features(X_val)

    X_train = pd.concat([X_train, blend_features_train], axis=1)
    X_val = pd.concat([X_val, blend_features_val], axis=1)
    # print("Volume-weighted features added.")

    # Scaling
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_val = pd.DataFrame(
        scaler.transform(X_val),
        columns=X_val.columns,
        index=X_val.index
    )
    # print("Data scaled.")

    # Load model and evaluate
    model_dir = f"/pscratch/sd/r/ritesh11/temp_dir/NN_models/{t}"
    try:
        predictor = TabularPredictor.load(model_dir)
        # print("Model loaded.")
        
        y_pred = predictor.predict(X_val)
        loss = mean_absolute_percentage_error(y_val.values.flatten(), y_pred.values)
        val_loss.append(loss)
        # print(f"MAPE for {t}: {loss:.4f}")
        
    except Exception as e:
        print(f"Failed to evaluate model for target {t}: {e}")

In [12]:
MLP['val_loss'] = val_loss

In [13]:
for t in targets:
    model_dir = f"/pscratch/sd/r/ritesh11/temp_dir/NN_models/{t}"
    predictor = TabularPredictor.load(model_dir)
    predictors.append(predictor)

In [14]:
MLP['models'] = predictors

# LR

In [86]:
LASSO = {'data': {}, 'val_loss':[], 'models':[]}
RIDGE = {'data': {}, 'val_loss':[], 'models':[]}
ELASTIC = {'data': {}, 'val_loss':[], 'models':[]}

In [87]:
SAVE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs"

MODEL_NAMES = ["LASSOCV", "RIDGECV", "ELASTICNETCV"]
TARGETS = [f"BlendProperty{i}" for i in range(1, 11)]

In [88]:
val_loss = {model: [] for model in MODEL_NAMES}
models = {model: [] for model in MODEL_NAMES}

In [89]:
for t in targets:
    # Load data
    X_train = pd.read_csv(f"{BASE_PATH}/train/{t}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{t}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{t}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{t}_y.csv").values.ravel()
    # print("Data loaded.")

    for name in MODEL_NAMES:
        model_path = os.path.join(SAVE_PATH, f"{t}_{name}.joblib")
        model = joblib.load(model_path)
        preds = model.predict(X_val)
        loss = mean_absolute_percentage_error(y_val, preds)
        val_loss[name].append(loss)
        models[name].append(model)

In [90]:
LASSO['val_loss'] = val_loss['LASSOCV']
ELASTIC['val_loss'] = val_loss['ELASTICNETCV']
RIDGE['val_loss'] = val_loss['RIDGECV']

In [91]:
LASSO['models'] = models['LASSOCV']
ELASTIC['models'] = models['ELASTICNETCV']
RIDGE['models'] = models['RIDGECV']

# XGBoost

In [67]:
TARGETS = [f"BlendProperty{i}" for i in range(1, 11)]
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/XGB_models"

In [68]:
XGB = {'val_loss':[], 'models':[]}

In [69]:
val_loss = []
models = []

In [70]:
for t in tqdm(TARGETS):
    X_train, y_train, X_val, y_val = get_data(t)
    with open(os.path.join(model_dir, f"best_params_{t}.json"), "r") as f:
        params = json.load(f) 
    model = xgb.XGBRegressor(**params,device='cuda',tree_method="hist")
    model.fit(
        X_train, y_train,
        verbose=False
    )
    preds = model.predict(X_val)
    loss = mean_absolute_percentage_error(y_val, preds)
    val_loss.append(loss)
    models.append(model)

100%|██████████| 10/10 [01:23<00:00,  8.39s/it]


In [71]:
XGB['val_loss'] = val_loss
XGB['models'] = models

# LGBM

In [72]:
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/LGBM_models"

In [73]:
LGBM = {'val_loss':[], 'models':[]}

In [74]:
val_loss = []
models = []

In [75]:
for t in tqdm(TARGETS):
    X_train, y_train, X_val, y_val = get_data(t)
    with open(os.path.join(model_dir, f"best_params_{t}.json"), "r") as f:
        params = json.load(f) 
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
    )
    preds = model.predict(X_val)
    loss = mean_absolute_percentage_error(y_val, preds)
    val_loss.append(loss)
    models.append(model)

100%|██████████| 10/10 [00:30<00:00,  3.06s/it]


In [76]:
LGBM['val_loss'] = val_loss
LGBM['models'] = models

# TabPFN

In [30]:
TABPFN = {'val_loss':[], 'models':[]}

In [31]:
def obj_fn(model, X_val, y_val):
    preds = model.predict(X_val)
    return mean_absolute_percentage_error(y_val, preds)

In [125]:
model = joblib.load(f"/pscratch/sd/r/ritesh11/temp_dir/TabPFN_models/{t}.pkl")

In [32]:
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/TabPFN_models"
val_loss = []
models = []

In [33]:
for t in tqdm(TARGETS):
    X_train, y_train, X_val, y_val = get_data(t)

    model = joblib.load(f"{model_dir}/{t}.pkl")
    preds = model.predict(X_val)
    loss = mean_absolute_percentage_error(y_val, preds)
    val_loss.append(loss)
    models.append(model)

100%|██████████| 10/10 [00:18<00:00,  1.81s/it]


In [34]:
TABPFN['models'] = models
TABPFN['val_loss'] = val_loss

# GPR

In [None]:
GPR = {'val_loss':[], 'models':[]}

In [38]:
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/GPR_models"
fi_path = "/pscratch/sd/r/ritesh11/temp_dir/feature_importance"

In [39]:
models = []
val_loss = []

In [43]:
for target in TARGETS:
    print(f"\nProcessing target: {target}")

    X_train, y_train, X_val, y_val = get_GPR_data(target)

    # Load saved best hyperparameters
    with open(os.path.join(model_dir, f"best_params_{target}.json"), "r") as f:
        params = json.load(f)

    # Convert strings/bools properly
    kernel_choice = params["kernel"]
    const_scale = float(params["const_scale"])
    const_bias = float(params["const_bias"])

    # Build the base kernel
    if kernel_choice == "RBF":
        base_kernel = RBF(length_scale_bounds=(1e-5, 1e5))

    elif kernel_choice == "Matern":
        nu = float(params["matern_nu"])
        base_kernel = Matern(nu=nu, length_scale_bounds=(1e-5, 1e5))

    elif kernel_choice == "RQ":
        base_kernel = RationalQuadratic(length_scale_bounds=(1e-5, 1e5), alpha_bounds=(1e-5, 1e5))

    elif kernel_choice == "DotProduct":
        base_kernel = DotProduct(sigma_0_bounds=(1e-5, 1e5))

    else:
        raise ValueError(f"Unknown kernel type: {kernel_choice}")

    # Final kernel composition: scale * kernel + bias + noise
    kernel = ConstantKernel(const_scale) * base_kernel
    kernel += ConstantKernel(const_bias)
    kernel += WhiteKernel(noise_level_bounds=(1e-5, 1e5))

    # Final params to GPR
    gpr_params = {
        "kernel": kernel,
        "alpha": float(params["alpha"]),
        "n_restarts_optimizer": int(params["n_restarts_optimizer"]),
        "normalize_y": bool(params["normalize_y"]),
        "random_state": int(params.get("random_state", 42)),
        "optimizer": params.get("optimizer", "fmin_l_bfgs_b")
    }

    # Train model
    model = GaussianProcessRegressor(**gpr_params)
    model.fit(X_train, y_train)

    # Evaluate
    preds = model.predict(X_val)
    score = mean_absolute_percentage_error(y_val, preds)

    print(f"MAPE for {target}: {score:.4f}")
    val_loss.append(score)
    models.append(model)


Processing target: BlendProperty1


ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


MAPE for BlendProperty1: 0.0001

Processing target: BlendProperty2




MAPE for BlendProperty2: 0.0138

Processing target: BlendProperty3
MAPE for BlendProperty3: 0.4910

Processing target: BlendProperty4


ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


MAPE for BlendProperty4: 0.0099

Processing target: BlendProperty5
MAPE for BlendProperty5: 0.2714

Processing target: BlendProperty6




MAPE for BlendProperty6: 0.0002

Processing target: BlendProperty7
MAPE for BlendProperty7: 0.5589

Processing target: BlendProperty8
MAPE for BlendProperty8: 0.1600

Processing target: BlendProperty9
MAPE for BlendProperty9: 0.3166

Processing target: BlendProperty10
MAPE for BlendProperty10: 0.2190




In [46]:
GPR = {}

In [47]:
GPR['models'] = models
GPR['val_loss'] = val_loss

# ENSEMBLE

In [195]:
weights = []

In [93]:
order = {
    0 : MLP,
    1 : LASSO,
    2 : RIDGE,
    3 : ELASTIC,
    4 : XGB,
    5 : LGBM,
    6 : TABPFN,
    7 : GPR
}

In [197]:
for i in range(10):
    weights.append([order[j]['val_loss'][i] for j in range(8)]) 

In [198]:
val_loss_matrix = np.array(weights)  # shape: (10, 8)
inv_losses = 1.0 / val_loss_matrix
normalized_weights = inv_losses / inv_losses.sum(axis=1, keepdims=True)

In [205]:
def objective(trial, X_val, X_val_LR, X_val_mlp, X_val_gpr, y_val, idx, tabpfn_model):
    # Individual model predictions
    y_pred_mlp     = MLP['models'][idx].predict(X_val_mlp).values
    y_pred_lasso   = LASSO['models'][idx].predict(X_val_LR)
    y_pred_ridge   = RIDGE['models'][idx].predict(X_val_LR)
    y_pred_elastic = ELASTIC['models'][idx].predict(X_val_LR)
    y_pred_xgb     = XGB['models'][idx].predict(X_val)
    y_pred_lgbm    = LGBM['models'][idx].predict(X_val)
    y_pred_tabpfn  = tabpfn_model.predict(X_val)
    y_pred_gpr     = GPR['models'][idx].predict(X_val_gpr)

    # Suggest weights in [0, 1]
    weights = [
        trial.suggest_float('w_mlp',     0.0, 1.0),
        trial.suggest_float('w_lasso',   0.0, 0.0),
        trial.suggest_float('w_ridge',   0.0, 0.0),
        trial.suggest_float('w_elastic', 0.0, 0.0),
        trial.suggest_float('w_xgb',     0.0, 0.0),
        trial.suggest_float('w_lgbm',    0.0, 0.0),
        trial.suggest_float('w_tabpfn',  0.0, 1.0),
        trial.suggest_float('w_gpr',     0.0, 1.0)
    ]

    # Normalize weights to sum to 1 (avoid division by zero)
    weight_sum = sum(weights)
    if weight_sum == 0:
        return float("inf")  # Penalize this trial
    normed_weights = [w / weight_sum for w in weights]

    # Apply weighted ensemble
    preds = [y_pred_mlp, y_pred_tabpfn, y_pred_gpr]
    
    weighted_preds = sum(w * p for w, p in zip(normed_weights, preds))

    # Evaluate
    score = mean_absolute_percentage_error(y_val, weighted_preds)
    return score

In [214]:
targets = [f"BlendProperty{i}" for i in range(1, 11)]
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
tabpfn_dir = "/pscratch/sd/r/ritesh11/temp_dir/TabPFN_models"
N_TRIALS = 10

In [215]:
best_params = []

In [216]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [219]:
argmins = np.argmin(weights, axis=1)

In [221]:
weights[9]

[0.048122061044873134,
 0.09695343906489767,
 0.18289210976185172,
 0.09670067622134722,
 0.6783644979965386,
 0.3861311492839096,
 0.16757454430321522,
 0.21895494520979195]

In [220]:
argmins

array([7, 7, 6, 7, 6, 7, 6, 6, 6, 0])

In [217]:
for i, t in enumerate(targets):
    
    # Load data
    X_train = pd.read_csv(f"{BASE_PATH}/train/{t}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{t}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{t}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{t}_y.csv")
    # print("Data loaded.")
    X_val_LR = X_val
    # Feature engineering
    scaler = StandardScaler()
    blend_features_train = compute_volume_weighted_component_features(X_train)
    blend_features_val = compute_volume_weighted_component_features(X_val)

    X_train = pd.concat([X_train, blend_features_train], axis=1)
    X_val = pd.concat([X_val, blend_features_val], axis=1)
    
    # print("Volume-weighted features added.")

    # Scaling
    X_train_MLP = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_val_MLP = pd.DataFrame(
        scaler.transform(X_val),
        columns=X_val.columns,
        index=X_val.index
    )

    _, _, X_val_gpr, _ = get_GPR_data(t)
    
    # Prepare initial weights to enqueue (assumes normalized_weights[i] exists)
    weights_to_enqueue = normalized_weights[i]
    enqueue_params = {
        'w_mlp':     weights_to_enqueue[0],
        'w_lasso':   weights_to_enqueue[1],
        'w_ridge':   weights_to_enqueue[2],
        'w_elastic': weights_to_enqueue[3],
        'w_xgb':     weights_to_enqueue[4],
        'w_lgbm':    weights_to_enqueue[5],
        'w_tabpfn':  weights_to_enqueue[1],
        'w_gpr':     weights_to_enqueue[2],
    }

    study = optuna.create_study(direction="minimize")
    study.enqueue_trial(enqueue_params)
    # print(y_val)
    tabpfn_model = joblib.load(f"{tabpfn_dir}/{t}.pkl")
    study.optimize(
        lambda trial: objective(trial, X_val ,X_val_LR,X_val_MLP, X_val_gpr, y_val.values.flatten(),i, tabpfn_model),
        n_trials=N_TRIALS,
        n_jobs=1,
        show_progress_bar=True
    )
    best_params.append(study.best_params)

  0%|          | 0/10 [00:00<?, ?it/s]



[W 2025-07-11 15:24:53,730] Trial 4 failed with parameters: {} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/global/homes/r/ritesh11/.conda/envs/myenv_3.12/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_730066/802730959.py", line 52, in <lambda>
    lambda trial: objective(trial, X_val ,X_val_LR,X_val_MLP, X_val_gpr, y_val.values.flatten(),i, tabpfn_model),
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_730066/1082557424.py", line 9, in objective
    y_pred_tabpfn  = tabpfn_model.predict(X_val)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/global/homes/r/ritesh11/.conda/envs/myenv_3.12/lib/python3.12/site-packages/tabpfn_extensions/hpo/tuned_tabpfn.py", line 540, in predict
    return self.best_model_.predict(X)
 

KeyboardInterrupt: 