In [1]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning) 
import os
import joblib
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import joblib
import json

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer
from numpy.linalg import LinAlgError
from scipy.stats import skew, kurtosis

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    Matern,
    RationalQuadratic,
    ExpSineSquared,
    DotProduct,
    WhiteKernel,
    ConstantKernel
)


In [2]:
def compute_volume_weighted_component_features(X):
    """
    Build the 50 volume-weighted interaction features of a blend table.

    For every component j in 1..5 and property k in 1..10 the column
    ``WjPk`` is the element-wise product of ``Component{j}_fraction`` and
    ``Component{j}_Property{k}`` taken from ``X``.

    Returns a new DataFrame holding only the ``WjPk`` columns, ordered by
    component first and property second.
    """
    weighted = {
        f'W{j}P{k}': X[f'Component{j}_fraction'] * X[f'Component{j}_Property{k}']
        for j in range(1, 6)       # components 1-5
        for k in range(1, 11)      # properties 1-10
    }
    return pd.DataFrame(weighted)

In [3]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from scipy.stats import skew
import numpy as np
import os
import pandas as pd

def get_data(target, data_dir="/pscratch/sd/r/ritesh11/temp_dir/dataset",
             importance_dir=None, importance_threshold=0.1):
    """
    Build the model-ready train/test features and training target for one blend property.

    Pipeline:
      1. Read ``train.csv`` and ``test.csv`` from ``data_dir``.
      2. Append the ``WjPk`` features from
         ``compute_volume_weighted_component_features`` to both frames.
      3. Keep only the features whose ``importance`` value in
         ``<importance_dir>/<target>.csv`` is strictly greater than
         ``importance_threshold`` (feature names are read from the first
         column of that file).
      4. Standardise every selected column whose name does not contain
         "fraction"; the scaler is fit on the training frame only and then
         applied to the test frame. Fraction columns are left untouched.

    Parameters
    ----------
    target : str
        Target column name in ``train.csv``; also the stem of the
        feature-importance CSV.
    data_dir : str, optional
        Directory containing ``train.csv`` and ``test.csv``.
    importance_dir : str or None, optional
        Directory containing the feature-importance CSVs. When ``None`` the
        module-level ``fi_path`` is used (looked up at call time).
    importance_threshold : float, optional
        Exclusive lower bound on ``importance`` for a feature to be kept.

    Returns
    -------
    X_train_scaled : pandas.DataFrame
    y_vals : numpy.ndarray
        Flattened (1-D) training target values.
    X_test_scaled : pandas.DataFrame

    Raises
    ------
    ValueError
        If no feature passes ``importance_threshold``.
    """
    if importance_dir is None:
        # Resolved lazily so the notebook-level configuration cell is honoured.
        importance_dir = fi_path

    # Load the train and test sets.
    train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
    X_train = train_df.iloc[:, :55]  # first 55 columns are used as the raw features
    y_train = train_df[[target]]
    # The first test column is dropped (presumably an ID column - confirm against the file).
    X_test = pd.read_csv(os.path.join(data_dir, "test.csv")).iloc[:, 1:]

    # Feature engineering
    X_train = pd.concat([X_train, compute_volume_weighted_component_features(X_train)], axis=1)
    X_test = pd.concat([X_test, compute_volume_weighted_component_features(X_test)], axis=1)

    # Feature selection
    importance_df = pd.read_csv(os.path.join(importance_dir, f"{target}.csv"))
    keep_mask = importance_df["importance"] > importance_threshold
    cols = importance_df[keep_mask].iloc[:, 0].tolist()
    print(cols)
    if not cols:
        # Fail with an explicit message instead of an opaque error from the scaler.
        raise ValueError(
            f"No feature for target {target!r} has importance > {importance_threshold}"
        )
    X_train = X_train[cols]
    X_test = X_test[cols]

    # Separate out fraction-based columns; they are not standardised.
    fraction_cols = {col for col in X_train.columns if "fraction" in col.lower()}
    non_fraction_cols = [col for col in X_train.columns if col not in fraction_cols]

    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # StandardScaler rejects a frame with zero columns, so only scale when there
    # is at least one non-fraction feature.
    if non_fraction_cols:
        x_scaler = StandardScaler()
        X_train_scaled[non_fraction_cols] = x_scaler.fit_transform(X_train[non_fraction_cols])
        X_test_scaled[non_fraction_cols] = x_scaler.transform(X_test[non_fraction_cols])

    y_vals = y_train.values.ravel()

    return X_train_scaled, y_vals, X_test_scaled


In [4]:
# NOTE(review): BASE_PATH is not referenced by any other cell visible in this
# notebook (get_data() does not use it) - confirm whether it is still needed.
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset/updated"

In [5]:
# Only BlendProperty1 and BlendProperty6 (positions 0 and 5 of the ten targets) are processed.
TARGETS = [f"BlendProperty{pos + 1}" for pos in (0, 5)]
# Directory holding the saved best-parameter JSON files and receiving the fitted model pickles.
model_dir = "/pscratch/sd/r/ritesh11/temp_dir/GPR_models"
# Directory holding one feature-importance CSV per target.
fi_path = "/pscratch/sd/r/ritesh11/temp_dir/feature_importance"

In [6]:
# `res` maps each target name to its test-set predictions.
# NOTE(review): `models` is never appended to in the cells visible here.
models, res = [], {}

In [8]:
def _as_bool(value):
    """Interpret a JSON-loaded flag as a bool.

    ``bool("False")`` is ``True`` in Python, so flags that were serialised as
    strings need explicit handling; real booleans and numbers pass through
    ``bool()`` unchanged.
    """
    if isinstance(value, str):
        return value.strip().lower() not in ("false", "0", "no", "off", "")
    return bool(value)


def _build_kernel(params):
    """Instantiate the base GPR kernel named by ``params["kernel"]``.

    Raises ``ValueError`` for a kernel name other than RBF, Matern, RQ or
    DotProduct. ``params["matern_nu"]`` is read only for the Matern kernel.
    """
    bounds = (1e-5, 1e5)  # shared hyperparameter search bounds for every kernel
    kernel_choice = params["kernel"]

    if kernel_choice == "RBF":
        return RBF(length_scale_bounds=bounds)
    if kernel_choice == "Matern":
        return Matern(nu=float(params["matern_nu"]), length_scale_bounds=bounds)
    if kernel_choice == "RQ":
        return RationalQuadratic(length_scale_bounds=bounds, alpha_bounds=bounds)
    if kernel_choice == "DotProduct":
        return DotProduct(sigma_0_bounds=bounds)
    raise ValueError(f"Unknown kernel type: {kernel_choice}")


# For each target: rebuild the GPR from its saved best hyperparameters, fit it on
# the training data, persist the fitted model and store its test predictions in `res`.
for target in TARGETS:
    print(f"\nProcessing target: {target}")

    X_train, y_train, X_test = get_data(target)

    # Load saved best hyperparameters
    with open(os.path.join(model_dir, f"best_params_{target}_updated_plain.json"), "r") as f:
        params = json.load(f)

    # Final params to GPR (values are coerced because they come from JSON)
    gpr_params = {
        "kernel": _build_kernel(params),
        "alpha": float(params["alpha"]),
        "n_restarts_optimizer": int(params["n_restarts_optimizer"]),
        "normalize_y": _as_bool(params["normalize_y"]),
        "random_state": int(params.get("random_state", 42)),
        "optimizer": params.get("optimizer", "fmin_l_bfgs_b"),
    }

    # Train model
    model = GaussianProcessRegressor(**gpr_params)
    model.fit(X_train, y_train)

    model_path = os.path.join(model_dir, f"model_{target}_full.pkl")
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

    # Predict on the test features
    preds = model.predict(X_test)
    res[target] = preds

    # Drop the reference to the fitted model before the next iteration.
    del model


Processing target: BlendProperty1
['Component5_fraction', 'Component4_fraction', 'Component2_fraction', 'Component1_fraction', 'W5P1', 'W1P1', 'W4P1', 'W3P1', 'W2P1', 'Component5_Property1', 'Component4_Property1', 'W1P9', 'Component2_Property1']
Model saved to /pscratch/sd/r/ritesh11/temp_dir/GPR_models/model_BlendProperty1_full.pkl

Processing target: BlendProperty6
['Component5_fraction', 'Component2_fraction', 'W4P6', 'W3P6', 'W1P6', 'W5P6', 'Component4_fraction', 'W2P6', 'Component1_fraction', 'Component3_fraction', 'Component4_Property6']
Model saved to /pscratch/sd/r/ritesh11/temp_dir/GPR_models/model_BlendProperty6_full.pkl


In [9]:
# Collect the per-target test predictions into one frame (one column per key of `res`).
df = pd.DataFrame(res)

In [10]:
# Display the prediction frame built from `res`.
df

Unnamed: 0,BlendProperty1,BlendProperty6
0,0.154935,0.713273
1,-0.810190,-0.103823
2,1.769240,1.860298
3,-0.453352,-0.431306
4,0.155412,0.236669
...,...,...
495,0.169250,-0.742228
496,-2.175778,-2.457496
497,1.978037,0.656933
498,-0.137998,0.177199


In [11]:
# Load an existing submission file (presumably TabM model predictions, going by the
# file name - confirm); it is the base frame that receives the GPR columns.
ens_df = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/TabM_submission.csv")

In [12]:
# DataFrame.update aligns on index and column labels and silently skips anything
# that does not line up, so verify both before overwriting `ens_df` in place.
_unknown_cols = [col for col in df.columns if col not in ens_df.columns]
if _unknown_cols:
    raise KeyError(f"Prediction columns missing from ens_df: {_unknown_cols}")
if not df.index.equals(ens_df.index):
    raise ValueError(
        f"Index mismatch between predictions ({len(df)} rows) and ens_df ({len(ens_df)} rows)"
    )
ens_df.update(df)

In [13]:
# Write the merged frame without the index column; the relative path resolves
# against the kernel's current working directory.
ens_df.to_csv("GPR+TabM_submission.csv",index=False)

In [31]:
# NOTE(review): the saved output of this cell (In [31]) lists BlendProperty1-10,
# whereas the `df` built in the visible cells only has the targets in TARGETS.
# This looks like stale output from an out-of-order run - re-run top to bottom to confirm.
df

Unnamed: 0,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.154975,0.259825,0.731118,0.601156,0.324673,0.713313,0.731466,0.349134,-0.295319,0.316149
1,-0.810157,-0.589556,-1.200426,0.057067,-0.727296,-0.103822,-1.245061,-1.004629,-0.782186,0.021541
2,1.769253,1.144621,1.294916,1.058812,1.040548,1.861213,1.178889,2.074316,0.653489,2.282356
3,-0.453314,0.296121,0.883504,-0.696685,1.650993,-0.438880,1.056158,1.984992,0.802829,-0.947537
4,0.155420,-1.162808,1.180197,0.447424,2.099641,0.237305,1.091717,0.028042,-0.303397,1.035714
...,...,...,...,...,...,...,...,...,...,...
495,0.169248,-0.883828,1.191562,-0.273603,0.491402,-0.742171,1.106440,-0.565681,-1.190804,-0.476616
496,-2.175765,-1.318512,-1.089808,-2.367691,-0.647675,-2.460536,-1.107157,-1.977467,-1.102134,-1.353626
497,1.978063,2.193041,0.285681,1.162550,1.530629,0.660380,0.294618,0.955072,0.265382,0.434947
498,-0.137981,0.822464,1.648600,-1.438191,0.700771,0.177174,1.975969,0.465482,0.244681,1.218859


In [15]:
# NOTE(review): the saved output of this cell (In [15]) contains an `ID` column that
# `pd.DataFrame(res)` cannot produce, and its execution count precedes the cell above.
# Presumably `df` was bound to a different frame when this ran - confirm on a fresh kernel.
df

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.154909,0.259559,0.728013,0.574044,0.311982,0.713313,0.707702,0.355958,-0.292612,0.325873
1,2,-0.810234,-0.588280,-1.227666,0.059979,-0.725513,-0.103821,-1.250158,-1.020324,-0.775185,0.020747
2,3,1.769219,1.143371,1.278172,0.985408,1.064214,1.861213,1.185071,2.073053,0.604669,2.276667
3,4,-0.453348,0.295859,0.890734,-0.685241,1.603738,-0.438882,0.965011,1.830062,0.853499,-0.944300
4,5,0.155379,-1.163631,1.170446,0.489454,2.224776,0.237307,1.167475,0.078241,-0.209719,1.041330
...,...,...,...,...,...,...,...,...,...,...,...
495,496,0.169277,-0.884299,1.205476,-0.279737,0.822539,-0.742171,1.139463,-0.536783,-1.179960,-0.454042
496,497,-2.175777,-1.318532,-1.096115,-2.246309,-0.645123,-2.460537,-1.097126,-1.923694,-1.159512,-1.345306
497,498,1.978051,2.191579,0.282636,1.165851,1.704645,0.660380,0.262349,0.920835,0.256065,0.456061
498,499,-0.138103,0.819953,1.647070,-1.400321,0.701627,0.177175,1.945475,0.447202,0.262802,1.233936
