In [1]:
import os
import joblib
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
from pathlib import Path
import joblib
import json

from scipy.stats import gaussian_kde
from optuna.samplers import TPESampler

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    GridSearchCV
)
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error
)

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    Matern,
    RationalQuadratic,
    ConstantKernel as C
)


# # Load data
# X = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/train/BlendProperty6_X.csv")
# y = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/train/BlendProperty6_y.csv")
# X_val = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/val/BlendProperty6_X.csv")
# y_val = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/val/BlendProperty6_y.csv")
# test_df = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")

In [2]:
def get_model_scores(model, X, y, model_name):
    """
    Fit ``model`` on (X, y) and score it on that same data.

    Note that the reported errors are in-sample (training-set) errors:
    the model predicts the very rows it was fitted on, no cross-validation
    is performed here.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X, y : training features and targets
    model_name : label stored under the ``'name'`` key

    Returns
    -------
    dict with keys ``'name'``, ``'model'`` (the fitted estimator),
    ``'MAPE'``, ``'MAE'`` and ``'RMSE'``.
    """
    model.fit(X, y)
    in_sample_pred = model.predict(X)

    scores = {'name': model_name, 'model': model}
    scores['MAPE'] = mean_absolute_percentage_error(y, in_sample_pred)
    scores['MAE'] = mean_absolute_error(y, in_sample_pred)
    # RMSE is the square root of the mean squared error.
    scores['RMSE'] = np.sqrt(mean_squared_error(y, in_sample_pred))
    return scores

In [3]:
def evaluate_model(model, X_val, y_val):
    """
    Score an already-fitted model on a validation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_val, y_val : validation features and true targets

    Returns
    -------
    dict with keys ``"MAPE"``, ``"MAE"`` and ``"RMSE"``.
    """
    predicted = model.predict(X_val)

    mape = mean_absolute_percentage_error(y_val, predicted)
    mae = mean_absolute_error(y_val, predicted)
    rmse = np.sqrt(mean_squared_error(y_val, predicted))

    return {"MAPE": mape, "MAE": mae, "RMSE": rmse}

In [4]:
# ========== Logging Setup ==========
# INFO-and-above records go to a stream handler, prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

# ========== Config ==========
# degree  -> passed to PolynomialFeatures in the model pipelines
# cv      -> number of folds handed to the *CV estimators
# targets -> BlendProperty1 .. BlendProperty10
degree, cv = 2, 5
targets = [f"BlendProperty{i}" for i in range(1, 11)]

In [5]:
# Input dataset root and output directory for the fitted linear models.
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
SAVE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs"

# Create the output directory (and any missing parents); no error if it already exists.
os.makedirs(SAVE_PATH, exist_ok=True)

In [6]:
# ========== Main Loop ==========
# For every target: fit three CV-tuned linear pipelines on the training split,
# persist each fitted pipeline together with the hyperparameters its internal
# cross-validation selected, and record hold-out metrics on the validation split.
for t in targets:
    logging.info(f"🔧 Processing target: {t}")

    # ========== Load Data ==========
    # Targets are read as single-column CSVs and flattened to 1-D arrays.
    X = pd.read_csv(f"{BASE_PATH}/train/{t}_X.csv")
    y = pd.read_csv(f"{BASE_PATH}/train/{t}_y.csv").values.ravel()
    X_val = pd.read_csv(f"{BASE_PATH}/val/{t}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{t}_y.csv").values.ravel()

    # ========== Build Models ==========
    # make_pipeline names each step after its lower-cased class name, which is
    # why the dict keys below double as step names via ``name.lower()``.
    models = {
        "LASSOCV": make_pipeline(
            StandardScaler(),
            PolynomialFeatures(degree),
            LassoCV(cv=cv, n_jobs=-1)
        ),
        "RIDGECV": make_pipeline(
            StandardScaler(),
            PolynomialFeatures(degree),
            RidgeCV(alphas=np.logspace(-6, 6, 13), cv=cv)
        ),
        "ELASTICNETCV": make_pipeline(
            StandardScaler(),
            PolynomialFeatures(degree),
            ElasticNetCV(cv=cv, l1_ratio=[0.1, 0.5, 0.9], n_jobs=-1)
        )
    }

    results = {}

    for name, model in models.items():
        logging.info(f"🧠 Training model: {name} for {t}")
        model.fit(X, y)

        # Save model
        model_path = f"{SAVE_PATH}/{t}_{name}.joblib"
        joblib.dump(model, model_path)
        logging.info(f"💾 Saved model to {model_path}")

        # Save optimal hyperparameters
        regressor = model.named_steps[name.lower()]
        # The ElasticNet case must be tested first: every *CV estimator here
        # exposes ``alpha_``, so checking ``hasattr(..., 'alpha_')`` first would
        # swallow ElasticNetCV and its selected ``l1_ratio_`` would never be saved.
        if name == "ELASTICNETCV":
            alpha = {
                "alpha": float(regressor.alpha_),
                "l1_ratio": float(regressor.l1_ratio_)
            }
        elif hasattr(regressor, 'alpha_'):
            # Cast to a plain float so json.dump never sees a numpy scalar type.
            alpha = float(regressor.alpha_)
        else:
            alpha = "N/A"

        param_path = f"{SAVE_PATH}/{t}_{name}_params.json"
        with open(param_path, "w") as f:
            json.dump({"alpha": alpha}, f, indent=2)
        logging.info(f"📄 Saved params to {param_path}")

        # Evaluate on holdout
        metrics = evaluate_model(model, X_val, y_val)
        results[name] = metrics
        logging.info(f"✅ {name} hold-out metrics: {metrics}")

    # Save results (one row per model, one column per metric)
    results_df = pd.DataFrame(results).T
    results_df.to_csv(f"{SAVE_PATH}/{t}_metrics.csv")
    logging.info(f"📊 Metrics saved to {SAVE_PATH}/{t}_metrics.csv\n")

2025-07-08 09:15:47,515 [INFO] 🔧 Processing target: BlendProperty1
2025-07-08 09:15:47,560 [INFO] 🧠 Training model: LASSOCV for BlendProperty1
2025-07-08 09:16:00,813 [INFO] 💾 Saved model to /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/BlendProperty1_LASSOCV.joblib
2025-07-08 09:16:00,825 [INFO] 📄 Saved params to /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/BlendProperty1_LASSOCV_params.json
2025-07-08 09:16:00,833 [INFO] ✅ LASSOCV hold-out metrics: {'MAPE': 0.03038513487929343, 'MAE': 0.002435146165653691, 'RMSE': np.float64(0.002990723298540005)}
2025-07-08 09:16:00,834 [INFO] 🧠 Training model: RIDGECV for BlendProperty1
2025-07-08 09:16:23,414 [INFO] 💾 Saved model to /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/BlendProperty1_RIDGECV.joblib
2025-07-08 09:16:23,419 [INFO] 📄 Saved params to /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/BlendProperty1_RIDGECV_params.json
2025-07-08 09:16:23,424 [INFO] ✅ RIDGECV hold-out metrics: {'MAPE': 0.0011051240392759604, 'MAE':

In [3]:
# === Config ===
# SAVE_PATH holds the fitted per-target pipelines; ensemble weights are written
# to an "optuna_ensemble" sub-directory of it.
SAVE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs"
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
ENSEMBLE_DIR = os.path.join(SAVE_PATH, "optuna_ensemble")
# Create the weights directory (and parents) if it is not there yet.
os.makedirs(ENSEMBLE_DIR, exist_ok=True)

MODEL_NAMES = ["LASSOCV", "RIDGECV", "ELASTICNETCV"]
TARGETS = [f"BlendProperty{i}" for i in range(1, 11)]

In [7]:
# === Loop over all targets ===
# For each target, search for convex blending weights (w1, w2, w3) over the
# three saved linear pipelines that minimise MAPE on the validation split,
# then write the best weights to ENSEMBLE_DIR as JSON.

# Silence Optuna's per-trial INFO output once, up front (loop-invariant).
optuna.logging.set_verbosity(optuna.logging.WARNING)

for TARGET in TARGETS:
    print(f"\n🔧 Optimizing ensemble weights for: {TARGET}")

    # === Load validation data ===
    X_val = pd.read_csv(f"{BASE_PATH}/val/{TARGET}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{TARGET}_y.csv").values.ravel()

    # === Load predictions from all models ===
    # Predictions are computed once per model and reused by every trial.
    preds = {}
    for name in MODEL_NAMES:
        model_path = os.path.join(SAVE_PATH, f"{TARGET}_{name}.joblib")
        model = joblib.load(model_path)
        preds[name] = model.predict(X_val)

    # === Define objective function ===
    def objective(trial):
        """Return validation MAPE of the (w1, w2, 1 - w1 - w2) weighted blend."""
        w1 = trial.suggest_float("w1", 0.0, 1.0)
        w2 = trial.suggest_float("w2", 0.0, 1.0)
        w3 = 1.0 - w1 - w2
        # Samples with w1 + w2 > 1 would need a negative third weight; discard them.
        if w3 < 0:
            raise optuna.TrialPruned()

        y_pred = (
            w1 * preds["LASSOCV"] +
            w2 * preds["RIDGECV"] +
            w3 * preds["ELASTICNETCV"]
        )

        return mean_absolute_percentage_error(y_val, y_pred)

    # === Custom callback for logging every 100 trials ===
    def log_callback(study, trial):
        """Print progress every 100th trial; pruned trials carry no value and are skipped."""
        if trial.number % 100 == 0 and trial.value is not None:
            print(f"📈 {TARGET} | Trial {trial.number}: MAPE = {trial.value:.4f}")

    # === Run Optuna study ===
    # A seeded sampler makes the saved weights reproducible from run to run.
    study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=1000, callbacks=[log_callback])

    # === Save best weights ===
    # Only w1 and w2 are sampled; w3 is derived so the three weights sum to 1.
    best_weights = study.best_params
    best_weights["w3"] = 1.0 - best_weights["w1"] - best_weights["w2"]
    weights_path = os.path.join(ENSEMBLE_DIR, f"{TARGET}_weights.json")
    with open(weights_path, "w") as f:
        json.dump(best_weights, f, indent=2)

    print(f"✅ {TARGET} best weights: {best_weights}")
    print(f"📉 Best MAPE: {study.best_value:.4f}")


🔧 Optimizing ensemble weights for: BlendProperty1
📈 BlendProperty1 | Trial 0: MAPE = 0.0216
📈 BlendProperty1 | Trial 200: MAPE = 0.0008
📈 BlendProperty1 | Trial 300: MAPE = 0.0006
📈 BlendProperty1 | Trial 400: MAPE = 0.0010
📈 BlendProperty1 | Trial 600: MAPE = 0.0024
📈 BlendProperty1 | Trial 700: MAPE = 0.0015
✅ BlendProperty1 best weights: {'w1': 0.000557166553044104, 'w2': 0.968124894937447, 'w3': 0.031317938509508925}
📉 Best MAPE: 0.0003

🔧 Optimizing ensemble weights for: BlendProperty2
📈 BlendProperty2 | Trial 0: MAPE = 0.7440
📈 BlendProperty2 | Trial 100: MAPE = 0.7427
📈 BlendProperty2 | Trial 200: MAPE = 0.7426
📈 BlendProperty2 | Trial 300: MAPE = 0.7426
📈 BlendProperty2 | Trial 400: MAPE = 0.7427
📈 BlendProperty2 | Trial 700: MAPE = 0.7428
📈 BlendProperty2 | Trial 800: MAPE = 0.7427
✅ BlendProperty2 best weights: {'w1': 0.7601189916387876, 'w2': 0.23701573838588397, 'w3': 0.0028652699753284327}
📉 Best MAPE: 0.7424

🔧 Optimizing ensemble weights for: BlendProperty3
📈 BlendPrope

In [26]:
# === Config ===
SAVE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models"
BASE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/dataset"
MODEL_NAMES = ["LASSOCV", "RIDGECV", "ELASTICNETCV"]
TARGETS = [f"BlendProperty{i}" for i in range(1, 11)]
degree = 2

# Load the full training table and split it by column position:
# columns 0..54 become the feature frame X, columns 55.. become the target frame tar.
X = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/train.csv")
tar, X = X.iloc[:, 55:], X.iloc[:, :55]

In [27]:
tar

Unnamed: 0,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.489143,0.607589,0.321670,-1.236055,1.601132,1.384662,0.305850,0.193460,0.580374,-0.762738
1,-1.257481,-1.475283,-0.437385,-1.402911,0.147941,-1.143244,-0.439171,-1.379041,-1.280989,-0.503625
2,1.784349,0.450467,0.622687,1.375614,-0.428790,1.161616,0.601289,0.872950,0.660000,2.024576
3,-0.066422,0.483730,-1.865442,-0.046295,-0.163820,-0.209693,-1.840566,0.300293,-0.351336,-1.551914
4,-0.118913,-1.172398,0.301785,-1.787407,-0.493361,-0.528049,0.286344,-0.265192,0.430513,0.735073
...,...,...,...,...,...,...,...,...,...,...
1995,-0.028366,-0.327297,-0.316933,-1.294092,-0.530259,-0.421526,-0.320869,0.709627,-0.737244,-0.744289
1996,-0.449245,0.156778,-0.367445,-0.938615,-0.577451,-0.209996,-0.370505,-0.195531,-0.032834,0.269718
1997,0.029135,0.164890,-0.092942,-1.134490,-0.437479,-0.695636,-0.101073,0.063650,0.624368,-0.477053
1998,-0.232960,-0.464947,0.112536,-0.793522,-0.811272,-1.194914,0.100644,0.760116,-0.751394,-0.857598


In [28]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

In [31]:
# Retrain every tuned model on the full training table.
# The hyperparameters chosen by the *CV estimators are read back from the saved
# pipelines and plugged into the corresponding plain estimator, so no further
# hyperparameter search happens here.

# Make sure the output directory exists before anything is dumped into it.
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

# Directory that holds the previously fitted CV pipelines.
tuned_model_dir = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs"

for target in TARGETS:
    print(f"\n🔄 Retraining on full data for: {target}")

    y = tar[[target]].values.ravel()

    for model_name in MODEL_NAMES:
        model_path = os.path.join(tuned_model_dir, f"{target}_{model_name}.joblib")
        if not os.path.exists(model_path):
            print(f"  ⚠️ Skipping {model_name} — model file not found.")
            continue

        print(f"  🔧 Loading model: {model_name}")

        # === Load trained model pipeline ===
        pipeline = joblib.load(model_path)

        # === Extract fitted regressor and its tuned params ===
        # make_pipeline names steps after the lower-cased estimator class name.
        reg = pipeline.named_steps[model_name.lower()]
        if model_name == "LASSOCV":
            final_model = Lasso(alpha=reg.alpha_, max_iter=10000)
        elif model_name == "RIDGECV":
            final_model = Ridge(alpha=reg.alpha_)
        elif model_name == "ELASTICNETCV":
            final_model = ElasticNet(alpha=reg.alpha_, l1_ratio=reg.l1_ratio_, max_iter=10000)
        else:
            raise ValueError(f"Unknown model name: {model_name}")

        # === Rebuild pipeline and fit on full data ===
        full_model = make_pipeline(
            StandardScaler(),
            PolynomialFeatures(degree=degree),
            final_model
        )
        full_model.fit(X, y)

        # === Save retrained model ===
        full_model_path = os.path.join(SAVE_PATH, f"{target}_{model_name}_FULL.joblib")
        joblib.dump(full_model, full_model_path)
        print(f"  💾 Saved full-data model to: {full_model_path}")


🔄 Retraining on full data for: BlendProperty1
  🔧 Loading model: LASSOCV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models/BlendProperty1_LASSOCV_FULL.joblib
  🔧 Loading model: RIDGECV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models/BlendProperty1_RIDGECV_FULL.joblib
  🔧 Loading model: ELASTICNETCV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models/BlendProperty1_ELASTICNETCV_FULL.joblib

🔄 Retraining on full data for: BlendProperty2
  🔧 Loading model: LASSOCV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models/BlendProperty2_LASSOCV_FULL.joblib
  🔧 Loading model: RIDGECV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models/BlendProperty2_RIDGECV_FULL.joblib
  🔧 Loading model: ELASTICNETCV
  💾 Saved full-data model to: /pscratch/sd/r/ritesh11/temp_dir/LR_model_ou

In [38]:
# Locations of the full-data models and of the per-target ensemble weight files.
ENSEMBLE_DIR = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/optuna_ensemble"
SAVE_PATH = "/pscratch/sd/r/ritesh11/temp_dir/LR_model_outputs/final_LR_models"

# Targets to predict and the models blended for each of them.
TARGETS = [f"BlendProperty{i}" for i in range(1, 11)]
MODEL_NAMES = ["LASSOCV", "RIDGECV", "ELASTICNETCV"]

In [48]:
# Load the test set once. The submission IDs are taken from the file's own
# "ID" column rather than assumed to be 1..500, so they always line up with
# the rows that are actually predicted.
test_df = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/dataset/test.csv")
predictions = {"ID": test_df["ID"].tolist()}
# Feature frame = test table without the ID column (non-mutating drop).
X_test = test_df.drop(columns=["ID"])

In [49]:
# Build the weighted-ensemble prediction for every target and collect the
# results in ``predictions`` (one entry per target).
for target in TARGETS:
    print(f"\n🔍 Building ensemble prediction for: {target}")

    # === Load ensemble weights ===
    weights_path = os.path.join(ENSEMBLE_DIR, f"{target}_weights.json")
    if not os.path.exists(weights_path):
        print(f"⚠️ Ensemble weights not found: {weights_path}")
        continue
    with open(weights_path, "r") as f:
        weights = json.load(f)

    # === Load models and compute weighted predictions ===
    preds = []
    used_weight = 0.0   # sum of the weights of the models that were actually found
    n_missing = 0       # number of model files that could not be found
    # Weight keys are positional: w1 -> MODEL_NAMES[0], w2 -> MODEL_NAMES[1], ...
    for position, model_name in enumerate(MODEL_NAMES, start=1):
        model_file = os.path.join(SAVE_PATH, f"{target}_{model_name}_FULL.joblib")
        if not os.path.exists(model_file):
            print(f"⚠️ Model not found: {model_file}")
            n_missing += 1
            continue

        model = joblib.load(model_file)
        y_pred = model.predict(X_test)
        weight = weights.get(f"w{position}", 0.0)
        preds.append(weight * y_pred)
        used_weight += weight

    # === Combine predictions ===
    if not preds:
        print(f"❌ No predictions generated for {target}")
        continue
    y_ensemble = np.sum(preds, axis=0)

    # If a model was skipped, the remaining weights no longer cover the whole
    # blend, so the plain weighted sum would be scaled down. Renormalise over
    # the models that are present. With all models present nothing changes.
    if n_missing:
        if used_weight > 0:
            y_ensemble = y_ensemble / used_weight
            print(f"⚠️ {target}: {n_missing} model(s) missing, weights renormalised over the remaining {len(preds)}")
        else:
            print(f"⚠️ {target}: {n_missing} model(s) missing and remaining weights sum to {used_weight}; predictions left unscaled")

    # === Save ensemble predictions ===
    predictions[target] = y_ensemble


🔍 Building ensemble prediction for: BlendProperty1

🔍 Building ensemble prediction for: BlendProperty2

🔍 Building ensemble prediction for: BlendProperty3

🔍 Building ensemble prediction for: BlendProperty4

🔍 Building ensemble prediction for: BlendProperty5

🔍 Building ensemble prediction for: BlendProperty6

🔍 Building ensemble prediction for: BlendProperty7

🔍 Building ensemble prediction for: BlendProperty8

🔍 Building ensemble prediction for: BlendProperty9

🔍 Building ensemble prediction for: BlendProperty10


In [52]:
# The prediction loop skips a target when its weights or models are missing;
# surface that here instead of silently writing an incomplete submission.
missing_targets = [name for name in TARGETS if name not in predictions]
if missing_targets:
    print(f"⚠️ Submission is missing predictions for: {missing_targets}")

# One column per entry of ``predictions`` (ID plus each predicted target).
df = pd.DataFrame(predictions)
df.to_csv("LR_submission.csv",index=False)

In [2]:
def compute_volume_weighted_blend_features(X):
    """
    Build one fraction-weighted estimate per property (Property1..Property10).

    For each property p the output column ``BlendEst_Property{p}`` is

        sum over c = 1..5 of Component{c}_fraction * Component{c}_Property{p}

    divided by 100.0.

    Parameters
    ----------
    X : DataFrame containing ``Component{c}_fraction`` and
        ``Component{c}_Property{p}`` columns for c in 1..5 and p in 1..10.

    Returns
    -------
    DataFrame with the ten ``BlendEst_Property{p}`` columns.
    """
    # NOTE(review): the division by 100 presumably treats fractions as
    # percentages; confirm against the dataset's units.
    component_ids = range(1, 6)   # Component1 to Component5
    property_ids = range(1, 11)   # Property1 to Property10

    estimates = {
        f'BlendEst_Property{prop_idx}': sum(
            X[f'Component{comp_idx}_fraction'] * X[f'Component{comp_idx}_Property{prop_idx}']
            for comp_idx in component_ids
        ) / 100.0
        for prop_idx in property_ids
    }
    return pd.DataFrame(estimates)

In [3]:
# Compute and append new blend-level features
blend_features = compute_volume_weighted_blend_features(X)
# Drop any blend columns left over from an earlier run of this cell first, so
# re-executing it replaces them instead of appending duplicate column names.
# On the first run there is nothing to drop and the result is unchanged.
X = pd.concat(
    [X.drop(columns=blend_features.columns, errors="ignore"), blend_features],
    axis=1
)

# Do the same for test data
# The first 55 columns of X are taken as the original feature names and
# selected from the test frame by name.
X_test = test_df[X.columns[:55]]  # assumes same structure
blend_features_test = compute_volume_weighted_blend_features(X_test)
X_test = pd.concat([X_test, blend_features_test], axis=1)

In [4]:
# Correlate every column of X with the first column of y, then order the
# features by absolute correlation, strongest first.
correlations = X.corrwith(y.iloc[:,0]).sort_values(key=abs, ascending=False)

In [5]:
# Names of the first 20 entries of ``correlations``, as a set.
corr_set = set(correlations.index.tolist()[:20])

In [6]:
# Load the feature-importance table stored for BlendProperty1.
# NOTE(review): presumably sorted by importance, most important first — confirm.
fi_df = pd.read_csv("/pscratch/sd/r/ritesh11/temp_dir/feature_importance/BlendProperty1.csv")

In [7]:
# Values of the first column in the first 20 rows of the importance table, as a set.
fi_set = set(fi_df.iloc[:20,0].tolist())

In [8]:
corr_set

{'BlendEst_Property1',
 'BlendEst_Property6',
 'Component1_Property1',
 'Component1_Property10',
 'Component1_Property3',
 'Component1_Property4',
 'Component1_Property8',
 'Component2_Property1',
 'Component2_Property10',
 'Component2_fraction',
 'Component3_Property1',
 'Component3_Property3',
 'Component3_Property9',
 'Component3_fraction',
 'Component4_Property1',
 'Component4_fraction',
 'Component5_Property1',
 'Component5_Property2',
 'Component5_Property5',
 'Component5_fraction'}

In [9]:
# Keep only the features that appear in both shortlists
# (feature-importance based and correlation based).
cols_to_keep = fi_set.intersection(corr_set)

In [10]:
cols_to_keep

{'BlendEst_Property1',
 'Component1_Property1',
 'Component2_Property1',
 'Component2_fraction',
 'Component3_Property1',
 'Component3_Property9',
 'Component3_fraction',
 'Component4_Property1',
 'Component4_fraction',
 'Component5_Property1',
 'Component5_fraction'}

In [11]:
def objective(trial, X, y):
    """
    Optuna objective: 3-fold cross-validated MAPE of a MinMax-scaled
    Gaussian-process regressor with a Constant * RBF kernel.

    Feature-subset selection is currently disabled; the model is always
    fitted on all columns of ``X``.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters
    X, y : features and target passed to ``cross_val_score``

    Returns
    -------
    Mean MAPE over the folds (lower is better).
    """
    # -------- Sample hyperparameters (RBF kernel only) --------
    length_scale = trial.suggest_float("length_scale", 0.1, 10.0, log=True)
    constant = trial.suggest_float("constant", 0.1, 100.0, log=True)
    alpha = trial.suggest_float("alpha", 1e-6, 1e-1, log=True)
    normalize_y = trial.suggest_categorical("normalize_y", [True, False])
    n_restarts_optimizer = trial.suggest_int("n_restarts_optimizer", 0, 3)

    # -------- Kernel: the sampled values are starting points, the bounds
    # limit the GPR's own kernel-hyperparameter optimisation --------
    amplitude = C(constant, constant_value_bounds=(1e-2, 1e6))
    smoothness = RBF(length_scale=length_scale, length_scale_bounds=(1e-2, 1e5))

    # -------- Build pipeline --------
    regressor = GaussianProcessRegressor(
        kernel=amplitude * smoothness,
        alpha=alpha,
        normalize_y=normalize_y,
        n_restarts_optimizer=n_restarts_optimizer,
        random_state=42
    )
    pipeline = Pipeline(steps=[("scaler", MinMaxScaler()), ("gpr", regressor)])

    # -------- Cross-validation --------
    # The scorer yields negated MAPE (higher is better), hence the sign flip below.
    fold_scores = cross_val_score(
        pipeline, X, y,
        scoring="neg_mean_absolute_percentage_error",
        cv=3,
        n_jobs=-1
    )

    return -fold_scores.mean()


In [12]:
# Seeded TPE search over the GPR hyperparameters.
study = optuna.create_study(sampler=TPESampler(seed=42), direction="minimize")

# Queue one hand-picked configuration so it is evaluated as the first trial.
study.enqueue_trial(dict(
    length_scale=1.0,
    constant=1.0,
    alpha=0.01,
    normalize_y=True,
    n_restarts_optimizer=0,
))

# X and y are bound from the notebook namespace at call time.
study.optimize(lambda trial: objective(trial, X, y), n_trials=50)

print("Best trial:")
print(study.best_trial.params)

[I 2025-07-07 11:01:46,172] A new study created in memory with name: no-name-e2f7c17a-0787-4b86-bb8a-41ac51021572
[I 2025-07-07 11:02:02,930] Trial 0 finished with value: 1.043538644229599 and parameters: {'length_scale': 1.0, 'constant': 1.0, 'alpha': 0.01, 'normalize_y': True, 'n_restarts_optimizer': 0}. Best is trial 0 with value: 1.043538644229599.
[I 2025-07-07 11:02:16,282] Trial 1 finished with value: 1.0706228111502767 and parameters: {'length_scale': 0.5611516415334505, 'constant': 71.14476009343416, 'alpha': 0.004570563099801453, 'normalize_y': True, 'n_restarts_optimizer': 0}. Best is trial 0 with value: 1.043538644229599.
[I 2025-07-07 11:02:44,564] Trial 2 finished with value: 1.1117553880325055 and parameters: {'length_scale': 0.13066739238053282, 'constant': 39.67605077052987, 'alpha': 0.0010129197956845735, 'normalize_y': True, 'n_restarts_optimizer': 3}. Best is trial 0 with value: 1.043538644229599.
[I 2025-07-07 11:03:11,538] Trial 3 finished with value: 1.0 and para

Best trial:
{'length_scale': 2.733762389478874, 'constant': 4.737772386034826, 'alpha': 0.07424873426630181, 'normalize_y': False, 'n_restarts_optimizer': 2}
