In [1]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
import os

In [2]:
def compute_volume_weighted_component_features(X, n_components=5, n_properties=10):
    """
    Build the volume-weighted interaction features
    WjPk = Componentj_fraction * Componentj_Propertyk.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide a ``Component{j}_fraction`` column for every component j
        in 1..n_components and a ``Component{j}_Property{k}`` column for every
        property k in 1..n_properties.
    n_components : int, default 5
        Number of components j to cover (1..n_components).
    n_properties : int, default 10
        Number of properties k to cover (1..n_properties).

    Returns
    -------
    pandas.DataFrame
        One column per (j, k) pair, named ``W{j}P{k}`` and ordered
        component-major (W1P1, W1P2, ..., W2P1, ...). With the defaults this
        is 5 * 10 = 50 columns. ``X`` itself is not modified.

    Raises
    ------
    KeyError
        Propagated from the column lookup when a required column is absent.
    """
    features = {}
    for comp_idx in range(1, n_components + 1):
        # The fraction column depends only on the component, so fetch it once
        # per component instead of once per (component, property) pair.
        fraction = X[f'Component{comp_idx}_fraction']
        for prop_idx in range(1, n_properties + 1):
            prop_col = f'Component{comp_idx}_Property{prop_idx}'
            features[f'W{comp_idx}P{prop_idx}'] = fraction * X[prop_col]
    return pd.DataFrame(features)

In [3]:
# Blend properties to model; each entry is used as both the file stem of its
# CSVs and the label column name.
targets = [f"BlendProperty{i}" for i in [3,5,7,8,9,10]]

# Single root for every input and output location. The default is unchanged;
# set BLEND_WORK_DIR to run the notebook from another machine or account
# without editing the paths below.
WORK_DIR = os.environ.get("BLEND_WORK_DIR", "/pscratch/sd/r/ritesh11/temp_dir")

# Holds train/ and val/ sub-directories with <target>_X.csv and <target>_y.csv.
BASE_PATH = f"{WORK_DIR}/dataset"
# Each target's predictor is written to <model_dir>/<target>.
model_dir = f"{WORK_DIR}/autogluon_models"
# Holds one <target>.csv feature-importance table per target.
fi_path = f"{WORK_DIR}/feature_importance"

In [None]:
# Train one AutoGluon regressor per target, using only the features whose
# saved importance exceeds 0.1.
for t in targets:
    # Feature-importance table for this target. The values of its first column
    # are used below as column names, filtered by the "importance" score.
    fi_file = os.path.join(fi_path, f"{t}.csv")
    df = pd.read_csv(fi_file)
    cols = df[df["importance"] > 0.1].iloc[:, 0].tolist()
    if not cols:
        # An empty selection would leave nothing but the label to train on,
        # so stop before any data is loaded or training time is spent.
        raise ValueError(f"No feature with importance > 0.1 for {t} in {fi_file}")

    X_train = pd.read_csv(f"{BASE_PATH}/train/{t}_X.csv")
    y_train = pd.read_csv(f"{BASE_PATH}/train/{t}_y.csv")
    X_val = pd.read_csv(f"{BASE_PATH}/val/{t}_X.csv")
    y_val = pd.read_csv(f"{BASE_PATH}/val/{t}_y.csv")

    # Append the engineered W{j}P{k} columns so they can be selected as well.
    blend_features = compute_volume_weighted_component_features(X_train)
    X_train = pd.concat([X_train, blend_features], axis=1)
    blend_features = compute_volume_weighted_component_features(X_val)
    X_val = pd.concat([X_val, blend_features], axis=1)

    # Name the target and split when a selected feature is unavailable,
    # instead of surfacing a bare pandas KeyError from the selection below.
    for split_name, frame in (("train", X_train), ("val", X_val)):
        missing = [c for c in cols if c not in frame.columns]
        if missing:
            raise KeyError(f"{t}: selected features missing from {split_name} data: {missing}")

    X_train = X_train[cols]
    X_val = X_val[cols]

    predictor = TabularPredictor(
        label=t,
        problem_type="regression",
        eval_metric="mean_absolute_percentage_error",  # alternatives include "rmse", "r2", etc.
        path=os.path.join(model_dir,t),
    )

    predictor.fit(
        # Features and label are joined column-wise; both frames come straight
        # from read_csv, so they share the default row index.
        train_data=pd.concat([X_train,y_train],axis=1),
        tuning_data=pd.concat([X_val,y_val],axis=1),
        excluded_model_types = ['KNN'],
        dynamic_stacking=False,
        auto_stack=True,
        time_limit=6000,  # seconds, applied to each target separately
        presets='best_quality',
        use_bag_holdout = True,
        hyperparameters = 'zeroshot',
        num_gpus=1,
        keep_only_best=True,
    )

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Mar 13 20:09:44 UTC 2025 (330b47d)
CPU Count:          256
Memory Avail:       191.07 GB / 502.97 GB (38.0%)
Disk Space Avail:   15421725.24 GB / 45921523.47 GB (33.6%)
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
  import pkg_resources
Beginning AutoGluon training ... Time limit = 6000s
AutoGluon will save models to "/pscratch/sd/r/ritesh11/temp_dir/autogluon_models/BlendProperty3"
Train Data Rows:    1900
Train Data Columns: 13
Tuning Data Rows:    100
Tuning Data Columns: 13
Label Column:       BlendProperty3
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    195657.43 MB
	Train Data (Original)  Memory 