In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer,QuantileTransformer,RobustScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor

In [4]:
base_path = os.path.join("/", "Volumes", "Extreme SSD", "ShellAi")
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# Load the CSV files
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [6]:
# Creating  weighted columns
base_features = [col for col in train_df.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
targets = [f"BlendProperty{i}" for i in range(1, 11)]
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = train_df[prop_col] * train_df[frac_col]

# Combine
data_with_weighted = pd.concat([train_df[base_features], pd.DataFrame(weighted_data)], axis=1)
data_with_weighted = data_with_weighted.copy()  # De-fragmented copy

In [8]:
scaler = RobustScaler()
scaled_features = scaler.fit_transform(data_with_weighted)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_weighted.columns)


pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(train_df[targets]), columns=targets, index=train_df.index)

# Initial feature selection using Random Forest feature importance
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]
    rf_initial = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_initial.fit(X, y)
    feature_importance = pd.Series(rf_initial.feature_importances_, index=scaled_df_features.columns).sort_values(ascending=False)
    top_features[target] = feature_importance.head(20).index.tolist()

# GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20,]

}


In [32]:
warnings.filterwarnings("ignore",category=UserWarning)
for target in targets:
    print(f"\n🔹 Target: {target}")

    X = scaled_df_features[top_features[target]]
    y = transformed_targets_df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = CatBoostRegressor(verbose=0, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5,
                               scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best params: {grid_search.best_params_}")

    # CV MAPE on transformed targets
    cv_mape = -cross_val_score(best_model, X_train, y_train, cv=5,
                               scoring='neg_mean_absolute_percentage_error').mean() * 100
    print(f"Cross-validated MAPE (transformed): {cv_mape:.2f}%")

    # Predict on test set
    y_pred = best_model.predict(X_test)

    # Inverse transform for original scale MAPE
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)

    y_test_inv = pt.inverse_transform(y_test_full)[targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[targets.index(target)]

    # Safe MAPE
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    print(f"Test MAPE (original scale): {test_mape:.2f}%")


🔹 Target: BlendProperty1
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 77.03%
Test MAPE (original scale): 0.70%

🔹 Target: BlendProperty2
Best params: {'max_depth': 10, 'n_estimators': 100}
Cross-validated MAPE (transformed): 74.20%
Test MAPE (original scale): 1.27%

🔹 Target: BlendProperty3
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 69.74%
Test MAPE (original scale): 0.38%

🔹 Target: BlendProperty4
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 81.47%
Test MAPE (original scale): 34.82%

🔹 Target: BlendProperty5
Best params: {'max_depth': 10, 'n_estimators': 100}
Cross-validated MAPE (transformed): 57.89%
Test MAPE (original scale): 0.71%

🔹 Target: BlendProperty6
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 74.37%
Test MAPE (original scale): 1.51%

🔹 Target: BlendProperty7
Best params: {'max_depth': 10, 'n_estimator

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from catboost import CatBoostRegressor
import warnings
import time # Import time module to measure execution time

warnings.filterwarnings("ignore", category=UserWarning)

# --- ASSUMED PRE-EXISTING DATA (same as before for runnable example) ---
num_samples = 1000
num_features = 50
targets = [f'BlendProperty{i}' for i in range(1, 11)]

base_features = pd.DataFrame(np.random.rand(num_samples, num_features) * 100, columns=[f'feature_{i}' for i in range(num_features)])
scaled_df_features = base_features

top_features = {}
for target in targets:
    num_top = np.random.randint(5, 15)
    top_features[target] = np.random.choice(base_features.columns, num_top, replace=False).tolist()

original_targets_df = pd.DataFrame(index=base_features.index)
for target in targets:
    if target in ['BlendProperty4', 'BlendProperty9']:
        vals = np.random.rand(num_samples) * 50
        vals[np.random.choice(num_samples, int(num_samples * 0.1), replace=False)] = 0
        original_targets_df[target] = vals
    else:
        original_targets_df[target] = np.random.rand(num_samples) * 100 + 1
# --- END ASSUMED PRE-EXISTING DATA ---


# Dictionary to store fitted transformers for each target
target_transformers = {}

# --- Hyperparameter Grid for CatBoost (Reduced for faster initial tuning) ---
param_grid = {
    'n_estimators': [200, 500, 800], # Fewer options, more manageable
    'max_depth': [8, 10, 12],       # Fewer options
    'learning_rate': [0.03, 0.05, 0.08], # Tighter range
    'l2_leaf_reg': [3, 5, 7],           # Fewer options
    'subsample': [0.7, 0.9],            # Fewer options
    'colsample_bylevel': [0.7, 0.9],    # Fewer options
    'min_data_in_leaf': [3, 10],        # Fewer options
}

# Parameters for RandomizedSearchCV (Reduced iterations and CV folds for speed)
n_iterations = 25 # SIGNIFICANTLY REDUCED: Try 20-30 for initial runs
cv_folds_tuning = 3 # REDUCED CV folds for the tuning phase
cv_folds_evaluation = 5 # Use 5 folds for final cross_val_score if time permits

print(f"Starting model training with {n_iterations} iterations and {cv_folds_tuning}-fold CV per target.")
print(f"Total estimated CatBoost fits: {len(targets)} targets * {n_iterations} iter * {cv_folds_tuning} cv = {len(targets) * n_iterations * cv_folds_tuning}")

# --- Main Loop for Each Target ---
for i, target in enumerate(targets):
    start_time_target = time.time() # Start timer for current target
    print(f"\n🔹 Target: {target}")

    # --- 1. Target-Specific Data Preparation & Transformation ---
    X = scaled_df_features[top_features[target]]
    y_original = original_targets_df[target]

    if target in ['BlendProperty4', 'BlendProperty9']:
        transformer = QuantileTransformer(output_distribution='normal', random_state=42)
        print(f"   Using QuantileTransformer for {target}")
    else:
        transformer = PowerTransformer(method='yeo-johnson')
        print(f"   Using PowerTransformer (Yeo-Johnson) for {target}")

    y_transformed = transformer.fit_transform(y_original.values.reshape(-1, 1)).flatten()
    target_transformers[target] = transformer

    X_train, X_test, y_train_transformed, y_test_transformed = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    # y_test_original is needed for original scale MAPE
    _, _, _, y_test_original = train_test_split(
        X, y_original, test_size=0.2, random_state=42
    )

    # --- 2. Model Training with RandomizedSearchCV ---
    model = CatBoostRegressor(verbose=0, random_state=42)

    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=n_iterations,
        cv=cv_folds_tuning, # Use reduced CV for tuning
        scoring='neg_mean_absolute_percentage_error',
        n_jobs=-1,
        random_state=42,
        verbose=0
    )
    random_search.fit(X_train, y_train_transformed)

    best_model = random_search.best_estimator_
    print(f"Best params: {random_search.best_params_}")

    # --- 3. Evaluation on Transformed Data (Cross-validated) ---
    cv_mape_transformed = -cross_val_score(
        best_model,
        X_train,
        y_train_transformed,
        cv=cv_folds_evaluation, # Use full CV for a more robust estimate here
        scoring='neg_mean_absolute_percentage_error'
    ).mean() * 100
    print(f"Cross-validated MAPE (transformed): {cv_mape_transformed:.2f}%")

    # --- 4. Prediction and Inverse Transformation for Original Scale MAPE ---
    y_pred_transformed = best_model.predict(X_test)

    y_test_inv = target_transformers[target].inverse_transform(y_test_transformed.reshape(-1, 1)).flatten()
    y_pred_inv = target_transformers[target].inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()

    # --- 5. Safe MAPE Calculation on Original Scale ---
    epsilon = 1e-6
    mask = y_test_inv != 0

    if mask.any():
        test_mape_original = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / (y_test_inv[mask] + epsilon)) * 100)
    else:
        test_mape_original = 0.0

    print(f"Test MAPE (original scale): {test_mape_original:.2f}%")

    # --- Optional: Detailed Analysis for BlendProperty4 and BlendProperty9 ---
    if target in ['BlendProperty4', 'BlendProperty9']:
        print(f"   Consider a two-stage model for {target} if many zeros/near-zeros are present.")
        print("   Examine feature importance specific to this target's large errors.")
        # Add your plotting and specific error analysis here if you want.
        # Example plotting code would go here:
        # import matplotlib.pyplot as plt
        # plt.figure(figsize=(10, 5))
        # plt.subplot(1, 2, 1)
        # y_original.hist(bins=50, alpha=0.7, color='skyblue')
        # plt.title(f'Original Distribution of {target}')
        # plt.subplot(1, 2, 2)
        # pd.Series(y_transformed).hist(bins=50, alpha=0.7, color='lightcoral')
        # plt.title(f'Transformed Distribution of {target}')
        # plt.tight_layout()
        # plt.show()


    end_time_target = time.time() # End timer for current target
    print(f"Time taken for {target}: {end_time_target - start_time_target:.2f} seconds")

# --- AFTER THE LOOP: ENSEMBLE/FINAL PREDICTION STRATEGY ---
print("\n--- Model Training Completed for all Targets ---")
print("\nNext Steps for Further Optimization:")
print("1. After finding good general hyperparameter ranges with reduced iterations/CV, you can increase `n_iterations` for a more thorough search or focus on specific promising ranges.")
print("2. Implement Ensemble Methods (e.g., averaging predictions from multiple best models, or stacking).")
print("3. Deep dive into feature engineering, especially for 'BlendProperty4' and 'BlendProperty9'.")
print("4. Consider a two-stage model (classification for zero/non-zero, then regression) for targets with many zeros.")
print("5. Experiment with different random seeds for training to check stability and average results.")
print("6. If still too slow, consider LightGBM or XGBoost which are often faster than CatBoost, though CatBoost has advantages for categorical features.")



Starting model training with 25 iterations and 3-fold CV per target.
Total estimated CatBoost fits: 10 targets * 25 iter * 3 cv = 750

🔹 Target: BlendProperty1
   Using PowerTransformer (Yeo-Johnson) for BlendProperty1
Best params: {'subsample': 0.9, 'n_estimators': 200, 'min_data_in_leaf': 10, 'max_depth': 12, 'learning_rate': 0.03, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.7}
Cross-validated MAPE (transformed): 457.11%
Test MAPE (original scale): 152.24%
Time taken for BlendProperty1: 478.28 seconds

🔹 Target: BlendProperty2
   Using PowerTransformer (Yeo-Johnson) for BlendProperty2


KeyboardInterrupt: 

In [42]:
!pip install 'optuna-integration[catboost]'

Collecting optuna-integration[catboost]
  Downloading optuna_integration-4.4.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.4.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.9/98.9 kB[0m [31m376.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.4.0


In [46]:
import optuna
from catboost import CatBoostRegressor, Pool
from optuna.integration import CatBoostPruningCallback  



In [113]:
import optuna
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
best_models = {}  # ← define this before the loop

# Inside the for-loop after training
 # ← define this before the loop

# Inside the for-loop after training


# Loop through each target
for target in targets:
    print(f"\n🔹 Optimizing Target: {target}")

    X = scaled_df_features[top_features[target]]
    y = transformed_targets_df[target]

    # Split for Optuna
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

    # Define objective function
    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0),
            "loss_function": "MAPE",
            "verbose": 0
        }
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        return mean_absolute_percentage_error(y_valid, preds)

    # Run Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=40, show_progress_bar=True)

    best_params = study.best_params
    print(f"✅ Best params for {target}: {best_params}")

    # Train on full training data
    best_model = CatBoostRegressor(**best_params)
    best_model.fit(X_train_full, y_train_full)

    # Predict on test
    y_pred = best_model.predict(X_test)

    # Inverse transform for original scale MAPE
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)

    y_test_inv = pt.inverse_transform(y_test_full)[targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[targets.index(target)]

    # Safe MAPE
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    best_models[target] = best_model
    print(f"📉 Test MAPE (original scale): {test_mape:.2f}%")



[I 2025-07-18 00:07:45,094] A new study created in memory with name: no-name-ef6cf9ed-e5ec-42cf-b5a3-9c458ff62d33



🔹 Optimizing Target: BlendProperty1


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:07:46,658] Trial 0 finished with value: 0.5067652493825582 and parameters: {'iterations': 321, 'depth': 6, 'learning_rate': 0.10274099844938712, 'l2_leaf_reg': 9.111231737434888, 'random_strength': 3.598139456724781}. Best is trial 0 with value: 0.5067652493825582.
[I 2025-07-18 00:07:48,044] Trial 1 finished with value: 0.7275947526556562 and parameters: {'iterations': 970, 'depth': 7, 'learning_rate': 0.20145532988306608, 'l2_leaf_reg': 7.9132491596575, 'random_strength': 6.700756814461257}. Best is trial 0 with value: 0.5067652493825582.
[I 2025-07-18 00:07:50,021] Trial 2 finished with value: 0.42561622690752865 and parameters: {'iterations': 853, 'depth': 8, 'learning_rate': 0.0390034837581032, 'l2_leaf_reg': 6.552136426699168, 'random_strength': 1.5558232006048653}. Best is trial 2 with value: 0.42561622690752865.
[I 2025-07-18 00:07:50,613] Trial 3 finished with value: 0.4747965924882081 and parameters: {'iterations': 987, 'depth': 5, 'learning_rate': 0.14687763

[I 2025-07-18 00:08:21,119] A new study created in memory with name: no-name-cd933458-0ba9-4118-911c-58609b88b1a8


467:	learn: 0.0472186	total: 187ms	remaining: 25.9ms
468:	learn: 0.0471692	total: 187ms	remaining: 25.6ms
469:	learn: 0.0471605	total: 188ms	remaining: 25.2ms
470:	learn: 0.0471192	total: 188ms	remaining: 24.8ms
471:	learn: 0.0470200	total: 189ms	remaining: 24.4ms
472:	learn: 0.0469022	total: 189ms	remaining: 24ms
473:	learn: 0.0468370	total: 189ms	remaining: 23.6ms
474:	learn: 0.0467618	total: 190ms	remaining: 23.2ms
475:	learn: 0.0466619	total: 190ms	remaining: 22.8ms
476:	learn: 0.0465854	total: 191ms	remaining: 22.4ms
477:	learn: 0.0465756	total: 191ms	remaining: 22ms
478:	learn: 0.0465643	total: 192ms	remaining: 21.6ms
479:	learn: 0.0464944	total: 192ms	remaining: 21.2ms
480:	learn: 0.0463982	total: 192ms	remaining: 20.8ms
481:	learn: 0.0463216	total: 193ms	remaining: 20.4ms
482:	learn: 0.0462143	total: 193ms	remaining: 20ms
483:	learn: 0.0461605	total: 193ms	remaining: 19.6ms
484:	learn: 0.0460871	total: 194ms	remaining: 19.2ms
485:	learn: 0.0460716	total: 194ms	remaining: 18.8ms

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:08:21,353] Trial 0 finished with value: 0.45141563400404217 and parameters: {'iterations': 370, 'depth': 5, 'learning_rate': 0.06215682059147865, 'l2_leaf_reg': 5.356754848165213, 'random_strength': 7.375693004821569}. Best is trial 0 with value: 0.45141563400404217.
[I 2025-07-18 00:08:26,108] Trial 1 finished with value: 0.5518742537319926 and parameters: {'iterations': 698, 'depth': 10, 'learning_rate': 0.0565244494897019, 'l2_leaf_reg': 2.8505939871536445, 'random_strength': 3.2156326993799507}. Best is trial 0 with value: 0.45141563400404217.
[I 2025-07-18 00:08:26,507] Trial 2 finished with value: 0.42824201864162675 and parameters: {'iterations': 953, 'depth': 4, 'learning_rate': 0.23812774716753277, 'l2_leaf_reg': 9.859452836347275, 'random_strength': 6.2330567964175625}. Best is trial 2 with value: 0.42824201864162675.
[I 2025-07-18 00:08:28,151] Trial 3 finished with value: 0.48778286019880157 and parameters: {'iterations': 391, 'depth': 9, 'learning_rate': 0

[I 2025-07-18 00:08:48,291] A new study created in memory with name: no-name-a7627b12-7575-43d1-82be-87128d6b2b5a


469:	learn: 0.0580524	total: 187ms	remaining: 64.9ms
470:	learn: 0.0579798	total: 188ms	remaining: 64.6ms
471:	learn: 0.0579580	total: 188ms	remaining: 64.2ms
472:	learn: 0.0577786	total: 189ms	remaining: 63.8ms
473:	learn: 0.0576415	total: 189ms	remaining: 63.4ms
474:	learn: 0.0575197	total: 189ms	remaining: 63ms
475:	learn: 0.0574765	total: 190ms	remaining: 62.6ms
476:	learn: 0.0574266	total: 190ms	remaining: 62.2ms
477:	learn: 0.0573628	total: 191ms	remaining: 61.8ms
478:	learn: 0.0572929	total: 191ms	remaining: 61.4ms
479:	learn: 0.0571842	total: 191ms	remaining: 61ms
480:	learn: 0.0571597	total: 192ms	remaining: 60.6ms
481:	learn: 0.0570732	total: 192ms	remaining: 60.3ms
482:	learn: 0.0570363	total: 193ms	remaining: 59.9ms
483:	learn: 0.0570155	total: 193ms	remaining: 59.5ms
484:	learn: 0.0569365	total: 194ms	remaining: 59.1ms
485:	learn: 0.0568474	total: 194ms	remaining: 58.7ms
486:	learn: 0.0567401	total: 194ms	remaining: 58.3ms
487:	learn: 0.0565761	total: 195ms	remaining: 57.9

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:08:50,696] Trial 0 finished with value: 0.8735894552704068 and parameters: {'iterations': 983, 'depth': 8, 'learning_rate': 0.14419223911219087, 'l2_leaf_reg': 2.6775148784959626, 'random_strength': 0.41665109204806855}. Best is trial 0 with value: 0.8735894552704068.
[I 2025-07-18 00:08:51,102] Trial 1 finished with value: 0.7467416263682202 and parameters: {'iterations': 424, 'depth': 6, 'learning_rate': 0.0961768623384609, 'l2_leaf_reg': 7.648420537807846, 'random_strength': 0.39537785142907345}. Best is trial 1 with value: 0.7467416263682202.
[I 2025-07-18 00:08:54,163] Trial 2 finished with value: 0.9683624299721671 and parameters: {'iterations': 402, 'depth': 10, 'learning_rate': 0.2122084774694994, 'l2_leaf_reg': 1.7272605491788726, 'random_strength': 2.461321218309287}. Best is trial 1 with value: 0.7467416263682202.
[I 2025-07-18 00:08:54,326] Trial 3 finished with value: 0.8198771129542225 and parameters: {'iterations': 366, 'depth': 4, 'learning_rate': 0.285

[I 2025-07-18 00:09:11,740] A new study created in memory with name: no-name-abd9d959-b125-4850-8bcb-779195948504


[I 2025-07-18 00:09:11,583] Trial 39 finished with value: 0.6365046927268448 and parameters: {'iterations': 432, 'depth': 6, 'learning_rate': 0.19454833057909277, 'l2_leaf_reg': 2.2400684450375463, 'random_strength': 2.1282675549375853}. Best is trial 31 with value: 0.590441227808445.
✅ Best params for BlendProperty3: {'iterations': 326, 'depth': 4, 'learning_rate': 0.14268566560064322, 'l2_leaf_reg': 4.607910217297233, 'random_strength': 1.2225741031322643}
0:	learn: 0.9110078	total: 995us	remaining: 324ms
1:	learn: 0.8511468	total: 1.39ms	remaining: 226ms
2:	learn: 0.7848108	total: 1.77ms	remaining: 191ms
3:	learn: 0.7280245	total: 2.11ms	remaining: 170ms
4:	learn: 0.6748018	total: 2.5ms	remaining: 161ms
5:	learn: 0.6354860	total: 2.98ms	remaining: 159ms
6:	learn: 0.6016263	total: 3.33ms	remaining: 152ms
7:	learn: 0.5730571	total: 3.7ms	remaining: 147ms
8:	learn: 0.5510449	total: 4.08ms	remaining: 144ms
9:	learn: 0.5271883	total: 4.45ms	remaining: 140ms
10:	learn: 0.5040952	total: 4.

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:09:16,021] Trial 0 finished with value: 0.5393294940596556 and parameters: {'iterations': 928, 'depth': 9, 'learning_rate': 0.278934835203864, 'l2_leaf_reg': 9.906975481438508, 'random_strength': 0.6006616468985183}. Best is trial 0 with value: 0.5393294940596556.
[I 2025-07-18 00:09:16,358] Trial 1 finished with value: 0.3742714630391206 and parameters: {'iterations': 501, 'depth': 5, 'learning_rate': 0.23106801431363144, 'l2_leaf_reg': 7.847784838283391, 'random_strength': 2.8481749214784116}. Best is trial 1 with value: 0.3742714630391206.
[I 2025-07-18 00:09:21,350] Trial 2 finished with value: 0.733328989356502 and parameters: {'iterations': 627, 'depth': 10, 'learning_rate': 0.26071474496814906, 'l2_leaf_reg': 5.019238574105737, 'random_strength': 7.480310510319101}. Best is trial 1 with value: 0.3742714630391206.
[I 2025-07-18 00:09:22,166] Trial 3 finished with value: 0.5699777768719367 and parameters: {'iterations': 317, 'depth': 8, 'learning_rate': 0.21323986

[I 2025-07-18 00:09:40,930] A new study created in memory with name: no-name-8a3318d6-ce5a-4247-93bc-0a502ed40c4d


[I 2025-07-18 00:09:40,750] Trial 39 finished with value: 0.25745440045493934 and parameters: {'iterations': 523, 'depth': 5, 'learning_rate': 0.07258411714321847, 'l2_leaf_reg': 9.399241626318883, 'random_strength': 2.714241844502104}. Best is trial 35 with value: 0.22745817985491726.
✅ Best params for BlendProperty4: {'iterations': 359, 'depth': 4, 'learning_rate': 0.1359593590368422, 'l2_leaf_reg': 7.740442580794736, 'random_strength': 1.4062161069860657}
0:	learn: 0.9342070	total: 430us	remaining: 154ms
1:	learn: 0.8804514	total: 814us	remaining: 145ms
2:	learn: 0.8286099	total: 1.2ms	remaining: 142ms
3:	learn: 0.7813556	total: 1.58ms	remaining: 140ms
4:	learn: 0.7379694	total: 1.98ms	remaining: 140ms
5:	learn: 0.7065306	total: 2.36ms	remaining: 139ms
6:	learn: 0.6740078	total: 2.76ms	remaining: 139ms
7:	learn: 0.6481649	total: 3.19ms	remaining: 140ms
8:	learn: 0.6293543	total: 3.59ms	remaining: 140ms
9:	learn: 0.6029963	total: 3.99ms	remaining: 139ms
10:	learn: 0.5828515	total: 4.

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:09:45,064] Trial 0 finished with value: 0.45252241135388555 and parameters: {'iterations': 792, 'depth': 9, 'learning_rate': 0.128799006840202, 'l2_leaf_reg': 7.182110766005853, 'random_strength': 3.215795387042607}. Best is trial 0 with value: 0.45252241135388555.
[I 2025-07-18 00:09:45,278] Trial 1 finished with value: 0.3218031565623495 and parameters: {'iterations': 297, 'depth': 5, 'learning_rate': 0.21344468545476258, 'l2_leaf_reg': 5.446836915571096, 'random_strength': 5.661044312831758}. Best is trial 1 with value: 0.3218031565623495.
[I 2025-07-18 00:09:49,678] Trial 2 finished with value: 0.3454081828081705 and parameters: {'iterations': 859, 'depth': 9, 'learning_rate': 0.08754703152476458, 'l2_leaf_reg': 5.2716122474954785, 'random_strength': 1.5403500298527446}. Best is trial 1 with value: 0.3218031565623495.
[I 2025-07-18 00:09:50,984] Trial 3 finished with value: 0.5291038833754289 and parameters: {'iterations': 473, 'depth': 8, 'learning_rate': 0.149037

[I 2025-07-18 00:10:20,921] A new study created in memory with name: no-name-52020386-60f4-41ea-bb0a-d35331426229


399:	learn: 0.0923827	total: 187ms	remaining: 135ms
400:	learn: 0.0922830	total: 188ms	remaining: 134ms
401:	learn: 0.0921850	total: 188ms	remaining: 134ms
402:	learn: 0.0920886	total: 189ms	remaining: 133ms
403:	learn: 0.0919179	total: 189ms	remaining: 133ms
404:	learn: 0.0917392	total: 190ms	remaining: 132ms
405:	learn: 0.0916447	total: 190ms	remaining: 132ms
406:	learn: 0.0914859	total: 191ms	remaining: 132ms
407:	learn: 0.0912627	total: 191ms	remaining: 131ms
408:	learn: 0.0910772	total: 191ms	remaining: 131ms
409:	learn: 0.0908847	total: 192ms	remaining: 130ms
410:	learn: 0.0907926	total: 192ms	remaining: 130ms
411:	learn: 0.0906225	total: 193ms	remaining: 129ms
412:	learn: 0.0905321	total: 193ms	remaining: 129ms
413:	learn: 0.0903644	total: 194ms	remaining: 128ms
414:	learn: 0.0901754	total: 194ms	remaining: 128ms
415:	learn: 0.0899872	total: 195ms	remaining: 127ms
416:	learn: 0.0898391	total: 195ms	remaining: 127ms
417:	learn: 0.0896236	total: 196ms	remaining: 126ms
418:	learn: 

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:10:21,472] Trial 0 finished with value: 0.4897099892330835 and parameters: {'iterations': 789, 'depth': 5, 'learning_rate': 0.27335341832705834, 'l2_leaf_reg': 5.053129622931514, 'random_strength': 0.9720907382117924}. Best is trial 0 with value: 0.4897099892330835.
[I 2025-07-18 00:10:28,951] Trial 1 finished with value: 0.8707164681697591 and parameters: {'iterations': 863, 'depth': 10, 'learning_rate': 0.041254021553368436, 'l2_leaf_reg': 2.5182533089137413, 'random_strength': 6.188271366430312}. Best is trial 0 with value: 0.4897099892330835.
[I 2025-07-18 00:10:29,623] Trial 2 finished with value: 0.7504618248123194 and parameters: {'iterations': 420, 'depth': 7, 'learning_rate': 0.21303913106386121, 'l2_leaf_reg': 6.210269701682258, 'random_strength': 2.4795874935171076}. Best is trial 0 with value: 0.4897099892330835.
[I 2025-07-18 00:10:33,019] Trial 3 finished with value: 0.6473577999295477 and parameters: {'iterations': 685, 'depth': 9, 'learning_rate': 0.062

[I 2025-07-18 00:11:20,616] A new study created in memory with name: no-name-8834df38-9d2a-420e-adfd-cd81a615b24d


830:	learn: 0.0269236	total: 380ms	remaining: 913us
831:	learn: 0.0268950	total: 380ms	remaining: 457us
832:	learn: 0.0268505	total: 381ms	remaining: 0us
📉 Test MAPE (original scale): 0.27%

🔹 Optimizing Target: BlendProperty7


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:11:21,554] Trial 0 finished with value: 0.9683825415288675 and parameters: {'iterations': 108, 'depth': 10, 'learning_rate': 0.2709436086724174, 'l2_leaf_reg': 7.834000628938723, 'random_strength': 2.987108459535852}. Best is trial 0 with value: 0.9683825415288675.
[I 2025-07-18 00:11:21,980] Trial 1 finished with value: 0.8192194570673064 and parameters: {'iterations': 881, 'depth': 4, 'learning_rate': 0.042742536897849265, 'l2_leaf_reg': 7.849564396755135, 'random_strength': 0.17794717873729202}. Best is trial 1 with value: 0.8192194570673064.
[I 2025-07-18 00:11:25,154] Trial 2 finished with value: 1.0588326480298043 and parameters: {'iterations': 654, 'depth': 9, 'learning_rate': 0.1528605244175317, 'l2_leaf_reg': 3.723555533236987, 'random_strength': 7.711247251437167}. Best is trial 1 with value: 0.8192194570673064.
[I 2025-07-18 00:11:28,767] Trial 3 finished with value: 0.807413985844397 and parameters: {'iterations': 420, 'depth': 10, 'learning_rate': 0.085109

[I 2025-07-18 00:12:01,911] A new study created in memory with name: no-name-b1c95918-b845-4e5a-b626-ff4331f8a161


425:	learn: 0.0657213	total: 188ms	remaining: 141ms
426:	learn: 0.0655983	total: 188ms	remaining: 141ms
427:	learn: 0.0654905	total: 189ms	remaining: 140ms
428:	learn: 0.0652994	total: 189ms	remaining: 140ms
429:	learn: 0.0652401	total: 189ms	remaining: 139ms
430:	learn: 0.0651591	total: 190ms	remaining: 139ms
431:	learn: 0.0650166	total: 190ms	remaining: 138ms
432:	learn: 0.0649510	total: 191ms	remaining: 138ms
433:	learn: 0.0648191	total: 191ms	remaining: 138ms
434:	learn: 0.0648106	total: 192ms	remaining: 137ms
435:	learn: 0.0647999	total: 192ms	remaining: 137ms
436:	learn: 0.0647209	total: 193ms	remaining: 136ms
437:	learn: 0.0645894	total: 193ms	remaining: 136ms
438:	learn: 0.0644501	total: 194ms	remaining: 135ms
439:	learn: 0.0643345	total: 194ms	remaining: 135ms
440:	learn: 0.0641757	total: 194ms	remaining: 134ms
441:	learn: 0.0640136	total: 195ms	remaining: 134ms
442:	learn: 0.0637199	total: 195ms	remaining: 134ms
443:	learn: 0.0636112	total: 196ms	remaining: 133ms
444:	learn: 

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:12:02,136] Trial 0 finished with value: 0.611117274596042 and parameters: {'iterations': 424, 'depth': 4, 'learning_rate': 0.04275342972583276, 'l2_leaf_reg': 6.343798021577319, 'random_strength': 2.7308685635485688}. Best is trial 0 with value: 0.611117274596042.
[I 2025-07-18 00:12:02,733] Trial 1 finished with value: 0.7819756633627595 and parameters: {'iterations': 586, 'depth': 6, 'learning_rate': 0.2946035527014231, 'l2_leaf_reg': 1.751882229421513, 'random_strength': 6.44203413038598}. Best is trial 0 with value: 0.611117274596042.
[I 2025-07-18 00:12:03,366] Trial 2 finished with value: 0.7938213186451017 and parameters: {'iterations': 398, 'depth': 7, 'learning_rate': 0.0996385048092317, 'l2_leaf_reg': 4.107849611958604, 'random_strength': 4.951205604082606}. Best is trial 0 with value: 0.611117274596042.
[I 2025-07-18 00:12:03,947] Trial 3 finished with value: 0.9569284265615776 and parameters: {'iterations': 850, 'depth': 5, 'learning_rate': 0.03744908833872

[I 2025-07-18 00:13:39,739] A new study created in memory with name: no-name-1490c998-6f74-4f53-90a7-6abb8cc3389a


716:	learn: 0.0077861	total: 1.98s	remaining: 2.76ms
717:	learn: 0.0077490	total: 1.98s	remaining: 0us
📉 Test MAPE (original scale): 0.35%

🔹 Optimizing Target: BlendProperty9


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:13:41,258] Trial 0 finished with value: 0.9514721858253481 and parameters: {'iterations': 914, 'depth': 7, 'learning_rate': 0.2316833145524914, 'l2_leaf_reg': 4.256453898201748, 'random_strength': 4.291325044198839}. Best is trial 0 with value: 0.9514721858253481.
[I 2025-07-18 00:13:46,002] Trial 1 finished with value: 0.8359393828652275 and parameters: {'iterations': 515, 'depth': 10, 'learning_rate': 0.05531472168798561, 'l2_leaf_reg': 9.083556181783651, 'random_strength': 5.372816714729701}. Best is trial 1 with value: 0.8359393828652275.
[I 2025-07-18 00:13:46,426] Trial 2 finished with value: 0.9221290556513667 and parameters: {'iterations': 389, 'depth': 6, 'learning_rate': 0.25847407219024326, 'l2_leaf_reg': 1.2701358288749107, 'random_strength': 8.67357723432813}. Best is trial 1 with value: 0.8359393828652275.
[I 2025-07-18 00:13:51,947] Trial 3 finished with value: 1.2391882170341144 and parameters: {'iterations': 609, 'depth': 10, 'learning_rate': 0.2882888

[I 2025-07-18 00:14:33,253] A new study created in memory with name: no-name-a4b0226e-5932-4d7d-ae62-b99b05a2409e


399:	learn: 0.2280485	total: 187ms	remaining: 153ms
400:	learn: 0.2279281	total: 187ms	remaining: 153ms
401:	learn: 0.2275699	total: 188ms	remaining: 152ms
402:	learn: 0.2272055	total: 188ms	remaining: 152ms
403:	learn: 0.2269156	total: 189ms	remaining: 151ms
404:	learn: 0.2264415	total: 189ms	remaining: 151ms
405:	learn: 0.2262029	total: 190ms	remaining: 150ms
406:	learn: 0.2258259	total: 190ms	remaining: 150ms
407:	learn: 0.2255015	total: 190ms	remaining: 149ms
408:	learn: 0.2251538	total: 191ms	remaining: 149ms
409:	learn: 0.2247886	total: 191ms	remaining: 148ms
410:	learn: 0.2246702	total: 192ms	remaining: 148ms
411:	learn: 0.2245353	total: 192ms	remaining: 148ms
412:	learn: 0.2242154	total: 193ms	remaining: 147ms
413:	learn: 0.2238297	total: 193ms	remaining: 147ms
414:	learn: 0.2234824	total: 194ms	remaining: 146ms
415:	learn: 0.2232002	total: 194ms	remaining: 146ms
416:	learn: 0.2230987	total: 195ms	remaining: 145ms
417:	learn: 0.2228007	total: 195ms	remaining: 145ms
418:	learn: 

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 00:14:37,960] Trial 0 finished with value: 1.532469490182545 and parameters: {'iterations': 538, 'depth': 10, 'learning_rate': 0.2129254217012503, 'l2_leaf_reg': 9.455917748499916, 'random_strength': 5.310116295466133}. Best is trial 0 with value: 1.532469490182545.
[I 2025-07-18 00:14:38,594] Trial 1 finished with value: 0.445942040474338 and parameters: {'iterations': 901, 'depth': 5, 'learning_rate': 0.11327330446951976, 'l2_leaf_reg': 9.693602854326658, 'random_strength': 3.5302100035235493}. Best is trial 1 with value: 0.445942040474338.
[I 2025-07-18 00:14:42,946] Trial 2 finished with value: 0.8940885018506355 and parameters: {'iterations': 882, 'depth': 9, 'learning_rate': 0.030638528661154844, 'l2_leaf_reg': 5.906287150560765, 'random_strength': 9.261862680183807}. Best is trial 1 with value: 0.445942040474338.
[I 2025-07-18 00:14:43,144] Trial 3 finished with value: 0.6818464931273878 and parameters: {'iterations': 379, 'depth': 4, 'learning_rate': 0.28070418770

In [115]:
print("Trained models:", best_models.keys())
missing = [t for t in targets if t not in best_models]
print("Missing models:", missing)

weighted_test = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_test[weighted_col] = test_df[prop_col] * test_df[frac_col]

test_with_weighted = pd.concat([test_df[base_features], pd.DataFrame(weighted_test)], axis=1)
test_scaled = pd.DataFrame(scaler.transform(test_with_weighted), columns=test_with_weighted.columns)

# Predict with Optuna models


Trained models: dict_keys(['BlendProperty1', 'BlendProperty2', 'BlendProperty3', 'BlendProperty4', 'BlendProperty5', 'BlendProperty6', 'BlendProperty7', 'BlendProperty8', 'BlendProperty9', 'BlendProperty10'])
Missing models: []


In [117]:
submission_preds=[]
for target in targets:
    model=best_models[target]
    expected_features=model.feature_names_
    X_test_target=test_scaled[expected_features]
    pred_transformed=model.predict(X_test_target)
    pred_full=pd.DataFrame(np.column_stack([pred_transformed]+[np.zeros(len(pred_transformed))]*(len(targets)-1)),columns=targets,index=test_df.index)
    pred_original=pt.inverse_transform(pred_full)[:,targets.index(target)]
    submission_preds.append(pred_original)

submission=pd.DataFrame(np.column_stack(submission_preds),columns=targets,index=test_df.index)
submission.to_csv("submission4.csv",index=False)
print("Submission file created: submission4.csv")

Submission file created: submission4.csv
