In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer,QuantileTransformer,RobustScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostRegressor
import optuna
from catboost import CatBoostRegressor, Pool
from optuna.integration import CatBoostPruningCallback  



In [51]:
base_path = os.path.join("/", "Volumes", "Extreme SSD", "ShellAi")
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# Load the CSV files
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [53]:
# Creating  weighted columns
base_features = [col for col in train_df.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
targets = [f"BlendProperty{i}" for i in range(1, 11)]
weighted_data = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data[weighted_col] = train_df[prop_col] * train_df[frac_col]

# Combine
data_with_weighted = pd.concat([train_df[base_features], pd.DataFrame(weighted_data)], axis=1)
data_with_weighted = data_with_weighted.copy()  # De-fragmented copy

In [55]:
scaler = RobustScaler()
scaled_features = scaler.fit_transform(data_with_weighted)
scaled_df_features = pd.DataFrame(scaled_features, columns=data_with_weighted.columns)


pt = PowerTransformer(method='yeo-johnson')
transformed_targets_df = pd.DataFrame(pt.fit_transform(train_df[targets]), columns=targets, index=train_df.index)

# Initial feature selection using Random Forest feature importance
top_features = {}
for target in targets:
    X = scaled_df_features
    y = transformed_targets_df[target]
    rf_initial = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_initial.fit(X, y)
    feature_importance = pd.Series(rf_initial.feature_importances_, index=scaled_df_features.columns).sort_values(ascending=False)
    top_features[target] = feature_importance.head(20).index.tolist()

# GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20,]

}


In [57]:
warnings.filterwarnings("ignore",category=UserWarning)
for target in targets:
    print(f"\n🔹 Target: {target}")

    X = scaled_df_features[top_features[target]]
    y = transformed_targets_df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = CatBoostRegressor(verbose=0, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5,
                               scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best params: {grid_search.best_params_}")

    # CV MAPE on transformed targets
    cv_mape = -cross_val_score(best_model, X_train, y_train, cv=5,
                               scoring='neg_mean_absolute_percentage_error').mean() * 100
    print(f"Cross-validated MAPE (transformed): {cv_mape:.2f}%")

    # Predict on test set
    y_pred = best_model.predict(X_test)

    # Inverse transform for original scale MAPE
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)

    y_test_inv = pt.inverse_transform(y_test_full)[targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[targets.index(target)]

    # Safe MAPE
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    print(f"Test MAPE (original scale): {test_mape:.2f}%")


🔹 Target: BlendProperty1
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 77.03%
Test MAPE (original scale): 0.70%

🔹 Target: BlendProperty2
Best params: {'max_depth': 10, 'n_estimators': 100}
Cross-validated MAPE (transformed): 74.20%
Test MAPE (original scale): 1.27%

🔹 Target: BlendProperty3
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 69.74%
Test MAPE (original scale): 0.38%

🔹 Target: BlendProperty4
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 81.47%
Test MAPE (original scale): 34.82%

🔹 Target: BlendProperty5
Best params: {'max_depth': 10, 'n_estimators': 100}
Cross-validated MAPE (transformed): 57.89%
Test MAPE (original scale): 0.71%

🔹 Target: BlendProperty6
Best params: {'max_depth': 10, 'n_estimators': 200}
Cross-validated MAPE (transformed): 74.37%
Test MAPE (original scale): 1.51%

🔹 Target: BlendProperty7
Best params: {'max_depth': 10, 'n_estimator

In [59]:
import optuna
from catboost import CatBoostRegressor, Pool
from optuna.integration import CatBoostPruningCallback  



In [65]:
import optuna
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
best_models = {}  # ← define this before the loop

# Inside the for-loop after training
 # ← define this before the loop

# Inside the for-loop after training


# Loop through each target
for target in targets:
    print(f"\n🔹 Optimizing Target: {target}")

    X = scaled_df_features[top_features[target]]
    y = transformed_targets_df[target]

    # Split for Optuna
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

    # Define objective function
    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0),
            "loss_function": "MAPE",
            "verbose": 0
        }
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        return mean_absolute_percentage_error(y_valid, preds)

    # Run Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=40, show_progress_bar=True)

    best_params = study.best_params
    print(f"✅ Best params for {target}: {best_params}")

    # Train on full training data
    best_model = CatBoostRegressor(**best_params)
    best_model.fit(X_train_full, y_train_full)

    # Predict on test
    y_pred = best_model.predict(X_test)

    # Inverse transform for original scale MAPE
    y_test_full = pd.DataFrame(np.column_stack([y_test] + [np.zeros(len(y_test))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)
    y_pred_full = pd.DataFrame(np.column_stack([y_pred] + [np.zeros(len(y_pred))]*(len(targets)-1)),
                               columns=targets, index=X_test.index)

    y_test_inv = pt.inverse_transform(y_test_full)[targets.index(target)]
    y_pred_inv = pt.inverse_transform(y_pred_full)[targets.index(target)]

    # Safe MAPE
    mask = y_test_inv != 0
    test_mape = np.mean(np.abs((y_test_inv[mask] - y_pred_inv[mask]) / y_test_inv[mask]) * 100) if mask.any() else 0.0
    best_models[target] = best_model
    print(f"📉 Test MAPE (original scale): {test_mape:.2f}%")



[I 2025-07-18 21:56:29,267] A new study created in memory with name: no-name-9211d18e-1804-4974-a25e-307ebd48e6db



🔹 Optimizing Target: BlendProperty1


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 21:56:30,117] Trial 0 finished with value: 0.5928647771089773 and parameters: {'iterations': 577, 'depth': 7, 'learning_rate': 0.16646430134710483, 'l2_leaf_reg': 4.085085903168572, 'random_strength': 1.729196130398932}. Best is trial 0 with value: 0.5928647771089773.
[I 2025-07-18 21:56:30,444] Trial 1 finished with value: 0.32901394301072734 and parameters: {'iterations': 725, 'depth': 4, 'learning_rate': 0.05717806760131673, 'l2_leaf_reg': 7.478248408001809, 'random_strength': 2.3754029664715888}. Best is trial 1 with value: 0.32901394301072734.
[I 2025-07-18 21:56:30,507] Trial 2 finished with value: 0.9997409923388499 and parameters: {'iterations': 119, 'depth': 4, 'learning_rate': 0.0573837215674319, 'l2_leaf_reg': 6.9681479202023375, 'random_strength': 7.0363807003214065}. Best is trial 1 with value: 0.32901394301072734.
[I 2025-07-18 21:56:32,807] Trial 3 finished with value: 0.8544460228091026 and parameters: {'iterations': 920, 'depth': 8, 'learning_rate': 0.217

[I 2025-07-18 21:57:03,114] A new study created in memory with name: no-name-83799fd9-b5f8-4771-b74d-51d7c92804a6


545:	learn: 0.0962336	total: 381ms	remaining: 119ms
546:	learn: 0.0961840	total: 382ms	remaining: 118ms
547:	learn: 0.0960447	total: 383ms	remaining: 117ms
548:	learn: 0.0959515	total: 384ms	remaining: 117ms
549:	learn: 0.0959008	total: 384ms	remaining: 116ms
550:	learn: 0.0957259	total: 385ms	remaining: 115ms
551:	learn: 0.0955662	total: 386ms	remaining: 115ms
552:	learn: 0.0954832	total: 387ms	remaining: 114ms
553:	learn: 0.0952779	total: 387ms	remaining: 113ms
554:	learn: 0.0950585	total: 388ms	remaining: 113ms
555:	learn: 0.0948993	total: 389ms	remaining: 112ms
556:	learn: 0.0946481	total: 389ms	remaining: 111ms
557:	learn: 0.0945891	total: 390ms	remaining: 110ms
558:	learn: 0.0944068	total: 391ms	remaining: 110ms
559:	learn: 0.0942680	total: 392ms	remaining: 109ms
560:	learn: 0.0942191	total: 392ms	remaining: 108ms
561:	learn: 0.0941414	total: 393ms	remaining: 108ms
562:	learn: 0.0939233	total: 394ms	remaining: 107ms
563:	learn: 0.0937026	total: 395ms	remaining: 106ms
564:	learn: 

  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-07-18 21:57:03,725] Trial 0 finished with value: 0.4202754815210767 and parameters: {'iterations': 858, 'depth': 5, 'learning_rate': 0.29235293786020206, 'l2_leaf_reg': 2.191632904076784, 'random_strength': 5.884300540071432}. Best is trial 0 with value: 0.4202754815210767.
[I 2025-07-18 21:57:07,209] Trial 1 finished with value: 0.7468873143736007 and parameters: {'iterations': 694, 'depth': 9, 'learning_rate': 0.22645224608048867, 'l2_leaf_reg': 4.525642787150336, 'random_strength': 5.241778195142645}. Best is trial 0 with value: 0.4202754815210767.
[I 2025-07-18 21:57:11,288] Trial 2 finished with value: 0.7585611456874692 and parameters: {'iterations': 482, 'depth': 10, 'learning_rate': 0.14602407921672522, 'l2_leaf_reg': 5.681901896233288, 'random_strength': 5.567635672870703}. Best is trial 0 with value: 0.4202754815210767.
[I 2025-07-18 21:57:14,108] Trial 3 finished with value: 1.0059441469973538 and parameters: {'iterations': 525, 'depth': 9, 'learning_rate': 0.2946616

KeyboardInterrupt: 

In [47]:
import numpy as np
import pandas as pd

# STEP 1: Create weighted features for test_df
weighted_data_test = {}
for i in range(1, 6):
    for j in range(1, 11):
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        weighted_col = f"Weighted_Component{i}_Property{j}"
        weighted_data_test[weighted_col] = test_df[prop_col] * test_df[frac_col]

# STEP 2: Combine with base features
base_features = [col for col in test_df.columns if col not in [f"BlendProperty{i}" for i in range(1, 11)]]
data_with_weighted_test = pd.concat([test_df[base_features], pd.DataFrame(weighted_data_test)], axis=1)
data_with_weighted_test = data_with_weighted_test.copy()
if 'ID' in data_with_weighted_test.columns:
    data_with_weighted_test = data_with_weighted_test.drop(columns=['ID'])


# STEP 3: Scale test data using training-time scaler (RobustScaler)
scaled_test_features = scaler.transform(data_with_weighted_test)
scaled_df_test = pd.DataFrame(scaled_test_features, columns=data_with_weighted_test.columns)

# STEP 4: Predict for each target using best Optuna model + selected features
all_preds_transformed = []

for target in targets:
    print(f"🔹 Predicting for: {target}")
    
    model = best_models[target]
    selected_features = top_features[target]
    X_test_target = scaled_df_test[selected_features]
    
    y_pred_transformed = model.predict(X_test_target)
    all_preds_transformed.append(y_pred_transformed)

# STEP 5: Inverse transform the predictions to original scale
all_preds_transformed = np.column_stack(all_preds_transformed)  # Shape: (n_samples, 10)
all_preds_original = pt.inverse_transform(all_preds_transformed)

# STEP 6: Create submission DataFrame
submission = pd.DataFrame(all_preds_original, columns=targets, index=test_df.index)

# Include 'ID' column if present
if 'ID' in test_df.columns:
    submission.insert(0, 'ID', test_df['ID'].values)

# STEP 7: Save to CSV
submission.to_csv("submission4.csv", index=False)
print("✅ submission4.csv saved successfully!")

🔹 Predicting for: BlendProperty1
🔹 Predicting for: BlendProperty2
🔹 Predicting for: BlendProperty3
🔹 Predicting for: BlendProperty4
🔹 Predicting for: BlendProperty5
🔹 Predicting for: BlendProperty6
🔹 Predicting for: BlendProperty7
🔹 Predicting for: BlendProperty8
🔹 Predicting for: BlendProperty9
🔹 Predicting for: BlendProperty10
✅ submission4.csv saved successfully!
