In [1]:
import pandas as pd
import numpy as np
import joblib

# --- Model Imports ---
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# --- Sklearn Helpers ---
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

In [2]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [4]:

# --- 1. Load Data & Define Columns ---
# Both feature-engineered CSVs are read from the sibling dataset directory;
# they are expected to have been written there by earlier notebooks.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/train_featured.csv', '../dataset/test_featured.csv')
)


In [5]:

# Targets are every training column whose name starts with 'BlendProperty'.
target_columns = list(filter(lambda name: name.startswith('BlendProperty'), train.columns))
# Features are all remaining training columns apart from the 'ID' column.
feature_columns = [
    name
    for name in train.columns
    if name != 'ID' and name not in target_columns
]


In [6]:

# Split the training frame into model inputs and targets, and select the
# same feature columns from the test frame.
X_train, y_train = train[feature_columns], train[target_columns]
X_test = test[feature_columns]


In [7]:

# --- 2. Define Evaluation Functions ---
def calculate_leaderboard_score(mape_cost, ref_cost=2.72, min_score=10):
    """Convert a MAPE cost into a leaderboard-style score.

    The score is ``100 - 90 * mape_cost / ref_cost`` and is never allowed to
    drop below ``min_score``.

    Parameters
    ----------
    mape_cost : float
        Error value to convert (lower is better).
    ref_cost : float, default 2.72
        Reference cost used to scale the error. Must be positive.
    min_score : float, default 10
        Lower bound applied to the returned score.

    Returns
    -------
    float
        The scaled score, or ``min_score`` when the scaled score falls below it.

    Raises
    ------
    ValueError
        If ``ref_cost`` is not strictly positive.
    """
    # A zero or negative reference would divide by zero or flip the sign of
    # the penalty, so reject it up front.
    if ref_cost <= 0:
        raise ValueError("ref_cost must be a positive number")
    return max(min_score, 100 - (90 * mape_cost) / ref_cost)

def evaluate_model(y_true, y_pred):
    """Return the leaderboard score of ``y_pred`` measured against ``y_true``.

    The mean absolute percentage error between the (slightly offset) true
    values and the predictions is computed and then passed through
    ``calculate_leaderboard_score``.
    """
    # A constant 1e-8 is added to the true values before the MAPE is taken;
    # presumably this keeps exact-zero targets away from a zero denominator.
    shifted_truth = y_true + 1e-8
    return calculate_leaderboard_score(
        mean_absolute_percentage_error(shifted_truth, y_pred)
    )


In [11]:

# --- 3. K-Fold Cross-Validation for the Ensemble ---

N_SPLITS = 5  # number of cross-validation folds
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# One zero-initialised out-of-fold (OOF) prediction buffer per model, each with
# the same shape as the target frame. oof_svr is allocated as well, although
# no active model in this notebook writes to it.
oof_lgbm, oof_hgbm, oof_xgb, oof_cat, oof_svr = (
    np.zeros(y_train.shape) for _ in range(5)
)

# Collects one ensemble score per fold.
ensemble_scores = []


In [None]:

# Cross-validate the four-model ensemble: for every fold, fit each model on the
# training part, store its predictions for the held-out part in the matching
# OOF buffer, then score the equal-weight average of the four predictions.

# Report the configured fold count rather than a hard-coded number.
print(f"Starting {N_SPLITS}-Fold CV for 4-Model Ensemble...")
print("-" * 70)

# Empty the score list first so re-running this cell does not append a second
# batch of fold scores and skew the mean / std reported afterwards.
ensemble_scores.clear()

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"Fold {fold+1}/{N_SPLITS}")
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # --- Train and Predict with each model ---
    # Every regressor is wrapped in MultiOutputRegressor, which fits one
    # independent copy of the estimator per target column.

    # XGBoost
    print("  - Training XGBoost...")
    xgb = MultiOutputRegressor(XGBRegressor(random_state=42, objective='reg:absoluteerror',n_estimators=1000, n_jobs=-1))
    xgb.fit(X_train_fold, y_train_fold)
    preds_xgb = xgb.predict(X_val_fold)
    oof_xgb[val_idx] = preds_xgb

    # HistGradientBoostingRegressor
    print("  - Training HistGradientRegressor...")
    hgbm = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42, loss='absolute_error',max_iter=1000,learning_rate=0.05, max_leaf_nodes=31))
    hgbm.fit(X_train_fold, y_train_fold)
    preds_hgbm = hgbm.predict(X_val_fold)
    oof_hgbm[val_idx] = preds_hgbm

    # LightGBM
    print("  - Training LightGBM...")
    lgbm = MultiOutputRegressor(LGBMRegressor(random_state=42, objective='mape', n_estimators=1000, verbose=-1, n_jobs=-1))
    lgbm.fit(X_train_fold, y_train_fold)
    preds_lgbm = lgbm.predict(X_val_fold)
    oof_lgbm[val_idx] = preds_lgbm

    # CatBoost
    print("  - Training CatBoost...")
    cat = MultiOutputRegressor(CatBoostRegressor(random_state=42, loss_function='MAPE', iterations=1000, verbose=0))
    cat.fit(X_train_fold, y_train_fold)
    preds_cat = cat.predict(X_val_fold)
    oof_cat[val_idx] = preds_cat

    # --- Ensemble and Evaluate for the Fold ---
    # Equal-weight average of the four models' held-out predictions.
    ensemble_preds_fold = (preds_lgbm + preds_hgbm + preds_xgb + preds_cat) / 4.0
    score = evaluate_model(y_val_fold, ensemble_preds_fold)
    ensemble_scores.append(score)
    print(f"  > Fold {fold+1} Ensemble Score: {score:.4f}\n")


Starting 10-Fold CV for 4-Model Ensemble...
----------------------------------------------------------------------
Fold 1/5
  - Training XGBoost...
  - Training HistGradientRegressor...
  - Training LightGBM...
  - Training CatBoost...
  > Fold 1 Ensemble Score: 21.4593

Fold 2/5
  - Training XGBoost...
  - Training HistGradientRegressor...
  - Training LightGBM...
  - Training CatBoost...
  > Fold 2 Ensemble Score: 67.2434

Fold 3/5
  - Training XGBoost...
  - Training HistGradientRegressor...
  - Training LightGBM...
  - Training CatBoost...
  > Fold 3 Ensemble Score: 46.5714

Fold 4/5
  - Training XGBoost...
  - Training HistGradientRegressor...
  - Training LightGBM...
  - Training CatBoost...
  > Fold 4 Ensemble Score: 67.4117

Fold 5/5
  - Training XGBoost...
  - Training HistGradientRegressor...
  - Training LightGBM...
  - Training CatBoost...
  > Fold 5 Ensemble Score: 71.5241



In [13]:

# --- Overall CV Score ---
# Summarise the per-fold ensemble scores: their mean and their spread
# (np.std with its default settings).
print("-" * 70, "Cross-Validation Complete.", sep="\n")
print(f"Average Ensemble Score across {N_SPLITS} folds: {np.mean(ensemble_scores):.4f}")
print(f"Std Dev of Ensemble Scores: {np.std(ensemble_scores):.4f}")


----------------------------------------------------------------------
Cross-Validation Complete.
Average Ensemble Score across 5 folds: 54.8420
Std Dev of Ensemble Scores: 18.8302


In [14]:

# Score the whole training set in one go: average the four models'
# out-of-fold predictions with equal weight, then evaluate them against
# the full target frame.
overall_oof_preds = sum((oof_hgbm, oof_xgb, oof_cat), oof_lgbm) / 4.0
overall_oof_score = evaluate_model(y_train, overall_oof_preds)
print(f"\nOverall Out-of-Fold (OOF) Ensemble Score: {overall_oof_score:.4f}")
print("-" * 70)



Overall Out-of-Fold (OOF) Ensemble Score: 54.8420
----------------------------------------------------------------------


In [None]:
# --- 4. Final Training and Prediction on Full Data ---
# Each model is refit on the complete training set. The loss functions and the
# HistGradientBoosting learning rate / leaf limit match the configuration used
# in the cross-validation loop, so the reported CV score describes the models
# that actually produce the submission.
# NOTE(review): the tree/iteration budget here is 2000 while the CV loop used
# 1000; that doubling is not covered by the CV score - confirm it is intended.

print("Training final models on ALL data...")

# Train each model on the full training data
print("- Training final LightGBM...")
final_lgbm = MultiOutputRegressor(LGBMRegressor(random_state=42, objective='mape', n_estimators=2000, verbose=-1, n_jobs=-1)).fit(X_train, y_train)

print("- Training final HistGradientRegressor...")
# learning_rate and max_leaf_nodes are set explicitly to match the CV run.
final_hgbm = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42, loss='absolute_error', max_iter=2000, learning_rate=0.05, max_leaf_nodes=31)).fit(X_train, y_train)

print("- Training final XGBoost...")
final_xgb = MultiOutputRegressor(XGBRegressor(random_state=42, objective='reg:absoluteerror', n_estimators=2000, n_jobs=-1)).fit(X_train, y_train)

print("- Training final CatBoost...")
# Uses the MAPE loss, the same loss CatBoost was cross-validated with.
final_cat = MultiOutputRegressor(CatBoostRegressor(random_state=42, loss_function='MAPE', iterations=2000, verbose=0)).fit(X_train, y_train)
print("All final models trained.")



Training final models on ALL data...
- Training final LightGBM...
- Training final HistGradientRegressor...
- Training final XGBoost...
- Training final CatBoost...
All final models trained.


In [23]:

# --- Make predictions with each final model on the test set ---
print("\nMaking predictions on the test set...")
test_preds_lgbm, test_preds_hgbm, test_preds_xgb, test_preds_cat = (
    fitted_model.predict(X_test)
    for fitted_model in (final_lgbm, final_hgbm, final_xgb, final_cat)
)

# --- Average the predictions for the final ensemble result ---
# Equal-weight mean of the four models' test predictions.
final_ensemble_predictions = (
    test_preds_lgbm + test_preds_hgbm + test_preds_xgb + test_preds_cat
) / 4.0

# --- 5. Create Submission File ---
print("Creating submission file...")
# Start from the test IDs and attach one prediction column per target.
submission_df = test.loc[:, ['ID']].copy()
submission_df[target_columns] = final_ensemble_predictions

# NOTE(review): the file name says "5-model" but four models are averaged
# above - confirm whether the name should be updated.
submission_filename = '../submission/5-model_ensemble_submission.csv'
submission_df.to_csv(path_or_buf=submission_filename, index=False)

print(f"Submission file created successfully: {submission_filename}")


Making predictions on the test set...
Creating submission file...
Submission file created successfully: ../submission/5-model_ensemble_submission.csv
