In [None]:
# --- INSTALL DEPENDENCIES ---
# !pip install catboost lightgbm scikit-learn

# --- IMPORTS ---
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, train_test_split

# ===============================
# 1. LOAD & PREPROCESS TRAIN DATA
# ===============================
# Load the training table; the row identifier is not a feature, so drop it.
train_df = pd.read_csv('train.csv')
train_df = train_df.drop('id', axis=1)

# Impute numeric features (every numeric column except the target) with the
# column mean.  The result is assigned back to the frame rather than calling
# `train_df[col].fillna(..., inplace=True)`: that chained form mutates a
# temporary Series, which raises a FutureWarning on pandas 2.x and leaves the
# frame untouched once Copy-on-Write is active.
numerical_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols.remove('efficiency')
for col in numerical_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mean())

# Impute object-typed (categorical) features with their most frequent value.
categorical_cols = train_df.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

# Feature frame for CatBoost: categorical columns are kept as raw values.
X_cat = train_df.drop('efficiency', axis=1).copy()

# Feature frame for LightGBM: each categorical column is replaced by integer
# codes assigned in order of first appearance in the training data.
X_lgb = train_df.drop('efficiency', axis=1).copy()
for col in categorical_cols:
    X_lgb[col] = pd.factorize(X_lgb[col])[0]

# Regression target.
y = train_df['efficiency']

# =========================
# 2. TRAIN BASE MODELS
# =========================
# Hold out 30% of the rows for validation.  A single call splits both feature
# frames and the target with the same shuffled row indices.
(X_train_cat, X_val_cat,
 X_train_lgb, X_val_lgb,
 y_train, y_val) = train_test_split(
    X_cat, X_lgb, y, test_size=0.3, random_state=42
)

# --- CatBoost ---
# Categorical columns are passed to CatBoost by positional index.
cat_features = list(map(X_cat.columns.get_loc, categorical_cols))
cat_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=0,
    random_state=42,
)
cat_model = CatBoostRegressor(**cat_params)
# The hold-out split is the eval set that drives early stopping (50 rounds).
cat_model.fit(
    X_train_cat,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val_cat, y_val),
    early_stopping_rounds=50,
)

# --- LightGBM ---
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
lgbm_model = LGBMRegressor(**lgbm_params)
# Early stopping after 50 rounds on the hold-out split; log_evaluation(0)
# silences the per-iteration log.
lgbm_callbacks = [early_stopping(50), log_evaluation(0)]
lgbm_model.fit(
    X_train_lgb,
    y_train,
    eval_set=[(X_val_lgb, y_val)],
    callbacks=lgbm_callbacks,
)

# =============================
# 3. TRAIN META MODEL (RIDGE)
# =============================
# Base-model predictions on the hold-out rows become the two meta-features
# (column 0: CatBoost, column 1: LightGBM).
val_pred_cat = cat_model.predict(X_val_cat)
val_pred_lgbm = lgbm_model.predict(X_val_lgb)
stacked_val = np.vstack([val_pred_cat, val_pred_lgbm]).T

# Final blender used for the test predictions: fitted on every hold-out row.
meta_model = Ridge()
meta_model.fit(stacked_val, y_val)

# Evaluate
# Scoring `meta_model` on the rows it was just fitted on would report an
# in-sample (optimistic) error.  Instead, score out-of-fold predictions: each
# row is predicted by a Ridge that never saw that row during fitting.
# NOTE: the base models used these same rows for early stopping, so the
# figure can still be slightly optimistic.
final_val_pred = cross_val_predict(Ridge(), stacked_val, y_val, cv=5)
rmse = np.sqrt(mean_squared_error(y_val, final_val_pred))
score = 100 * (1 - rmse)
print(f"\n✅ Validation RMSE: {rmse:.4f}")
print(f"✅ Score: {score:.2f}")

# =========================
# 4. LOAD & PREPROCESS TEST DATA
# =========================
# Load the test table, keeping the ids for the submission file.
test_df_original = pd.read_csv('test.csv')
test_ids = test_df_original['id']
test_df = test_df_original.drop('id', axis=1)

# Fill missing values with the statistics of the TRAINING data, so that a
# missing value means the same thing at prediction time as it did during
# training.  Results are assigned back instead of using the chained
# `test_df[col].fillna(..., inplace=True)` form, which is a no-op under pandas
# Copy-on-Write.
for col in numerical_cols:
    test_df[col] = test_df[col].fillna(train_df[col].mean())
for col in categorical_cols:
    test_df[col] = test_df[col].fillna(train_df[col].mode()[0])

# Test data for CatBoost (raw categorical values).
X_test_cat = test_df.copy()

# Test data for LightGBM.  Reuse the integer codes assigned to the training
# data: `pd.factorize` numbers categories in order of first appearance, so
# factorizing the test columns on their own would give the same category a
# different code than the one the model was trained on.  Categories that never
# occur in the training data are encoded as -1.
X_test_lgb = test_df.copy()
for col in categorical_cols:
    train_levels = pd.factorize(train_df[col])[1]
    X_test_lgb[col] = train_levels.get_indexer(X_test_lgb[col])

# =========================
# 5. PREDICT ON TEST DATA
# =========================
# Run each base model on its own version of the test features
# (CatBoost first, then LightGBM).
test_pred_cat, test_pred_lgbm = (
    model.predict(features)
    for model, features in ((cat_model, X_test_cat), (lgbm_model, X_test_lgb))
)

# One row per test sample, one column per base model, in the same column
# order used when the meta-model was fitted.
stacked_test = np.transpose(np.vstack((test_pred_cat, test_pred_lgbm)))

# Blend the base predictions with the Ridge meta-model.
final_test_preds = meta_model.predict(stacked_test)

# =========================
# 6. SAVE SUBMISSION FILE
# =========================
# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the message previously
# reported 'submission.csv' while the file was saved as 'submission11.csv').
submission_path = 'submission11.csv'

# Two-column submission: test id and predicted efficiency.
submission = pd.DataFrame({
    'id': test_ids,
    'efficiency': final_test_preds,
})
submission.to_csv(submission_path, index=False)
print(f"\n📁 Submission saved to '{submission_path}'")

# ========================================================================================
# Machine Learning Pipeline for Regression Task: Efficiency Prediction
#
# Description:
# This script implements a stacked regression model combining CatBoost, LightGBM, and Ridge
# regression to predict the 'efficiency' target variable. It includes preprocessing,
# model training, validation, prediction on test data, and preparation of the submission file.
#
# Major Steps:
# 1. Install required libraries (if needed).
# 2. Import necessary Python libraries for data handling, modeling, and evaluation.
# 3. Load and preprocess training data:
#    - Remove 'id' column as it is irrelevant for modeling.
#    - Impute missing numerical values with column means.
#    - Impute missing categorical values with column modes.
#    - Prepare two versions of feature sets:
#         a) Raw (for CatBoost which handles categorical features internally).
#         b) Encoded (for LightGBM; categorical columns are replaced by integer codes via pd.factorize).
#    - Separate the target variable 'efficiency'.
# 4. Train Base Models:
#    - Split data into training and validation sets.
#    - Train CatBoostRegressor:
#         - Specify categorical features by index.
#         - Use early stopping to prevent overfitting.
#    - Train LGBMRegressor:
#         - Use factorized categorical features.
#         - Apply early stopping and suppress verbose output.
# 5. Train Meta Model (Ridge Regression):
#    - Generate predictions from CatBoost and LightGBM on the validation set.
#    - Stack predictions as features for Ridge Regression.
#    - Fit Ridge model on stacked predictions to learn optimal combination.
#    - Evaluate the stacked model using RMSE and report a custom score, 100 * (1 - RMSE).
# 6. Load and preprocess test data:
#    - Remove 'id' column.
#    - Apply the same missing value imputation as training data.
#    - Prepare both raw and encoded feature sets for CatBoost and LightGBM respectively.
# 7. Predict on test data:
#    - Generate predictions using trained CatBoost and LightGBM models.
#    - Stack these predictions and apply the Ridge meta-model to obtain final predictions.
# 8. Save the final predictions in a CSV file formatted for submission.
#
# Note:
# - Consistent random seed (42) ensures reproducibility.
# - Early stopping enhances generalization by preventing overfitting.
# - Model stacking leverages the strengths of both CatBoost and LightGBM.
# ========================================================================================
