In [None]:
# ==============================================================================
#                 ROBUST 3-GBDT PRICE FORECASTER (v2 - WITH SMAPE)
#
#   Author: [Your Name]
#   Strategy: An ensemble of three GBDT models. This version includes a
#             train-validation split to calculate a reliable SMAPE score before submission.
# ==============================================================================

# %% [markdown]
# ## 1. Environment Setup & Initializations
# Install and import necessary libraries for our forecasting task.

# %%
!pip install catboost lightgbm xgboost scikit-learn -q

import pandas as pd
import numpy as np
import re
import os
import gc # Memory Management
import lightgbm as lgb
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Model Imports ---
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# --- Google Drive: mount first, every path below lives under the mount point ---
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Project folders on Drive: inputs are read from DATA_PATH, generated files
# are written to SUBMISSIONS_PATH.
BASE_PATH = "/content/drive/MyDrive/ML_Challenge_2025"
DATA_PATH, SUBMISSIONS_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ("data", "submissions")
)
os.makedirs(SUBMISSIONS_PATH, exist_ok=True)  # no-op when the folder already exists

# --- Tunable settings ---
RANDOM_SEED = 2024       # shared by the train/validation split and all three models
N_FEATURES_TFIDF = 2200  # vocabulary cap handed to TfidfVectorizer

# %% [markdown]
# ## 2. SMAPE Metric and Feature Engineering

# %%
def smape(y_true, y_pred):
    """Calculates the Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    Each element contributes |y_pred - y_true| / ((|y_true| + |y_pred|) / 2);
    the mean of those ratios is multiplied by 100.

    Args:
        y_true: Array-like of ground-truth values (list, ndarray or Series).
        y_pred: Array-like of predictions, same length as `y_true`.

    Returns:
        The SMAPE as a float percentage.
    """
    # Compare strictly by position: np.asarray drops any pandas index, so two
    # Series carrying different indices are not silently re-aligned (which
    # would yield NaN), and plain Python lists are accepted as well.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    # Add a small epsilon to the denominator to avoid division by zero
    return np.mean(numerator / (denominator + 1e-9)) * 100

def create_features(df):
    """Return a copy of `df` with simple text-derived numeric features added.

    Missing `catalog_content` values are replaced by the string 'missing',
    then three columns are appended:

    * `pack_quantity` - pack size parsed from the text (1 when none is found
      or the parsed number falls outside 2..100)
    * `content_length_chars` - character count of the text
    * `content_word_count` - number of whitespace-separated tokens

    Args:
        df: DataFrame with a `catalog_content` column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Explicit quantity phrases such as "pack of 6" / "set of 12". These are
    # checked first so that an unrelated number earlier in the text
    # (e.g. "500ml") cannot shadow the real pack size.
    explicit_pack_re = re.compile(r'\b(?:pack of|set of|pk of)\s*(\d{1,3})(?!\d)')
    # Original, looser pattern kept as a fallback: 1-3 digits directly after
    # one of the listed tokens or after any whitespace.
    generic_re = re.compile(r'(?:pack of|set of|pk of|of|x|\s)(\d{1,3})')

    def get_ipq(text):
        """Parse the item pack quantity from `text`, defaulting to 1."""
        lowered = text.lower()
        for explicit in explicit_pack_re.finditer(lowered):
            num = int(explicit.group(1))
            if 1 < num <= 100:
                return num
        match = generic_re.search(lowered)
        if match:
            num = int(match.group(1))
            return num if 1 < num <= 100 else 1
        return 1

    processed_df = df.copy()
    processed_df['catalog_content'] = processed_df['catalog_content'].fillna('missing')
    processed_df['pack_quantity'] = processed_df['catalog_content'].apply(get_ipq)
    processed_df['content_length_chars'] = processed_df['catalog_content'].str.len()
    processed_df['content_word_count'] = processed_df['catalog_content'].str.split().str.len()
    return processed_df

# Read the raw train/test CSVs from the data folder.
print("Loading source datasets...")
train_csv_path = f"{DATA_PATH}/train.csv"
test_csv_path = f"{DATA_PATH}/test.csv"
source_train_df_full = pd.read_csv(train_csv_path)
source_test_df = pd.read_csv(test_csv_path)

# Run both frames through the same feature function so their columns match.
print("Engineering features for all data...")
train_featured_full, test_featured = map(
    create_features, (source_train_df_full, source_test_df)
)

# %% [markdown]
# ## 3. Train-Validation Split
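# We split the full training data to create a held-out validation set for performance evaluation.
# Note: the same validation set also drives early stopping in Section 6, so the reported
# score is mildly optimistic compared with a truly unseen set.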

# %%
# Hold out 20% of the engineered training rows for validation; the fixed seed
# keeps the partition repeatable between runs.
train_featured, val_featured = train_test_split(
    train_featured_full, test_size=0.2, random_state=RANDOM_SEED
)

print(
    f"Training set size: {len(train_featured)}",
    f"Validation set size: {len(val_featured)}",
    sep="\n",
)

# %% [markdown]
# ## 4. Text-to-Vector Transformation
# The TF-IDF vectorizer is now FIT ONLY on the new training split to prevent data leakage.

# %%
print(f"Vectorizing text with TF-IDF using top {N_FEATURES_TFIDF} features...")
tfidf_vec = TfidfVectorizer(
    max_features=N_FEATURES_TFIDF,
    stop_words='english',
    ngram_range=(1, 2),
)

# The vectorizer is fitted on the training split only; the validation and
# test text is merely transformed with that fitted vectorizer.
train_text_features = tfidf_vec.fit_transform(train_featured['catalog_content'])
val_text_features, test_text_features = (
    tfidf_vec.transform(frame['catalog_content'])
    for frame in (val_featured, test_featured)
)

# Densify each sparse matrix into a DataFrame whose columns are the TF-IDF terms.
tfidf_columns = tfidf_vec.get_feature_names_out()
tfidf_train, tfidf_val, tfidf_test = (
    pd.DataFrame(sparse_block.toarray(), columns=tfidf_columns)
    for sparse_block in (train_text_features, val_text_features, test_text_features)
)

# %% [markdown]
# ## 5. Final Feature Assembly

# %%
numerical_cols = ['pack_quantity', 'content_length_chars', 'content_word_count']

# Create final matrices for train, validation, and test sets.
# The numeric columns are re-indexed to 0..n-1 so that the column-wise concat
# lines up row-for-row with the freshly built TF-IDF frames.
X_train = pd.concat([train_featured[numerical_cols].reset_index(drop=True), tfidf_train], axis=1)
X_val = pd.concat([val_featured[numerical_cols].reset_index(drop=True), tfidf_val], axis=1)
X_test = pd.concat([test_featured[numerical_cols].reset_index(drop=True), tfidf_test], axis=1)

# Create log-transformed target variables for train and validation.
# The split frames still carry their pre-split row labels; resetting the index
# here gives the targets the same 0..n-1 index as the feature matrices, so any
# index-aligned pandas operation between X and y stays row-consistent.
y_train_log = np.log1p(train_featured['price']).reset_index(drop=True)
y_val_log = np.log1p(val_featured['price']).reset_index(drop=True)

# Cheap invariants: one target per row and identical columns across all splits.
assert len(X_train) == len(y_train_log), "train features/targets differ in length"
assert len(X_val) == len(y_val_log), "validation features/targets differ in length"
assert list(X_train.columns) == list(X_val.columns) == list(X_test.columns), \
    "train/validation/test feature columns differ"

print(f"Final Training Matrix Shape: {X_train.shape}")
print(f"Final Validation Matrix Shape: {X_val.shape}")
# Release the intermediates that are no longer needed to lower peak memory.
del train_featured_full, train_featured, val_featured, tfidf_train, tfidf_val, tfidf_test
gc.collect()

# %% [markdown]
# ## 6. Model Training Pipeline

# %%
def train_all_models(X_tr, y_tr, X_vl, y_vl):
    """Fit the CatBoost, LightGBM and XGBoost regressors and return them in a dict.

    Each model is trained on (X_tr, y_tr) and evaluated on (X_vl, y_vl) during
    training, with early stopping after 100 rounds.

    Args:
        X_tr: Training feature DataFrame.
        y_tr: Training targets.
        X_vl: Validation feature DataFrame (evaluation set for early stopping).
        y_vl: Validation targets.

    Returns:
        dict with keys 'catboost', 'lightgbm' and 'xgboost' mapping to the
        fitted estimators. The XGBoost model is fitted on copies of the
        frames whose column names have non-alphanumeric characters replaced
        by '_', so callers must rename columns the same way before predicting.
    """
    def _alnum_columns(frame):
        """Copy `frame`, replacing each non-alphanumeric character in its column names by '_'."""
        renamed = frame.copy()
        renamed.columns = [
            "".join(ch if ch.isalnum() else "_" for ch in str(col))
            for col in renamed.columns
        ]
        return renamed

    models = {}

    # --- CatBoost ---
    print("Training CatBoost Regressor...")
    model_catboost = CatBoostRegressor(
        iterations=2500,
        learning_rate=0.045,
        depth=8,
        loss_function='RMSE',
        random_seed=RANDOM_SEED,
        verbose=500,
        early_stopping_rounds=100,
    )
    model_catboost.fit(X_tr, y_tr, eval_set=(X_vl, y_vl))
    models['catboost'] = model_catboost
    print("CatBoost training complete.")

    # --- LightGBM ---
    print("\nTraining LightGBM Regressor...")
    model_lightgbm = LGBMRegressor(
        n_estimators=2500,
        learning_rate=0.045,
        num_leaves=31,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )
    lightgbm_callbacks = [
        lgb.early_stopping(100, verbose=False),
        lgb.log_evaluation(period=500),
    ]
    model_lightgbm.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)], callbacks=lightgbm_callbacks)
    models['lightgbm'] = model_lightgbm
    print("LightGBM training complete.")

    # --- XGBoost (trained on frames with sanitized column names) ---
    xgb_train_frame = _alnum_columns(X_tr)
    xgb_valid_frame = _alnum_columns(X_vl)
    print("\nTraining XGBoost Regressor...")
    model_xgboost = XGBRegressor(
        n_estimators=2500,
        learning_rate=0.045,
        max_depth=7,
        objective='reg:squarederror',
        random_state=RANDOM_SEED,
        n_jobs=-1,
        eval_metric='rmse',
        early_stopping_rounds=100,
    )
    model_xgboost.fit(xgb_train_frame, y_tr, eval_set=[(xgb_valid_frame, y_vl)], verbose=500)
    models['xgboost'] = model_xgboost
    print("XGBoost training complete.")

    return models

# Fit all three models on the training split; the validation split is passed
# as the evaluation set that each model monitors for early stopping.
trained_models = train_all_models(X_train, y_train_log, X_val, y_val_log)

# %% [markdown]
# ## 7. Validation Performance and Prediction

# %%
print("\n--- Performance Evaluation on Validation Set ---")


def _with_alnum_columns(frame):
    """Return a copy of `frame` whose column names contain only alphanumerics and '_'.

    This is the same renaming that train_all_models applies to the XGBoost
    training matrix, so the feature names at predict time match those at fit time.
    """
    renamed = frame.copy()
    renamed.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in renamed.columns]
    return renamed


# Predict on the validation set
preds_val_log_cat = trained_models['catboost'].predict(X_val)
preds_val_log_lgb = trained_models['lightgbm'].predict(X_val)
X_val_xgb = _with_alnum_columns(X_val)
preds_val_log_xgb = trained_models['xgboost'].predict(X_val_xgb)

# Ensemble the validation predictions (unweighted mean of the three models, in log space)
ensemble_preds_val_log = np.mean([preds_val_log_cat, preds_val_log_lgb, preds_val_log_xgb], axis=0)

# Convert predictions and true values back to original price scale
y_val_true_price = np.expm1(y_val_log)
preds_val_price = np.expm1(ensemble_preds_val_log)
# Apply the same negative-price floor that is applied to the submission below,
# so the reported score describes the values that would actually be submitted.
preds_val_price[preds_val_price < 0] = 0.01

# Calculate and print the SMAPE score
validation_smape = smape(y_val_true_price, preds_val_price)
print(f"\nVALIDATION SMAPE SCORE: {validation_smape:.4f}%")
print("-------------------------------------------------")

# --- Generate Predictions for the Test Set ---
print("\nGenerating predictions for the final submission...")
preds_test_log_cat = trained_models['catboost'].predict(X_test)
preds_test_log_lgb = trained_models['lightgbm'].predict(X_test)
X_test_xgb = _with_alnum_columns(X_test)
preds_test_log_xgb = trained_models['xgboost'].predict(X_test_xgb)

ensemble_preds_test_log = np.mean([preds_test_log_cat, preds_test_log_lgb, preds_test_log_xgb], axis=0)
final_price_predictions = np.expm1(ensemble_preds_test_log)
# expm1 of a negative log-prediction is negative; replace such prices with a small positive value.
final_price_predictions[final_price_predictions < 0] = 0.01

# %% [markdown]
# ## 8. Submission File Generation

# %%
print("\nCreating final submission file...")

# Two-column frame: ids from the raw test CSV next to the ensemble prices.
submission_columns = {
    'sample_id': source_test_df['sample_id'],
    'price': final_price_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the CSV without the DataFrame index column.
submission_filename = 'submission_3-gbdt_ensemble_with_val.csv'
submission_path = os.path.join(SUBMISSIONS_PATH, submission_filename)
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to: {submission_path}")
print(submission_df.head())

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m119.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m723.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.9/295.9 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounting Google Drive...
Mounted at /content/drive
Loading source datasets...
Engineering features for all data...
Training set size: 60000
Validation set size: 15000
Vectorizing text with TF-IDF using top 2200 features...
Final Training Matrix Shape: (60000, 2203)
Final Validation Matrix Shape: (15000, 2203)
Training CatBoost Regressor...
0:	learn: 0.9346760	test: 0.9306880	best: 0.9306880 (0)	