In [1]:
!pip install -q sentence-transformers hnswlib timm torchvision lightgbm xgboost tqdm
!pip install -q optuna
import optuna
import os
import time
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from PIL import Image
import torch
import torch.nn as nn  # Added for fusion network
import torchvision.transforms as T
import timm
import hnswlib
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBRegressor
from lightgbm import early_stopping, log_evaluation
import lightgbm as lgb
import joblib
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.1 MB/s[0m eta 

2025-10-13 10:09:49.744556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760350189.936136      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760350189.988913      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


In [3]:
DATA_ROOT = '/kaggle/input/amazonmlc-testdata'  
train_path = os.path.join(DATA_ROOT, 'train.csv')
test_path  = os.path.join(DATA_ROOT, 'test.csv')

# writable cache & outputs on Kaggle
cache_folder = '/kaggle/working/image_embeddings_cache'
os.makedirs(cache_folder, exist_ok=True)
os.makedirs('/kaggle/working/models', exist_ok=True)

# load
print("Listing /kaggle/input contents:")
for d,_,files in os.walk('/kaggle/input'):
    print(d, len(files))

Listing /kaggle/input contents:
/kaggle/input 0
/kaggle/input/amazonmlc-testdata 2


In [4]:
train = pd.read_csv(train_path, sep=',', quotechar='"', engine='python', on_bad_lines='skip')
test  = pd.read_csv(test_path,  sep=',', quotechar='"', engine='python', on_bad_lines='skip')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (75000, 4)
Test shape: (75000, 3)


In [5]:
def load_image_from_url(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            return img
        except:
            if attempt < max_retries - 1:
                time.sleep(1)
                continue
    return None 

In [6]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower().replace('\n',' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

train['clean_text'] = train['catalog_content'].apply(clean_text)
test['clean_text'] = test['catalog_content'].apply(clean_text)

In [7]:
clip_low, clip_high = 1, 100000
train['price_clipped'] = train['price'].clip(lower=clip_low, upper=clip_high)
y_train = np.log1p(train['price_clipped'].values)  # Use clipped price for model target

In [8]:
def prepare_fine_tune_examples(df, threshold=0.1):
    from sentence_transformers import InputExample
    examples = []
    prices = df['price_clipped'].values
    texts = df['clean_text'].tolist()
    n = len(df)
    for i in range(n):
        for j in range(i + 1, n):
            same_price = abs(prices[i] - prices[j]) / max(prices[i], prices[j]) < threshold
            label = 1.0 if same_price else 0.0
            examples.append(InputExample(texts=[texts[i], texts[j]], label=label))
            if len(examples) > 150000:
                return examples
    return examples

In [9]:
fine_tune_examples = prepare_fine_tune_examples(train)

# Add: Fine-tuning the sentence transformer model
fine_tune_model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_finetune = SentenceTransformer(fine_tune_model_name, device=device)
train_loader = DataLoader(fine_tune_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model_finetune)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
import os
os.environ['WANDB_DISABLED'] = 'true'

print("Starting fine-tuning...")
model_finetune.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True
)

Starting fine-tuning...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0695
1000,0.0621
1500,0.0607
2000,0.0628
2500,0.0604
3000,0.0574
3500,0.0607
4000,0.0595
4500,0.0615
5000,0.0584


In [11]:
# Replace: Encoding text embeddings using the fine-tuned model
print("Encoding using fine-tuned model...")
X_train_text = model_finetune.encode(train['clean_text'].tolist(),
                                    batch_size=32, show_progress_bar=True, convert_numpy=True)
X_test_text = model_finetune.encode(test['clean_text'].tolist(),
                                   batch_size=32, show_progress_bar=True, convert_numpy=True)
print("Text embeddings shape:", X_train_text.shape)

Encoding using fine-tuned model...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Text embeddings shape: (75000, 384)


In [12]:
# Extract numeric value and unit
def extract_value_unit(text):
    value, unit = None, None
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try: value = float(value_match.group(1))
        except: value = None
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match: unit = unit_match.group(1).lower()
    return value, unit

train[['value_num', 'unit']] = train['catalog_content'].apply(lambda x: pd.Series(extract_value_unit(x)))
test[['value_num', 'unit']] = test['catalog_content'].apply(lambda x: pd.Series(extract_value_unit(x)))

In [13]:
# Extract brand
def extract_brand(text):
    if pd.isna(text): return None
    brand_match = re.match(r'([^,]+)', text)
    if brand_match: return brand_match.group(1).strip().lower()
    return None

train['brand'] = train['catalog_content'].apply(extract_brand)
test['brand'] = test['catalog_content'].apply(extract_brand)

In [14]:
#Categorical features
TOP_K_BRANDS = 50
top_brands = train['brand'].value_counts().nlargest(TOP_K_BRANDS).index.tolist()
train_top_brands = train['brand'].apply(lambda x: x if x in top_brands else 'other')
test_top_brands = test['brand'].apply(lambda x: x if x in top_brands else 'other')

train_brands_onehot = pd.get_dummies(train_top_brands, prefix='brand', dummy_na=True)
test_brands_onehot = pd.get_dummies(test_top_brands, prefix='brand', dummy_na=True)
test_brands_onehot = test_brands_onehot.reindex(columns=train_brands_onehot.columns, fill_value=0)

In [15]:
# Target mean encoding with blending for rare
brand_target_mean = train.groupby('brand')['price'].mean().to_dict()
global_mean = train['price'].mean()
alpha = 0.7
train['brand_target'] = train['brand'].apply(lambda x: alpha*brand_target_mean.get(x, global_mean) + (1-alpha)*global_mean)
test['brand_target']  = test['brand'].apply(lambda x: alpha*brand_target_mean.get(x, global_mean) + (1-alpha)*global_mean)

In [16]:
# Units
TOP_K_UNITS = 20
top_units = train['unit'].value_counts().nlargest(TOP_K_UNITS).index.tolist()
train_top_units = train['unit'].apply(lambda x: x if x in top_units else 'other')
test_top_units = test['unit'].apply(lambda x: x if x in top_units else 'other')

train_units_onehot = pd.get_dummies(train_top_units, prefix='unit', dummy_na=True)
test_units_onehot = pd.get_dummies(test_top_units, prefix='unit', dummy_na=True)
test_units_onehot = test_units_onehot.reindex(columns=train_units_onehot.columns, fill_value=0)

unit_target_mean = train.groupby('unit')['price'].mean().to_dict()
train['unit_target'] = train['unit'].apply(lambda x: alpha*unit_target_mean.get(x, global_mean) + (1-alpha)*global_mean)
test['unit_target'] = test['unit'].apply(lambda x: alpha*unit_target_mean.get(x, global_mean) + (1-alpha)*global_mean)

In [17]:
# Structured arrays
X_train_cat = np.hstack([train_brands_onehot.values, train['brand_target'].values.reshape(-1,1),
                         train_units_onehot.values, train['unit_target'].values.reshape(-1,1)]).astype(np.float32)
X_test_cat = np.hstack([test_brands_onehot.values, test['brand_target'].values.reshape(-1,1),
                        test_units_onehot.values, test['unit_target'].values.reshape(-1,1)]).astype(np.float32)

X_train_num = train[['value_num']].fillna(0).astype(np.float32).values
X_test_num = test[['value_num']].fillna(0).astype(np.float32).values

In [18]:
def generate_ann_features(train_emb, test_emb, k=10):
    dim = train_emb.shape[1]
    index = hnswlib.Index(space='cosine', dim=dim)
    index.init_index(max_elements=train_emb.shape[0], ef_construction=200, M=50)
    index.add_items(train_emb)
    index.set_ef(50)
    # Train features
    labels_train, _ = index.knn_query(train_emb, k=k)
    ann_train = np.array([train['price'].values[labels_train[i]] for i in range(len(labels_train))])
    # Test features
    labels_test, _ = index.knn_query(test_emb, k=k)
    ann_test = np.array([train['price'].values[labels_test[i]] for i in range(len(labels_test))])
    return ann_train, ann_test

from sklearn.model_selection import KFold

def generate_ann_features_cv(embeddings, targets, test_emb, n_splits=5, k=10):
    dim = embeddings.shape[1]
    ann_train = np.zeros((embeddings.shape[0], k))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr_idx, val_idx in kf.split(embeddings):
        index = hnswlib.Index(space='cosine', dim=dim)
        index.init_index(max_elements=len(tr_idx), ef_construction=200, M=50)
        index.add_items(embeddings[tr_idx])
        index.set_ef(50)
        labels, _ = index.knn_query(embeddings[val_idx], k=k)
        ann_train[val_idx] = np.array([targets[tr_idx][idxs] for idxs in labels])

    # build one final ANN for test using all train embeddings
    index = hnswlib.Index(space='cosine', dim=dim)
    index.init_index(max_elements=len(embeddings), ef_construction=200, M=50)
    index.add_items(embeddings)
    index.set_ef(50)
    labels_test, _ = index.knn_query(test_emb, k=k)
    ann_test = np.array([targets[idxs] for idxs in labels_test])

    return ann_train, ann_test

ANN_train, ANN_test = generate_ann_features_cv(X_train_text, train['price'].values, X_test_text, n_splits=5, k=10)
print("ANN features shape:", ANN_train.shape)

ANN features shape: (75000, 10)


In [19]:
transform = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

In [20]:
class FeatureFusionNet(nn.Module):
    def __init__(self, txt_dim, img_dim, cat_num_dim):
        super().__init__()
        self.txt_fc = nn.Sequential(nn.Linear(txt_dim, 256), nn.ReLU(), nn.Dropout(0.2))
        self.img_fc = nn.Sequential(nn.Linear(img_dim, 256), nn.ReLU(), nn.Dropout(0.2))
        self.final_fc = nn.Sequential(
            nn.Linear(256 + 256 + cat_num_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
    def forward(self, txt_feat, img_feat, cat_num_feat):
        t = self.txt_fc(txt_feat)
        i = self.img_fc(img_feat)
        x = torch.cat([t, i, cat_num_feat], dim=1)
        return self.final_fc(x)

# Prepare tensor features for training fusion net (example code placeholder)
# Convert features to PyTorch tensors:
X_train_cat_num = torch.tensor(np.hstack([X_train_num, X_train_cat]), dtype=torch.float32).to(device)
X_test_cat_num = torch.tensor(np.hstack([X_test_num, X_test_cat]), dtype=torch.float32).to(device)
X_train_text_t = torch.tensor(X_train_text, dtype=torch.float32).to(device)
X_test_text_t = torch.tensor(X_test_text, dtype=torch.float32).to(device)
X_train_img_t = torch.tensor(X_train_img, dtype=torch.float32).to(device)
X_test_img_t = torch.tensor(X_test_img, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1,1).to(device)
# Instantiate model


NameError: name 'X_train_img' is not defined

In [None]:
# Train fusion model with CV to avoid data leakage
print("Training fusion model with 5-fold CV...")
fusion_train_preds = np.zeros(len(train))
fusion_test_preds_list = []

kf_fusion = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (tr_idx, val_idx) in enumerate(kf_fusion.split(X_train_text)):
    print(f"Fusion model fold {fold+1}/5")
    
    # Prepare fold tensors
    X_txt_tr = torch.tensor(X_train_text[tr_idx], dtype=torch.float32).to(device)
    X_img_tr = torch.tensor(X_train_img[tr_idx], dtype=torch.float32).to(device)
    X_cat_tr = torch.tensor(np.hstack([X_train_num[tr_idx], X_train_cat[tr_idx]]), dtype=torch.float32).to(device)
    y_tr_fold = torch.tensor(y_train[tr_idx], dtype=torch.float32).view(-1,1).to(device)
    
    X_txt_val = torch.tensor(X_train_text[val_idx], dtype=torch.float32).to(device)
    X_img_val = torch.tensor(X_train_img[val_idx], dtype=torch.float32).to(device)
    X_cat_val = torch.tensor(np.hstack([X_train_num[val_idx], X_train_cat[val_idx]]), dtype=torch.float32).to(device)
    
    # Create DataLoader for batching
    from torch.utils.data import TensorDataset, DataLoader
    train_dataset = TensorDataset(X_txt_tr, X_img_tr, X_cat_tr, y_tr_fold)
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    
    # Create and train fold-specific fusion model
    fold_fusion_model = FeatureFusionNet(txt_dim=X_train_text.shape[1], 
                                         img_dim=X_train_img.shape[1], 
                                         cat_num_dim=X_train_cat.shape[1]).to(device)
    optimizer = torch.optim.AdamW(fold_fusion_model.parameters(), lr=1e-3, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
    loss_fn = nn.MSELoss()
    
    fold_fusion_model.train()
    best_val_loss = float('inf')
    patience = 10
    patience_counter = 0
    best_weights = None
    
    for epoch in range(200):
        # Training loop with batching
        epoch_loss = 0
        for batch_txt, batch_img, batch_cat, batch_y in train_loader:
            optimizer.zero_grad()
            preds = fold_fusion_model(batch_txt, batch_img, batch_cat)
            loss = loss_fn(preds, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        scheduler.step()

        # Validation every 5 epochs
        if epoch % 5 == 0:
            fold_fusion_model.eval()
            with torch.no_grad():
                val_preds = fold_fusion_model(X_txt_val, X_img_val, X_cat_val)
                y_val_fold = torch.tensor(y_train[val_idx], dtype=torch.float32).view(-1,1).to(device)
                val_loss = loss_fn(val_preds, y_val_fold).item()
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_weights = fold_fusion_model.state_dict().copy()
            else:
                patience_counter += 1
            
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                if best_weights is not None:
                    fold_fusion_model.load_state_dict(best_weights)
                break
        
            fold_fusion_model.train()
    
    # Get out-of-fold predictions for validation indices
    fold_fusion_model.eval()
    with torch.no_grad():
        val_preds = fold_fusion_model(X_txt_val, X_img_val, X_cat_val).cpu().numpy().flatten()
        fusion_train_preds[val_idx] = val_preds
        
        # Predict on test set and save for averaging
        X_test_cat_num = torch.tensor(np.hstack([X_test_num, X_test_cat]), dtype=torch.float32).to(device)
        X_test_text_t = torch.tensor(X_test_text, dtype=torch.float32).to(device)
        X_test_img_t = torch.tensor(X_test_img, dtype=torch.float32).to(device)
        
        test_preds_fold = fold_fusion_model(X_test_text_t, X_test_img_t, X_test_cat_num).cpu().numpy().flatten()
        fusion_test_preds_list.append(test_preds_fold)

In [None]:
# Average test predictions across all folds
fusion_test_preds = np.mean(fusion_test_preds_list, axis=0)

# Reshape for concatenation
fusion_train_preds_reshaped = fusion_train_preds.reshape(-1, 1)
fusion_test_preds_reshaped = fusion_test_preds.reshape(-1, 1)


In [None]:
X_train_full = np.hstack([X_train_text, ANN_train, X_train_num, X_train_cat, X_train_img, fusion_train_preds_reshaped])
X_test_full  = np.hstack([X_test_text, ANN_test, X_test_num, X_test_cat, X_test_img, fusion_test_preds_reshaped])
print("Full feature matrix shape:", X_train_full.shape)

In [None]:
# ===== OPTUNA HYPERPARAMETER OPTIMIZATION =====
from sklearn.metrics import mean_squared_error

def objective(trial):
    """Optuna objective function to minimize validation RMSE"""
    
    # Suggest hyperparameters for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 500, 3000),
        'max_depth': trial.suggest_int('xgb_max_depth', 5, 12),
        'learning_rate': trial.suggest_float('xgb_lr', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('xgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('xgb_lambda', 0.1, 5.0),
        'reg_alpha': trial.suggest_float('xgb_alpha', 0.0, 2.0),
        'tree_method': 'gpu_hist',
        'random_state': 42
    }
    
    # Suggest hyperparameters for LightGBM
    lgb_params_trial = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('lgb_lr', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 128, 1024),
        'max_depth': trial.suggest_int('lgb_max_depth', 5, 15),
        'feature_fraction': trial.suggest_float('lgb_feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('lgb_bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('lgb_bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lgb_lambda_l1', 0.0, 2.0),
        'lambda_l2': trial.suggest_float('lgb_lambda_l2', 0.0, 5.0),
        'min_data_in_leaf': trial.suggest_int('lgb_min_data_in_leaf', 10, 50),
        'device': 'gpu',
        'seed': 42,
        'verbose': -1
    }
    
    # Run 3-fold CV to evaluate these hyperparameters
    kf_optuna = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = []
    
    for fold, (tr_idx, val_idx) in enumerate(kf_optuna.split(X_train_full)):
        X_tr, X_val = X_train_full[tr_idx], X_train_full[val_idx]
        y_tr, y_val = y_train[tr_idx], y_train[val_idx]
        
        # Train XGBoost
        xgb_model = XGBRegressor(**xgb_params)
        xgb_model.fit(X_tr, y_tr, 
                      eval_set=[(X_val, y_val)], 
                      early_stopping_rounds=50,
                      verbose=False)
        
        # Train LightGBM
        lgb_train_data = lgb.Dataset(X_tr, y_tr)
        lgb_val_data = lgb.Dataset(X_val, y_val)
        lgb_model = lgb.train(lgb_params_trial, 
                             lgb_train_data,
                             num_boost_round=2000,
                             valid_sets=[lgb_val_data],
                             callbacks=[early_stopping(stopping_rounds=50, verbose=False)])
        
        # Ensemble predictions
        pred_val = 0.5 * (xgb_model.predict(X_val) + lgb_model.predict(X_val))
        
        # Calculate RMSE
        rmse = mean_squared_error(np.expm1(y_val), np.expm1(pred_val), squared=False)
        fold_scores.append(rmse)
    
    # Return average RMSE across folds
    return np.mean(fold_scores)


# Run optimization
print("\n===== STARTING OPTUNA HYPERPARAMETER OPTIMIZATION =====")
study = optuna.create_study(direction='minimize', study_name='price_prediction')
study.optimize(objective, n_trials=30, show_progress_bar=True)  # 30 trials ~30 mins

# Get best parameters
best_params = study.best_params
print("\n===== BEST HYPERPARAMETERS FOUND =====")
print(f"Best validation RMSE: {study.best_value:.4f}")
print(best_params)

# Extract best params for XGB and LGB
best_xgb_params = {
    'n_estimators': best_params['xgb_n_estimators'],
    'max_depth': best_params['xgb_max_depth'],
    'learning_rate': best_params['xgb_lr'],
    'subsample': best_params['xgb_subsample'],
    'colsample_bytree': best_params['xgb_colsample'],
    'reg_lambda': best_params['xgb_lambda'],
    'reg_alpha': best_params['xgb_alpha'],
    'tree_method': 'gpu_hist',
    'random_state': 42
}

best_lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': best_params['lgb_lr'],
    'num_leaves': best_params['lgb_num_leaves'],
    'max_depth': best_params['lgb_max_depth'],
    'feature_fraction': best_params['lgb_feature_fraction'],
    'bagging_fraction': best_params['lgb_bagging_fraction'],
    'bagging_freq': best_params['lgb_bagging_freq'],
    'lambda_l1': best_params['lgb_lambda_l1'],
    'lambda_l2': best_params['lgb_lambda_l2'],
    'min_data_in_leaf': best_params['lgb_min_data_in_leaf'],
    'device': 'gpu',
    'seed': 42,
    'verbose': -1
}
print("===== OPTUNA OPTIMIZATION COMPLETE =====\n")

In [None]:
from sklearn.metrics import mean_squared_error

def smape(y_true, y_pred):
    return 100*np.mean(np.abs(y_pred - y_true)/((np.abs(y_true)+np.abs(y_pred))/2))

In [None]:
# Train-validation split
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train_full))
test_preds = np.zeros(len(X_test_full))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_full)):
    print(f"\n===== Fold {fold+1} =====")
    X_tr, X_val = X_train_full[tr_idx], X_train_full[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    lgb_params = best_lgb_params.copy()
    lgb_params['seed'] = fold

    xgb = XGBRegressor(**best_xgb_params)
    xgb.set_params(random_state=fold)
    xgb.fit(X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=100,
            verbose=False)
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_val = lgb.Dataset(X_val, y_val)
    lgbm = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_val],
        callbacks=[early_stopping(stopping_rounds=100)]
    )
    pred_val = 0.5*(xgb.predict(X_val) + lgbm.predict(X_val))
    oof_preds[val_idx] = np.expm1(pred_val)
    test_preds += 0.5*(np.expm1(xgb.predict(X_test_full)) + np.expm1(lgbm.predict(X_test_full)))/kf.n_splits
        # Overfitting diagnostics
    y_tr_pred_xgb = np.expm1(xgb.predict(X_tr))
    y_val_pred_xgb = np.expm1(xgb.predict(X_val))

    y_tr_pred_lgb = np.expm1(lgbm.predict(X_tr))
    y_val_pred_lgb = np.expm1(lgbm.predict(X_val))

    y_tr_pred = 0.5 * (y_tr_pred_xgb + y_tr_pred_lgb)
    y_val_pred = 0.5 * (y_val_pred_xgb + y_val_pred_lgb)

    y_tr_true = np.expm1(y_tr)
    y_val_true = np.expm1(y_val)

    train_rmse = mean_squared_error(y_tr_true, y_tr_pred, squared=False)
    val_rmse = mean_squared_error(y_val_true, y_val_pred, squared=False)

    print(f"\nFold {fold+1} Overfitting Diagnostic")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Validation RMSE: {val_rmse:.4f}")
    print(f"Gap: {val_rmse - train_rmse:.4f}")



In [None]:
val_smape = smape(np.expm1(y_train), oof_preds)
print(f"Overall 5-Fold SMAPE: {val_smape:.2f}%")
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': test_preds})
submission.to_csv('/kaggle/working/submission_5fold.csv', index=False)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor

# Define base learners with full params
base_learners = [
    ('xgb', XGBRegressor(**best_xgb_params)),
    ('lgb', lgb.LGBMRegressor(
        num_leaves=best_lgb_params['num_leaves'],
        learning_rate=best_lgb_params['learning_rate'],
        n_estimators=best_params['xgb_n_estimators'],  # reuse XGB n_estimators
        max_depth=best_lgb_params['max_depth'],
        device='gpu',
        random_state=42
    ))
]

In [None]:
meta_learner = Ridge(alpha=1.0)

stacking_regressor = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5
)

In [None]:
print("Training stacking regressor...")
stacking_regressor.fit(X_train_full, y_train)

In [None]:
y_test_pred_stacking = np.expm1(stacking_regressor.predict(X_test_full))

In [None]:
submission_stacking = pd.DataFrame({'sample_id': test['sample_id'], 'price': y_test_pred_stacking})
submission_path_stacking = '/kaggle/working/test_out.csv'
submission_stacking.to_csv(submission_path_stacking, index=False)
print("Stacking submission saved to:", submission_path_stacking)

# === Save models ===
joblib.dump(stacking_regressor, '/kaggle/working/models/stacking_regressor.pkl')
print("Models saved to /kaggle/working/models/")