In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sentence_transformers import SentenceTransformer
import joblib
from tqdm import tqdm
tqdm.pandas()


# ---------------

2025-10-13 17:31:24.198784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760376684.224109     198 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760376684.231536     198 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# =========================================================
# 2️⃣ SMAPE Metric
# =========================================================
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (range 0-200).

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
        Pairs where both values are exactly zero count as a perfect match (0)
        instead of producing NaN. Empty input returns NaN.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    abs_err = 2.0 * np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    if abs_err.size == 0:
        return float("nan")

    # Divide only where the denominator is non-zero; 0/0 pairs keep the
    # pre-filled 0 so a single such row cannot turn the whole metric into NaN.
    ratio = np.divide(abs_err, scale, out=np.zeros_like(abs_err), where=scale != 0)
    return 100.0 * np.mean(ratio)

# =========================================================
# 3️⃣ Load Cleaned Data
# =========================================================
# Read the pre-cleaned train/test tables from the attached Kaggle datasets.
# NOTE(review): the recorded output of this cell echoes the `test = pd.read_csv(...)`
# line, which looks like a pandas warning (possibly a mixed-dtype DtypeWarning) —
# confirm and consider passing explicit dtypes.
train = pd.read_csv("/kaggle/input/cleaned-train-csv/cleaned_train.csv")
test = pd.read_csv("/kaggle/input/cleaned-test-csv-final/data_cleaned_test.csv")

print(f"✅ Train shape: {train.shape}")
print(f"✅ Test shape: {test.shape}")

# Regression target on the log(1 + price) scale.
y = np.log1p(train["price"].to_numpy())
# Feature frames: drop the identifier (and, for train, the target itself).
X_train = train.drop(["price", "prod_id"], axis=1)
X_test = test.drop("prod_id", axis=1)

✅ Train shape: (75000, 16)
✅ Test shape: (75004, 10)


  test = pd.read_csv("/kaggle/input/cleaned-test-csv-final/data_cleaned_test.csv")


In [8]:
# =========================================================
# 3️⃣ Text Embeddings
# =========================================================
# Directory of the MiniLM sentence-transformer model under /kaggle/input.
MODEL_PATH = "/kaggle/input/minilm-trans/miniLM"
encoder = SentenceTransformer(MODEL_PATH)

print("🔤 Generating MiniLM text embeddings...")
# One string per row: "<item_name>. <prod_desc>", with missing fields replaced by "".
train_text = (train["item_name"].fillna('') + ". " + train["prod_desc"].fillna('')).tolist()
test_text = (test["item_name"].fillna('') + ". " + test["prod_desc"].fillna('')).tolist()

# Both splits are encoded with the same encoder instance and settings.
X_train_emb = encoder.encode(train_text, show_progress_bar=True, convert_to_numpy=True, batch_size=64)
X_test_emb = encoder.encode(test_text, show_progress_bar=True, convert_to_numpy=True, batch_size=64)


🔤 Generating MiniLM text embeddings...


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

In [9]:

# =========================================================
# 4️⃣ Numeric Features + Scaling
# =========================================================
# Hand-crafted numeric columns appended to the text embeddings.
num_features = ["pack_count", "num_bullets", "desc_len", "title_len", "has_pack_info"]

# Scaling statistics are fit on all training rows and reused unchanged for test.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(train[num_features])
X_test_num = scaler.transform(test[num_features])

# Final design matrices: embedding columns first, scaled numeric columns last.
X_train_final = np.hstack([X_train_emb, X_train_num])
X_test_final = np.hstack([X_test_emb, X_test_num])

# =========================================================
# 5️⃣ Ensemble Model (Optimized for Speed)
# =========================================================
# Base learner 1: LightGBM on GPU.
lgb = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    subsample_freq=1,  # row subsampling is only active when the bagging frequency is > 0
    colsample_bytree=0.9,
    device="gpu",  # use GPU on Kaggle
    random_state=42
)

# Base learner 2: CatBoost on GPU, silent training.
cat = CatBoostRegressor(
    iterations=400,
    learning_rate=0.05,
    depth=8,
    task_type="GPU",  # use GPU on Kaggle
    verbose=0,
    random_state=42
)

# Stack both base learners; a small LightGBM model combines their predictions.
ensemble = StackingRegressor(
    estimators=[("lgb", lgb), ("cat", cat)],
    final_estimator=LGBMRegressor(n_estimators=300, learning_rate=0.05, random_state=42, device="gpu"),
    n_jobs=1  # ⚠ important (presumably keeps the GPU learners from being fitted in parallel — confirm)
)
# =========================================================
# 6️⃣ 3-Fold Cross-Validation
# =========================================================
# KFold is provided by sklearn.model_selection (see the import cell at the top).
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_final)):
    print(f"\n🔹 Fold {fold+1}")
    X_tr, X_val = X_train_final[train_idx], X_train_final[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Refit the whole stacking ensemble on this fold's training rows.
    ensemble.fit(X_tr, y_tr)
    y_val_pred = ensemble.predict(X_val)

    # Back-transform from log1p space and apply the same 0.01 floor used for
    # the submission, so the CV score measures what is actually submitted.
    y_val_pred_exp = np.clip(np.expm1(y_val_pred), 0.01, None)
    y_val_exp = np.expm1(y_val)
    fold_smape = smape(y_val_exp, y_val_pred_exp)
    print(f"SMAPE = {fold_smape:.4f}%")
    cv_scores.append(fold_smape)

print("\n📊 Mean CV SMAPE:", np.mean(cv_scores))

# =========================================================
# 7️⃣ Train on Full Data & Predict Test
# =========================================================
print("\n🚀 Training final model on full data...")
ensemble.fit(X_train_final, y)

# Predict in log1p space, map back to prices, then floor at 0.01 so no
# non-positive price is submitted.
y_test_pred = np.clip(np.expm1(ensemble.predict(X_test_final)), 0.01, None)

# The submission file expects the product id under the name "sample_id".
submission = pd.DataFrame(
    {"sample_id": test["prod_id"].values, "price": y_test_pred}
)
submission.to_csv("/kaggle/working/submission_fast.csv", index=False)

print("💾 Submission saved → /kaggle/working/submission_fast.csv")


🔹 Fold 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98510
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 389
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 387 dense feature groups (18.50 MB) transferred to GPU in 0.019597 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2.741552
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98506
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 389
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 38

In [None]:
# -------------------------
# Paths
# -------------------------
# Input locations (read-only Kaggle datasets).
DATA_PATH = "/kaggle/input/cleaned-train-csv/cleaned_train.csv"
TEST_PATH = "/kaggle/input/cleaned-test-csv-final/data_cleaned_test.csv"  # your test CSV
MODEL_PATH = "/kaggle/input/minilm-trans/miniLM"  # local MiniLM model
# Output artifacts: the two .pkl names are relative to the working directory;
# the submission goes to /kaggle/working.
MODEL_OUTPUT_PATH = "value_predictor_lgbm.pkl"
ENCODER_PATH = "text_encoder.pkl"
SUBMISSION_PATH = "/kaggle/working/submission.csv"

In [None]:
# =============================================
# Helper Functions
# =============================================

# Load CSV
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and print its dimensions.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.
    """
    print(f"📂 Loading dataset: {path}")
    frame = pd.read_csv(path)
    n_rows, n_cols = frame.shape
    print(f"✅ Loaded {n_rows} rows and {n_cols} columns")
    return frame

def prepare_text_features(df):
    """Combine item_name, prod_desc, and bullet_points into one text column.

    Args:
        df: DataFrame containing the columns ``item_name``, ``prod_desc`` and
            ``bullet_points``. It is not modified.

    Returns:
        A copy of ``df`` in which the three text columns have missing values
        replaced by "" (and are cast to ``str``), plus a ``combined_text``
        column of the form "<item_name>. <prod_desc>. <bullet_points>" with
        surrounding whitespace stripped.

    Raises:
        KeyError: if any of the three text columns is missing.
    """
    text_cols = ["item_name", "prod_desc", "bullet_points"]
    missing = [col for col in text_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Missing text column(s): {missing}")

    # Work on a copy so re-running the cell (or reusing the raw frame
    # elsewhere) never sees a half-transformed input.
    out = df.copy()
    for col in text_cols:
        # astype(str) keeps the concatenation below from failing when a text
        # column was parsed with non-string values (e.g. purely numeric names).
        out[col] = out[col].fillna("").astype(str)

    out["combined_text"] = (
        out["item_name"] + ". " +
        out["prod_desc"] + ". " +
        out["bullet_points"]
    ).str.strip()

    return out

# Encode text using MiniLM embeddings (batch for speed)
def encode_text(df, model_path=MODEL_PATH, batch_size=64, encoder=None):
    """Embed ``df["combined_text"]`` with a SentenceTransformer.

    Args:
        df: DataFrame with a ``combined_text`` column.
        model_path: directory of the locally stored model; only used when
            ``encoder`` is None.
        batch_size: number of texts sent per ``encode`` call and used as the
            encoder's own batch size.
        encoder: optional, already-loaded SentenceTransformer. Passing it
            avoids reloading the model for every dataset that is encoded.

    Returns:
        ``(embeddings, encoder)``: a 2-D array with one row per row of ``df``
        and the encoder that produced it.
    """
    if encoder is None:
        print(f"🔤 Loading SentenceTransformer model from: {model_path}")
        encoder = SentenceTransformer(model_path, local_files_only=True, trust_remote_code=False)

    texts = df["combined_text"].tolist()
    if not texts:
        # np.vstack([]) raises, so return an explicit empty matrix instead.
        dim = encoder.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32), encoder

    chunks = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding text"):
        chunks.append(
            encoder.encode(
                texts[start:start + batch_size],
                # Without this the encoder falls back to its own default batch
                # size and the `batch_size` argument only controls chunking.
                batch_size=batch_size,
                show_progress_bar=False,
                convert_to_numpy=True,
            )
        )
    return np.vstack(chunks), encoder

# Train LightGBM regression model
def train_model(X, y):
    """Fit a StandardScaler -> LGBMRegressor pipeline on ``(X, y)``.

    Args:
        X: feature matrix accepted by scikit-learn estimators.
        y: regression target aligned with the rows of ``X``.

    Returns:
        The fitted ``Pipeline`` (steps ``"scaler"`` and ``"regressor"``).
    """
    print("🚀 Training LightGBM model...")
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("regressor", LGBMRegressor(
            n_estimators=800,
            learning_rate=0.05,
            max_depth=10,
            subsample=0.9,
            # LightGBM only applies `subsample` when the bagging frequency is
            # non-zero; without this the 0.9 row sampling is silently ignored.
            subsample_freq=1,
            colsample_bytree=0.9,
            reg_lambda=0.1,
            random_state=92,
            n_jobs=-1
        ))
    ])
    model.fit(X, y)
    return model

# Evaluate regression model
def evaluate(model, X_test, y_test):
    """Print MAE, RMSE and R² of ``model`` on ``(X_test, y_test)``.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: features to predict on.
        y_test: true targets aligned with ``X_test``.
    """
    predictions = model.predict(X_test)
    # Labels are pre-padded so the "=" signs line up in the printed report.
    report = (
        ("MAE ", mean_absolute_error(y_test, predictions)),
        ("RMSE", np.sqrt(mean_squared_error(y_test, predictions))),
        ("R²  ", r2_score(y_test, predictions)),
    )
    print("\n📊 Evaluation Results:")
    for label, value in report:
        print(f"{label} = {value:.4f}")

# Save model and encoder
def save_model(model, encoder):
    """Serialize ``model`` and ``encoder`` with joblib and report the paths.

    The destinations are the module-level ``MODEL_OUTPUT_PATH`` and
    ``ENCODER_PATH`` constants.
    """
    artifacts = (
        ("model", model, MODEL_OUTPUT_PATH),
        ("encoder", encoder, ENCODER_PATH),
    )
    # Write both files first, then report, matching the original output order.
    for _, obj, path in artifacts:
        joblib.dump(obj, path)
    for label, _, path in artifacts:
        print(f"💾 Saved {label} → {path}")

# SMAPE metric for submission evaluation
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (range 0-200).

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.

    Returns:
        ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``.
        Pairs where both values are exactly zero contribute 0 rather than NaN.
        Empty input returns NaN.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    abs_err = np.abs(y_pred - y_true)
    half_scale = (np.abs(y_true) + np.abs(y_pred)) / 2
    if abs_err.size == 0:
        return float("nan")

    # Only divide where the denominator is non-zero; 0/0 rows keep the 0 fill.
    ratio = np.divide(abs_err, half_scale, out=np.zeros_like(abs_err), where=half_scale != 0)
    return 100 * np.mean(ratio)


In [None]:
# Load data (path comes from the DATA_PATH constant so it is defined in one place)
df = load_data(DATA_PATH)

# Prepare text features
df = prepare_text_features(df)

# Target
# NOTE(review): this is the raw price, whereas the earlier ensemble cells used
# log1p(price) — `y` is rebound here; confirm the change of scale is intended.
y = df["price"].values  # predicting 'price'


In [None]:
# Encode combined text into embeddings
# encode_text returns the embedding matrix together with the loaded encoder;
# the encoder is kept so it can be passed to save_model later.
X, encoder = encode_text(df, model_path=MODEL_PATH, batch_size=64)
print(f"✅ Text embeddings shape: {X.shape}")


In [None]:
# Split data for evaluation
# 80/20 hold-out with a fixed seed; this rebinds X_train / X_val / y_val from earlier cells.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}, Validation samples: {X_val.shape[0]}")


In [None]:
# Fit the scaler + LightGBM pipeline on the training split only.
model = train_model(X_train, y_train)


In [None]:
# Print MAE, RMSE and R² on the held-out validation split.
evaluate(model, X_val, y_val)


In [None]:
def smape(y_true, y_pred):
    """Compute Symmetric Mean Absolute Percentage Error (SMAPE).

    Args:
        y_true: scalar or array-like of ground-truth values.
        y_pred: scalar or array-like of predictions, broadcastable against
            ``y_true``.

    Returns:
        ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``, where
        pairs whose denominator is zero contribute 0. Empty input returns NaN.
    """
    # atleast_1d lets plain scalars through the same vectorised path.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    abs_err = np.abs(y_pred - y_true)
    if abs_err.size == 0:
        return float("nan")

    # Mask the zero denominators *before* dividing so no divide-by-zero
    # RuntimeWarning is emitted; masked positions keep the 0 fill value.
    diff = np.divide(abs_err, denominator, out=np.zeros_like(abs_err), where=denominator != 0)
    return np.mean(diff) * 100


In [None]:
# Predict on validation set
# The target of this pipeline is the raw price, so predictions need no back-transform.
y_val_pred = model.predict(X_val)

# Compute SMAPE
# Uses whichever `smape` definition was executed most recently in the kernel.
val_smape = smape(y_val, y_val_pred)
print(f"📊 Validation SMAPE: {val_smape:.4f}%")


In [None]:
# Persist the fitted pipeline and the text encoder with joblib.
save_model(model, encoder)


In [None]:
def prepare_test_text(df):
    """Use catalog_content column from test set as combined_text.

    Args:
        df: DataFrame with a ``catalog_content`` column. It is not modified.

    Returns:
        A copy of ``df`` where ``catalog_content`` has missing values replaced
        by "" (and is cast to ``str``), plus a ``combined_text`` column holding
        the same text with surrounding whitespace stripped.

    Raises:
        KeyError: if ``catalog_content`` is missing.
    """
    # Copy first so the caller's frame stays untouched and the cell is
    # safe to re-run.
    out = df.copy()
    # astype(str) guards the .str accessor against non-string cell values.
    out["catalog_content"] = out["catalog_content"].fillna("").astype(str)
    out["combined_text"] = out["catalog_content"].str.strip()
    return out

In [None]:
# Load test data
# NOTE(review): this reads "/kaggle/input/cleaned-test-csv/test.csv", which is not
# the TEST_PATH defined in the paths cell — confirm which file is intended.
df_test = pd.read_csv("/kaggle/input/cleaned-test-csv/test.csv")

# Prepare text
# NOTE(review): training text is built from item_name / prod_desc / bullet_points,
# while test text comes from catalog_content — verify both carry comparable content.
df_test = prepare_test_text(df_test)

# df_test["combined_text"] is now ready for encoding

# Encode test set text
# encode_text loads the SentenceTransformer again here; the returned encoder is discarded.
X_test, _ = encode_text(df_test, model_path=MODEL_PATH, batch_size=64)


In [None]:
# Predict prices and floor them at 0.01 so every submitted price is positive.
y_pred = np.clip(model.predict(X_test), 0.01, None)

# Assemble the two-column submission table: id + predicted price.
submission = pd.DataFrame(
    {"sample_id": df_test["sample_id"], "price": y_pred}
)

# Write it out and show the first rows as the cell result.
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"✅ Submission saved → {SUBMISSION_PATH}")
submission.head()
