# In [None]:
# ===============================
# FULL PIPELINE: LGBM + optional CatBoost + SMAPE FEVAL
# ===============================
import os
import re
import time
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import lightgbm as lgb

# ------------- USER PARAMS (edit if needed) -------------
# Locations of the merged training and test feature tables.
TRAIN_PATH = '/mnt/sagemaker-nvme/feature_merge/train_CLEAN_FINAL.csv'
TEST_PATH = '/mnt/sagemaker-nvme/feature_merge/test_CLEAN_FINAL.csv'

# Ground-truth column in the training table; when it is absent the pipeline
# falls back to a 'log_price' column if one exists.
TARGET_COL = 'price'

# Identifier column copied into the submission (change if test uses another name).
ID_COL = 'id'

# Number of CV folds (3 for speed; 5 if you have time).
N_SPLITS = 3

# Keep the top-K features by importance (reduce if memory issues).
TOP_K = 1200

# One full CV run per seed; the runs are averaged (add more seeds if desired).
SEEDS = [42, 2024]

# Try CatBoost blending if the package is available.
USE_CATBOOST = True

# Seed for the row sample used by the feature-importance model.
RANDOM_STATE = 42
# -------------------------------------------------------

# ---------- Helpers ----------
def clean_column_names(df):
    """Return a copy of ``df`` with sanitized, guaranteed-unique column names.

    Every run of characters outside ``[A-Za-z0-9_]`` is removed from each
    column label (labels are stringified first, so non-string labels are
    accepted). A label that becomes empty is replaced by ``'col'``. When a
    sanitized name is already taken, ``_1``, ``_2``, ... is appended, and the
    counter keeps advancing until the candidate is unused, so a generated
    ``name_1`` can never collide with a column that is literally ``name_1``.

    Names are assigned by position rather than through a rename mapping, so
    frames that already contain duplicate labels are handled as well.

    Args:
        df: pandas DataFrame whose columns should be cleaned.

    Returns:
        A new DataFrame with the same data and the cleaned column names; the
        input frame is left untouched.
    """
    taken = set()
    last_suffix = {}
    cleaned = []
    for col in df.columns:
        base = re.sub(r'[^A-Za-z0-9_]+', '', str(col)) or 'col'
        name = base
        # Advance the per-base counter until the candidate is free.
        while name in taken:
            last_suffix[base] = last_suffix.get(base, 0) + 1
            name = f"{base}_{last_suffix[base]}"
        taken.add(name)
        cleaned.append(name)

    renamed = df.copy()
    renamed.columns = cleaned
    return renamed

def smape_true(y_true, y_pred):
    """Symmetric MAPE, in percent, of ``y_pred`` against ``y_true``.

    Both inputs are expected on the original (non-log) scale. A small
    constant is added to the denominator so rows where both values are zero
    contribute 0 instead of dividing by zero.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) + 1e-8
    per_row = 200.0 * abs_error / scale
    return np.mean(per_row)

def seed_print(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")

# ---------- Load data ----------
seed_print("Loading data...")
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

seed_print(f"Train: {train.shape}, Test: {test.shape}")

# ---------- Target detection ----------
# `y` must end up on the ORIGINAL price scale: later stages apply np.log1p to
# it before training and np.expm1 to the predictions.
if TARGET_COL in train.columns:
    y = train[TARGET_COL].astype('float32')
    if TARGET_COL == 'log_price':
        # The configured target is already the log1p column; undo the log so
        # it is not log-transformed a second time downstream.
        y = np.expm1(y)
elif 'log_price' in train.columns:
    # recover the price when only the log1p column exists
    y = np.expm1(train['log_price'].astype('float32'))
else:
    raise ValueError("No target column found. Change TARGET_COL or ensure 'price'/'log_price' present.")

# ---------- Prepare features ----------
# Drop every target-like column, including a custom-named TARGET_COL, so the
# label can never leak into the feature matrix.
target_like = {'price', 'log_price', TARGET_COL}
X = train.drop(columns=[c for c in train.columns if c in target_like]).copy()
X_test = test.copy()

# ---------- Clean names and align ----------
seed_print("Cleaning column names and aligning train/test...")
X = clean_column_names(X)
X_test = clean_column_names(X_test)

# Keep only the features present in both frames, in train column order.
# NOTE(review): if ID_COL exists in both frames it stays in as a feature --
# confirm that is intended.
common = [c for c in X.columns if c in X_test.columns]
X = X[common].copy()
X_test = X_test[common].copy()
seed_print(f"Features count after align: {len(common)}")

# ---------- Reduce memory (float32 etc) ----------
seed_print("Converting dtypes to save memory...")
# Each frame is downcast according to ITS OWN dtypes: a column can be int64 in
# train but float64 (because of NaN) in test, and casting that test column to
# an integer type would raise.
_int32_info = np.iinfo(np.int32)
for frame in (X, X_test):
    for c in frame.select_dtypes(include=['float64']).columns:
        frame[c] = frame[c].astype('float32')
    for c in frame.select_dtypes(include=['int64']).columns:
        col = frame[c]
        # Only narrow when every value fits; astype('int32') would otherwise
        # wrap out-of-range values silently.
        if col.empty or (_int32_info.min <= col.min() and col.max() <= _int32_info.max):
            frame[c] = col.astype('int32')

# ==========================
# Make every feature numeric, with identical encoding in train and test
# ==========================
# LightGBM is given numeric-only data (categorical_feature is never passed),
# so non-numeric columns are either parsed as numbers or replaced by integer
# codes. The codes come from ONE category list built over train and test
# together, so a given string maps to the same integer in both frames.
seed_print("Ensuring no categorical dtypes and disabling categorical_feature for LGBM...")


def _text_or_nan(series):
    """Return ``series`` as objects: non-missing values stringified, missing values kept."""
    as_obj = series.astype(object)
    return as_obj.where(as_obj.isna(), as_obj.astype(str))


for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]) and pd.api.types.is_numeric_dtype(X_test[col]):
        continue  # already numeric (bool included) in both frames

    try:
        # Parse both sides before assigning anything, so a failure on one
        # frame cannot leave the other one half-converted.
        train_num = pd.to_numeric(X[col], errors='raise')
        test_num = pd.to_numeric(X_test[col], errors='raise')
    except (ValueError, TypeError):
        train_txt = _text_or_nan(X[col])
        test_txt = _text_or_nan(X_test[col])
        levels = sorted(set(train_txt.dropna()) | set(test_txt.dropna()))
        # Missing values get code -1 in both frames.
        X[col] = pd.Categorical(train_txt, categories=levels).codes.astype('int32')
        X_test[col] = pd.Categorical(test_txt, categories=levels).codes.astype('int32')
        seed_print(f"Encoded non-numeric column with shared integer codes: {col}")
    else:
        X[col] = train_num
        X_test[col] = test_num
        seed_print(f"Coerced object->numeric for {col}")

# Nothing is ever passed to the models as a categorical feature; the list is
# kept (empty) because later code reads it.
cat_cols = []   # <- key: ensure empty

# Double-check dtypes summary (for debugging / quick assertion)
seed_print("Final dtypes summary (train):")
print(X.dtypes.value_counts())
seed_print("If you see 'category' still, inspect columns above and convert them explicitly.")



# In [None]:

# ---------- Quick feature importance sampling to get top-K features ----------
seed_print("Running small sample LGB for feature importance (to pick top-K)...")
SAMPLE_N = min(12000, len(X))
sample_idx = np.random.RandomState(RANDOM_STATE).choice(len(X), SAMPLE_N, replace=False)
X_s = X.iloc[sample_idx].copy()
y_s = np.log1p(y.iloc[sample_idx])  # importance training on log-target
dtrain_s = lgb.Dataset(X_s, label=y_s, categorical_feature=None)

# The number of boosting rounds is given in exactly one place. Previously the
# params carried 'n_estimators': 500 next to num_boost_round=300; lgb.train()
# lets the params alias win (with a warning), so 500 was the effective value
# and is kept here to leave the feature ranking unchanged.
IMPORTANCE_ROUNDS = 500
params_small = {
    'objective': 'regression_l1', 'metric': 'mae',
    'num_leaves': 31, 'learning_rate': 0.05,
    'verbose': -1, 'max_bin': 127, 'n_jobs': 4, 'min_data_in_leaf': 20
}
bst_small = lgb.train(params_small, dtrain_s, num_boost_round=IMPORTANCE_ROUNDS)
importances = pd.DataFrame({
    'feature': X_s.columns,
    'imp_gain': bst_small.feature_importance(importance_type='gain'),
    'imp_split': bst_small.feature_importance(importance_type='split')
}).sort_values('imp_gain', ascending=False)

# Cap TOP_K at the number of available features, then keep the best by gain.
TOP_K = min(TOP_K, len(importances))
top_feats = importances['feature'].values[:TOP_K].tolist()
seed_print(f"Keeping top {TOP_K} features (by gain).")
del X_s, y_s, dtrain_s, bst_small, importances; gc.collect()

X = X[top_feats].copy()
X_test = X_test[top_feats].copy()

# ---------- Prepare log1p target for training ----------
y_log = np.log1p(y.values)

# ---------- Define SMAPE FEVAL for LightGBM (IMPORTANT) ----------
def lgb_smape_eval(preds_log, train_data):
    """Custom LightGBM eval function reporting SMAPE on the original scale.

    Predictions and the dataset labels are both in log1p space, so each is
    mapped back with ``expm1`` before the SMAPE (in percent) is computed.

    Returns:
        ``('SMAPE', value, False)``; the trailing ``False`` tells LightGBM
        that lower values are better.
    """
    predicted = np.expm1(preds_log)
    actual = np.expm1(train_data.get_label())

    abs_diff = np.abs(predicted - actual)
    # epsilon keeps the ratio finite when both values are zero
    denom = (np.abs(predicted) + np.abs(actual)) + 1e-8
    score = np.mean(200.0 * abs_diff / denom)
    return 'SMAPE', score, False

# ---------- LightGBM params (CPU safe) ----------
# Base LightGBM configuration shared by every seed; the per-seed loop copies
# it and overrides 'seed'.
lgb_params = dict(
    # L1 (MAE) objective and built-in metric
    objective='regression_l1',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.02,
    num_leaves=64,
    # column / row subsampling
    feature_fraction=0.6,
    bagging_fraction=0.7,
    bagging_freq=5,
    min_data_in_leaf=30,
    max_bin=127,
    # regularisation
    lambda_l1=0.5,
    lambda_l2=1.0,
    n_jobs=4,
    verbose=-1,
    seed=42,
)

# ---------- Function to run CV for one seed ----------
def run_lgb_cv(X, X_test, y_log, params, n_splits=3, seed=42):
    """Train LightGBM with K-fold CV and return OOF and test predictions.

    Args:
        X: training features (DataFrame, indexed positionally via ``iloc``).
        X_test: test features with the same columns as ``X``.
        y_log: training target in log1p space, one value per row of ``X``.
        params: LightGBM parameter dict passed to ``lgb.train``.
        n_splits: number of shuffled K-fold splits.
        seed: random state of the fold shuffling (also shown in the logs).

    Returns:
        ``(oof_log, test_log, fold_metrics)``: out-of-fold predictions and
        fold-averaged test predictions, both in log1p space, plus the list of
        per-fold SMAPE values computed on the original scale.
    """
    # Fold indices are positional; coercing to an ndarray keeps
    # ``y_log[idx]`` positional even if a pandas Series is passed in.
    y_log = np.asarray(y_log)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_log = np.zeros(len(X))
    test_log = np.zeros(len(X_test))
    fold_metrics = []
    start = time.time()

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        print(f"--- Seed {seed} | Fold {fold+1}/{n_splits} ---")
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y_log[tr_idx], y_log[val_idx]

        lgb_tr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=None)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_tr, categorical_feature=None)

        # Up to 3000 rounds, stopped early after 150 rounds without
        # improvement on the validation fold; progress logged every 500.
        bst = lgb.train(
            params,
            lgb_tr,
            num_boost_round=3000,
            valid_sets=[lgb_val],
            feval=lgb_smape_eval,
            callbacks=[
                lgb.early_stopping(150),
                lgb.log_evaluation(500)
            ]
        )

        oof_log[val_idx] = bst.predict(X_val, num_iteration=bst.best_iteration)
        # each fold contributes an equal share of the test prediction
        test_log += bst.predict(X_test, num_iteration=bst.best_iteration) / n_splits

        # evaluate SMAPE on original scale for this fold
        y_val_orig = np.expm1(y_val)
        oof_val_orig = np.expm1(oof_log[val_idx])
        sm = smape_true(y_val_orig, oof_val_orig)
        fold_metrics.append(sm)
        print(f"Fold {fold+1} SMAPE (orig scale): {sm:.6f}")

        # cleanup
        del X_tr, X_val, y_tr, y_val, lgb_tr, lgb_val, bst
        gc.collect()

    print(f"[seed {seed}] CV SMAPE mean/std: {np.mean(fold_metrics):.6f} ± {np.std(fold_metrics):.6f}")
    print("Elapsed (mins):", (time.time()-start)/60.0)
    return oof_log, test_log, fold_metrics



# In [None]:

# ---------- Run LightGBM for multiple seeds (seed ensemble) ----------
seed_print("Starting LightGBM training for seeds: " + str(SEEDS))
# One column per seed: out-of-fold and test predictions, both in log1p space.
oof_stack = np.zeros((len(X), len(SEEDS)))
test_stack = np.zeros((len(X_test), len(SEEDS)))
seed_metrics = []

for i, sd in enumerate(SEEDS):
    # same base config for every run, only the seed differs
    params = lgb_params.copy()
    params['seed'] = sd
    oof_log, test_log, fmetrics = run_lgb_cv(
        X, X_test, y_log, params, n_splits=N_SPLITS, seed=sd
    )
    oof_stack[:, i], test_stack[:, i] = oof_log, test_log
    seed_metrics.append((np.mean(fmetrics), np.std(fmetrics)))

# Back-transform each seed's predictions first, then average across seeds.
seed_print("Converting log preds to original scale and averaging seeds...")
oof_ens = np.expm1(oof_stack).mean(axis=1)
test_ens = np.expm1(test_stack).mean(axis=1)

overall_cv_smape = smape_true(np.expm1(y_log), oof_ens)
seed_print(f"Ensemble CV SMAPE (all seeds) = {overall_cv_smape:.6f}")

# ---------- Optional: CatBoost quick blend (if available) ----------
# Trains one CatBoost model per fold on the same log1p target and blends its
# back-transformed predictions with the LightGBM seed ensemble. Whatever
# happens, `final_test_pred` and `final_oof` are defined afterwards: either
# the blend, or the plain LightGBM ensemble as a fallback.
if USE_CATBOOST:
    try:
        # Imported lazily so the pipeline still runs when catboost is not installed.
        from catboost import CatBoostRegressor
        seed_print("CatBoost found. Running quick CatBoost (single seed) on TOP_K features.")
        # Positions of categorical columns for CatBoost; only columns listed in
        # `cat_cols` are selected.
        cat_idx = [i for i, c in enumerate(X.columns) if c in cat_cols]
        # Out-of-fold and fold-averaged test predictions, in log1p space.
        oof_cb = np.zeros(len(X))
        test_cb = np.zeros(len(X_test))
        # Folds are shuffled with the first seed of SEEDS.
        kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEEDS[0])
        cb_scores = []

        for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
            print(f"CatBoost Fold {fold+1}/{N_SPLITS}")
            X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr, y_val = y_log[tr_idx], y_log[val_idx]

            # MAE loss on the log1p target; early stopping is evaluated on the
            # validation fold passed as eval_set below.
            model_cb = CatBoostRegressor(
                iterations=2000, learning_rate=0.03, depth=6,
                loss_function='MAE', eval_metric='MAE',
                early_stopping_rounds=150, verbose=500, random_seed=SEEDS[0]
            )
            model_cb.fit(X_tr, y_tr, eval_set=(X_val, y_val), cat_features=cat_idx)
            oof_cb[val_idx] = model_cb.predict(X_val)
            # each fold contributes an equal share of the test prediction
            test_cb += model_cb.predict(X_test) / N_SPLITS

            # fold SMAPE on original scale
            cb_fold_smape = smape_true(np.expm1(y_val), np.expm1(oof_cb[val_idx]))
            cb_scores.append(cb_fold_smape)
            print("CatBoost Fold SMAPE:", cb_fold_smape)

        print("CatBoost CV SMAPE:", np.mean(cb_scores), np.std(cb_scores))
        # Blend LightGBM ensemble and CatBoost (adjust weights as needed).
        # The weights are fixed constants; both inputs are on the original
        # price scale (CatBoost output is back-transformed with expm1 here).
        blend_weight_lgb = 0.65
        blend_weight_cb  = 0.35
        final_test_pred = blend_weight_lgb * test_ens + blend_weight_cb * np.expm1(test_cb)
        final_oof = blend_weight_lgb * oof_ens + blend_weight_cb * np.expm1(oof_cb)
        seed_print(f"Blended CV SMAPE = {smape_true(np.expm1(y_log), final_oof):.6f}")

    except Exception as e:
        # Deliberate best-effort: a missing package or any failure during
        # CatBoost training falls back to the LightGBM-only predictions.
        seed_print("CatBoost not available or failed. Skipping. Error:")
        print(e)
        final_test_pred = test_ens
        final_oof = oof_ens
else:
    # CatBoost disabled by configuration: use the LightGBM ensemble as is.
    final_test_pred = test_ens
    final_oof = oof_ens

# ---------- Clip predictions to safe range ----------
# Prices cannot be negative; the upper bound is three times the 99.99th
# percentile of the training prices.
min_price = 0.0
max_price = float(np.percentile(np.expm1(y_log), 99.99) * 3.0)
final_test_pred = np.clip(final_test_pred, min_price, max_price)
# Apply the same post-processing to the OOF predictions so the CV score
# reported below reflects what is actually submitted.
final_oof = np.clip(final_oof, min_price, max_price)

# ---------- Final evaluation on CV OOF ----------
cv_smape_final = smape_true(np.expm1(y_log), final_oof)
seed_print(f"Final OOF SMAPE (orig scale): {cv_smape_final:.6f}")

# ---------- Prepare submission ----------
seed_print("Preparing submission CSV...")
if ID_COL in test.columns:
    sub = pd.DataFrame({ID_COL: test[ID_COL], 'price': final_test_pred})
else:
    # No id column in the test file: fall back to a 0..n-1 row number.
    sub = pd.DataFrame({'id': np.arange(len(test)), 'price': final_test_pred})

out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
seed_print(f"Saved submission to {out_path}")

# ---------- Done ----------
seed_print("Pipeline complete. If CV SMAPE is high, try: increasing TOP_K, adding seeds, training CatBoost, or stacking.")