In [13]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
import gc
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

print("--- FINAL ULTRA-FAST SCRIPT (CORRECTED) ---")

# --- STAGE 1: LOAD FEATURES ---
PROCESSED_DIR = "../data/processed"
full_df_text_stats = pd.read_parquet(os.path.join(PROCESSED_DIR, "features_v1.parquet"))
text_embeddings_df = pd.read_parquet(os.path.join(PROCESSED_DIR, "text_embeddings_v1.parquet"))
full_df = pd.concat([full_df_text_stats.reset_index(drop=True), text_embeddings_df.reset_index(drop=True)], axis=1)

# --- STAGE 2: DATA PREPARATION ---
train_df = full_df[full_df['is_train'] == 1].copy()
test_df = full_df[full_df['is_train'] == 0].copy()

y = train_df['log_price']
y_true_prices = train_df['price']

features_to_drop = ['sample_id', 'price', 'log_price', 'is_train']
features = [col for col in full_df.columns if col not in features_to_drop]

# Convert categorical features BEFORE creating X and X_test
categorical_features = ['extracted_unit', 'brand']
for col in categorical_features:
    full_df[col] = full_df[col].astype('category')

X = full_df[full_df['is_train'] == 1][features]
X_test = full_df[full_df['is_train'] == 0][features]

del full_df, train_df, test_df, full_df_text_stats, text_embeddings_df
gc.collect()
print("Data preparation complete.")

# --- STAGE 3: TRAINING ---
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof_predictions_log = np.zeros(len(X))
test_predictions_log = np.zeros(len(X_test))

print("--- Starting ULTRA-FAST training ---")
for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    lgb_params = { 'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2500,
                   'learning_rate': 0.04, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 1,
                   'lambda_l1': 0.1, 'lambda_l2': 0.1, 'num_leaves': 31, 'verbose': -1, 'n_jobs': -1, 'seed': 42 + fold }
    
    model = lgb.LGBMRegressor(**lgb_params)
    def lgbm_smape(y_true_log, y_pred_log):
        y_true = np.expm1(y_true_log); y_pred = np.expm1(y_pred_log)
        denom = np.abs(y_true) + np.abs(y_pred)
        return 'SMAPE', 100.0 * np.mean(2 * np.abs(y_true - y_pred) / np.clip(denom, 1e-8, None)), False
    
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=lgbm_smape, 
              categorical_feature='auto', callbacks=[lgb.early_stopping(100, verbose=False)])
    
    oof_predictions_log[val_index] = model.predict(X_val)
    test_predictions_log += model.predict(X_test) / N_SPLITS

# --- STAGE 4: POST-PROCESSING AND SUBMISSION ---
print("\n--- Training complete. Starting post-processing. ---")
# ... (The rest of the script is the same post-processing and submission code)
y_true = y_true_prices.values
train_brands = X['brand']; test_brands = X_test['brand']

def smape(y, y_pred):
    denom = np.abs(y) + np.abs(y_pred)
    return 100.0 * np.mean(2 * np.abs(y - y_pred) / np.clip(denom, 1e-8, None))

oof_preds = np.expm1(oof_predictions_log); test_preds = np.expm1(test_predictions_log)
print(f"Initial OOF SMAPE: {smape(y_true, oof_preds):.4f}")

low, high = np.percentile(y_true, [0.5, 99.5])
oof_preds = np.clip(oof_preds, low, high); test_preds = np.clip(test_preds, low, high)
print(f"After Winsorizing: {smape(y_true, oof_preds):.4f}")

oof_df = pd.DataFrame({'brand': train_brands, 'y_true': y_true, 'pred': oof_preds})
group_shift = oof_df.groupby('brand').apply(lambda g: g['y_true'].mean() - g['pred'].mean()).to_dict()
oof_preds += train_brands.map(group_shift).fillna(0.0).values
test_preds += test_brands.map(group_shift).fillna(0.0).values
print(f"After Group Correction: {smape(y_true, oof_preds):.4f}")

meta_model = Ridge(alpha=1.0).fit(oof_preds.reshape(-1, 1), y_true)
oof_preds = meta_model.predict(oof_preds.reshape(-1, 1)); test_preds = meta_model.predict(test_preds.reshape(-1, 1))
print(f"After Final Calibration: {smape(y_true, oof_preds):.4f}")

final_test_preds = np.clip(test_preds, 0.01, None)
print(f"\nFINAL OOF SMAPE SCORE: {smape(y_true, oof_preds):.4f}")

original_test_df = pd.read_csv("../data/raw/dataset/test.csv")
submission_df = pd.DataFrame({'sample_id': original_test_df['sample_id'], 'price': final_test_preds})
submission_df.to_csv("../test_out.csv", index=False)
print(f"\n✅✅✅ FINAL SUBMISSION FILE CREATED: 'test_out.csv' ✅✅✅")

--- FINAL ULTRA-FAST SCRIPT (CORRECTED) ---
Data preparation complete.
--- Starting ULTRA-FAST training ---
--- Fold 1/5 ---
--- Fold 2/5 ---
--- Fold 3/5 ---
--- Fold 4/5 ---
--- Fold 5/5 ---

--- Training complete. Starting post-processing. ---
Initial OOF SMAPE: 54.1383
After Winsorizing: 54.1379
After Group Correction: 22.9088
After Final Calibration: 24.3318

FINAL OOF SMAPE SCORE: 24.3318

✅✅✅ FINAL SUBMISSION FILE CREATED: 'test_out.csv' ✅✅✅
