In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time

# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
import optuna

# PyTorch Imports
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

print("Libraries imported successfully.")

# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)

# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 25 # Reduced for speed, increase to 30-50 for best results
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv', index_col="id", parse_dates=["sale_date"])
    df_test = pd.read_csv(DATA_PATH + 'test.csv', index_col="id", parse_dates=["sale_date"])
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()

# --- Prepare Target Variable ---
y_log = np.log1p(df_train['sale_price'])
y_true = df_train['sale_price'].copy()
df_train['sale_price_log'] = y_log
df_train.drop('sale_price', axis=1, inplace=True)

print("Setup complete.")

Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# =============================================================================
# BLOCK 2: GOD-TIER FEATURE ENGINEERING
# =============================================================================
print("--- Starting Block 2: God-Tier Feature Engineering ---")

# Combine for consistent processing
df_train['is_train'] = 1
df_test['is_train'] = 0
all_data = pd.concat([df_train, df_test], axis=0)

# Foundational Features
all_data['sale_year'] = all_data['sale_date'].dt.year
all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']
all_data['total_bathrooms'] = all_data['bath_full'] + 0.5 * all_data['bath_half'] + 0.75 * all_data['bath_3qtr']
all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)

# Location Clusters
kmeans = KMeans(n_clusters=30, random_state=RANDOM_STATE, n_init='auto')
all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])

# Peer-Comparison & Target-Encoded Features
print("Creating peer-comparison features...")
train_copy_for_aggs = all_data[all_data['is_train'] == 1].copy()
group_cols_to_agg = ['location_cluster', 'city', 'submarket']

for group_col in group_cols_to_agg:
    aggs = {'grade': ['mean', 'std'], 'age_at_sale': ['mean', 'std'], 'total_sqft': ['mean', 'std'], 'sale_price_log': ['mean']}
    group_aggs = train_copy_for_aggs.groupby(group_col).agg(aggs)
    group_aggs.columns = [f'{c[0]}_{c[1]}_{group_col}' for c in group_aggs.columns]
    all_data = all_data.merge(group_aggs, on=group_col, how='left')
    all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_mean_{group_col}']
    all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_mean_{group_col}']

# Final Cleanup
cols_to_drop = ['sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_half', 'bath_3qtr', 'sqft', 'sqft_fbsmt', 'latitude', 'longitude', 'sale_price_log']
all_data = all_data.drop(columns=cols_to_drop)
for col in all_data.select_dtypes(include='object').columns:
    all_data[col] = pd.Categorical(all_data[col]).codes
all_data.fillna(0, inplace=True)

print("Feature engineering complete.")

--- Starting Block 2: God-Tier Feature Engineering ---
Creating peer-comparison features...
Feature engineering complete.


In [3]:
# =============================================================================
# BLOCK 3: SMART FEATURE SELECTION
# =============================================================================
print("--- Starting Block 3: Smart Feature Selection ---")
temp_X = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
fs_model = lgb.LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)
fs_model.fit(temp_X, y_log)
importances = pd.DataFrame({'feature': temp_X.columns, 'importance': fs_model.feature_importances_}).sort_values('importance', ascending=False)
useless_features = importances[importances['importance'] == 0]['feature'].tolist()
all_data.drop(columns=useless_features, inplace=True)
print(f"Dropped {len(useless_features)} useless features. Proceeding with {all_data.shape[1]-1} features.")
print("\nTop 20 most important features:")
display(importances.head(20))

# Final data separation
X = all_data[all_data['is_train'] == 1].drop(columns=['is_train']).copy()
X_test = all_data[all_data['is_train'] == 0].drop(columns=['is_train']).copy()
X_test = X_test[X.columns]

del temp_X, fs_model, importances, all_data
gc.collect()

--- Starting Block 3: Smart Feature Selection ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4831
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 68
[LightGBM] [Info] Start training from score 13.078327
Dropped 6 useless features. Proceeding with 62 features.

Top 20 most important features:


Unnamed: 0,feature,importance
35,sale_year,484
9,land_val,314
10,imp_val,243
36,age_at_sale,188
4,area,138
2,join_status,138
11,sqft_lot,125
1,sale_warning,105
38,total_sqft,98
3,join_year,83


55

In [4]:
# =============================================================================
# BLOCK 4: MASTER MODELING PIPELINE DEFINITION
# =============================================================================
print("--- Defining the Master Modeling Pipeline ---")

def run_full_pipeline(model_type, X_train, y_train, X_test, tune_params=True):
    """
    A full pipeline to tune, train, and get predictions for a given model type.
    """
    global best_params_lgbm, best_params_xgb # Use global to store params
    
    # --- 1. Hyperparameter Tuning with Optuna (if enabled) ---
    if tune_params:
        def objective(trial):
            train_x, val_x, train_y, val_y = train_test_split(X_train, y_train, test_size=0.2, random_state=RANDOM_STATE)
            
            if model_type == 'lgbm':
                params = {
                    'objective': 'quantile', 'metric': 'quantile', 'random_state': RANDOM_STATE, 'n_jobs': -1,
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
                    'num_leaves': trial.suggest_int('num_leaves', 20, 100),
                    'max_depth': trial.suggest_int('max_depth', 5, 11),
                }
                model_class = lgb.LGBMRegressor
                fit_params = {'callbacks': [lgb.early_stopping(50, verbose=False)]}
            else: # xgb
                params = {
                    'objective': 'reg:quantileerror', 'eval_metric': 'rmse', 'tree_method': 'hist', 'random_state': RANDOM_STATE,
                    'eta': trial.suggest_float('eta', 0.01, 0.08, log=True),
                    'max_depth': trial.suggest_int('max_depth', 4, 9),
                    'min_child_weight': trial.suggest_int('min_child_weight', 2, 12),
                    'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                }
                model_class = xgb.XGBRegressor
                fit_params = {} # Functional API handles this

            preds_lower, preds_upper = np.zeros(len(val_y)), np.zeros(len(val_y))
            for i, alpha in enumerate([0.05, 0.95]):
                if model_type == 'lgbm':
                    model = model_class(**params, n_estimators=1500, alpha=alpha)
                    model.fit(train_x, train_y, eval_set=[(val_x, val_y)], **fit_params)
                else: # Use functional API for XGB
                    dtrain = xgb.DMatrix(train_x, label=train_y)
                    dval = xgb.DMatrix(val_x, label=val_y)
                    xgb_params = {**params, 'quantile_alpha': alpha}
                    model = xgb.train(xgb_params, dtrain, num_boost_round=1500, evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=False)
                
                preds = model.predict(val_x if model_type=='lgbm' else dval)
                if i == 0: preds_lower = preds
                else: preds_upper = preds
            
            return winkler_score(np.expm1(val_y), np.expm1(preds_lower), np.expm1(preds_upper), alpha=0.1)

        print(f"\n--- Tuning {model_type.upper()} with Optuna ---")
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=N_OPTUNA_TRIALS, n_jobs=1)
        best_params = study.best_params
        if model_type == 'lgbm': best_params_lgbm = best_params
        else: best_params_xgb = best_params
        print(f"Best params for {model_type.upper()}: {best_params}")
    
    # --- 2. K-Fold Training with Best Parameters ---
    print(f"\n--- K-Fold Training {model_type.upper()} ---")
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    oof_preds = np.zeros((len(X_train), 2))
    test_preds = np.zeros((len(X_test), 2))
    grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, grade_for_stratify)):
        print(f"  Fold {fold+1}/{N_SPLITS}...")
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]
        
        for i, alpha in enumerate([0.05, 0.95]):
            if model_type == 'lgbm':
                model = lgb.LGBMRegressor(**best_params_lgbm, n_estimators=2000, objective='quantile', metric='quantile', random_state=RANDOM_STATE, n_jobs=-1, alpha=alpha)
                model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], callbacks=[lgb.early_stopping(100, verbose=False)])
            else: # Use functional API for XGB
                dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
                dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
                dtest = xgb.DMatrix(X_test)
                xgb_params = {'objective': 'reg:quantileerror', 'eval_metric': 'rmse', 'tree_method': 'hist', 'random_state': RANDOM_STATE, **best_params_xgb, 'quantile_alpha': alpha}
                model = xgb.train(xgb_params, dtrain, num_boost_round=2000, evals=[(dval, 'eval')], early_stopping_rounds=100, verbose_eval=False)

            oof_preds[val_idx, i] = model.predict(X_val_fold if model_type=='lgbm' else dval)
            test_preds[:, i] += model.predict(X_test if model_type=='lgbm' else dtest) / N_SPLITS
    
    return oof_preds, test_preds

print("Master modeling function defined.")

--- Defining the Master Modeling Pipeline ---
Master modeling function defined.


In [5]:
# =============================================================================
# BLOCK 5: EXECUTE PIPELINES & VALIDATE MODELS
# =============================================================================
start_time = time.time()
best_params_lgbm, best_params_xgb = {}, {}

# --- Run for LightGBM ---
oof_lgbm, test_lgbm = run_full_pipeline('lgbm', X, y_log, X_test, tune_params=True)
score_lgbm, coverage_lgbm = winkler_score(y_true, np.expm1(oof_lgbm[:, 0]), np.expm1(oof_lgbm[:, 1]), COMPETITION_ALPHA, return_coverage=True)
print(f"\n--- LGBM Final OOF Score: {score_lgbm:,.2f} | Coverage: {coverage_lgbm:.2%} ---")

# --- Run for XGBoost ---
oof_xgb, test_xgb = run_full_pipeline('xgb', X, y_log, X_test, tune_params=True)
score_xgb, coverage_xgb = winkler_score(y_true, np.expm1(oof_xgb[:, 0]), np.expm1(oof_xgb[:, 1]), COMPETITION_ALPHA, return_coverage=True)
print(f"\n--- XGBoost Final OOF Score: {score_xgb:,.2f} | Coverage: {coverage_xgb:.2%} ---")

# --- Run for Neural Network (we'll skip tuning for simplicity/speed here) ---
# [Note: A full NN pipeline would be similar but is omitted here to focus on tree models first]

total_time = time.time() - start_time
print(f"\nTotal training time: {total_time/60:.2f} minutes")

[I 2025-07-04 15:27:56,317] A new study created in memory with name: no-name-8349afc2-1b40-4496-ae53-877aeb1f489b



--- Tuning LGBM with Optuna ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676




[I 2025-07-04 15:28:10,211] Trial 0 finished with value: 343699.4758729358 and parameters: {'learning_rate': 0.022819559561161905, 'num_leaves': 31, 'max_depth': 6}. Best is trial 0 with value: 343699.4758729358.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117






















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676
























[I 2025-07-04 15:28:23,539] Trial 1 finished with value: 343914.94420117163 and parameters: {'learning_rate': 0.029761555792200516, 'num_leaves': 92, 'max_depth': 6}. Best is trial 0 with value: 343699.4758729358.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676






[I 2025-07-04 15:28:33,594] Trial 2 finished with value: 347137.8581264501 and parameters: {'learning_rate': 0.046942429888129494, 'num_leaves': 87, 'max_depth': 8}. Best is trial 0 with value: 343699.4758729358.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:29:02,536] Trial 3 finished with value: 346846.0544491997 and parameters: {'learning_rate': 0.015940759993532935, 'num_leaves': 46, 'max_depth': 9}. Best is trial 0 with value: 343699.4758729358.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676
























[I 2025-07-04 15:29:14,707] Trial 4 finished with value: 343515.5695221941 and parameters: {'learning_rate': 0.03376581998875017, 'num_leaves': 80, 'max_depth': 6}. Best is trial 4 with value: 343515.5695221941.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:29:26,767] Trial 5 finished with value: 342247.8150911643 and parameters: {'learning_rate': 0.03002646736410111, 'num_leaves': 31, 'max_depth': 7}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:29:40,535] Trial 6 finished with value: 343822.0307404722 and parameters: {'learning_rate': 0.027465352920190967, 'num_leaves': 41, 'max_depth': 10}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117














[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676
















[I 2025-07-04 15:29:51,942] Trial 7 finished with value: 349038.8338172548 and parameters: {'learning_rate': 0.019801994661111485, 'num_leaves': 31, 'max_depth': 5}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:30:07,870] Trial 8 finished with value: 346252.69706655224 and parameters: {'learning_rate': 0.017430657040933037, 'num_leaves': 56, 'max_depth': 10}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:30:22,941] Trial 9 finished with value: 348693.4023098351 and parameters: {'learning_rate': 0.014851337275838762, 'num_leaves': 46, 'max_depth': 10}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:30:30,679] Trial 10 finished with value: 345030.3339614305 and parameters: {'learning_rate': 0.07993894472866479, 'num_leaves': 20, 'max_depth': 8}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117










[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676










[I 2025-07-04 15:30:42,474] Trial 11 finished with value: 344343.59704056237 and parameters: {'learning_rate': 0.03881058782837408, 'num_leaves': 75, 'max_depth': 7}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117






















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676




















[I 2025-07-04 15:31:05,958] Trial 12 finished with value: 342752.2065344765 and parameters: {'learning_rate': 0.05221019946313308, 'num_leaves': 73, 'max_depth': 5}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117






















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676




















[I 2025-07-04 15:31:16,068] Trial 13 finished with value: 343573.2755760743 and parameters: {'learning_rate': 0.061094484453452315, 'num_leaves': 63, 'max_depth': 5}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:31:32,205] Trial 14 finished with value: 351737.47426312877 and parameters: {'learning_rate': 0.011117617480853858, 'num_leaves': 66, 'max_depth': 7}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676






















[I 2025-07-04 15:31:42,075] Trial 15 finished with value: 343145.69649429497 and parameters: {'learning_rate': 0.046183875002923, 'num_leaves': 73, 'max_depth': 5}. Best is trial 5 with value: 342247.8150911643.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:31:52,588] Trial 16 finished with value: 342152.5168760193 and parameters: {'learning_rate': 0.057302020929515175, 'num_leaves': 20, 'max_depth': 7}. Best is trial 16 with value: 342152.5168760193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:31:59,461] Trial 17 finished with value: 345500.29386825033 and parameters: {'learning_rate': 0.07941526998079967, 'num_leaves': 20, 'max_depth': 8}. Best is trial 16 with value: 342152.5168760193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:32:07,410] Trial 18 finished with value: 344193.1704471502 and parameters: {'learning_rate': 0.059994643301250206, 'num_leaves': 30, 'max_depth': 7}. Best is trial 16 with value: 342152.5168760193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:32:23,044] Trial 19 finished with value: 343618.5250389672 and parameters: {'learning_rate': 0.0363051030088405, 'num_leaves': 37, 'max_depth': 11}. Best is trial 16 with value: 342152.5168760193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:32:36,550] Trial 20 finished with value: 344655.69354662264 and parameters: {'learning_rate': 0.026555650189491155, 'num_leaves': 52, 'max_depth': 9}. Best is trial 16 with value: 342152.5168760193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:32:46,035] Trial 21 finished with value: 342042.00508322107 and parameters: {'learning_rate': 0.05717788658709401, 'num_leaves': 25, 'max_depth': 6}. Best is trial 21 with value: 342042.00508322107.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:33:09,998] Trial 22 finished with value: 343068.7556303798 and parameters: {'learning_rate': 0.06215998497916692, 'num_leaves': 25, 'max_depth': 7}. Best is trial 21 with value: 342042.00508322107.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676






[I 2025-07-04 15:33:20,552] Trial 23 finished with value: 342483.8916024447 and parameters: {'learning_rate': 0.04207264804328118, 'num_leaves': 35, 'max_depth': 6}. Best is trial 21 with value: 342042.00508322107.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4783
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676


[I 2025-07-04 15:33:29,039] Trial 24 finished with value: 345307.94730001275 and parameters: {'learning_rate': 0.06699324698223576, 'num_leaves': 26, 'max_depth': 7}. Best is trial 21 with value: 342042.00508322107.


Best params for LGBM: {'learning_rate': 0.05717788658709401, 'num_leaves': 25, 'max_depth': 6}

--- K-Fold Training LGBM ---
  Fold 1/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4780
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4780
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.177406






  Fold 2/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4774
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4774
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.169683






  Fold 3/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4777
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4777
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.176676














  Fold 4/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4789
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117








[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4789
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.173185








  Fold 5/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4772
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 12.128117


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4772
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 62
[LightGBM] [Info] Start training from score 14.178583








[I 2025-07-04 15:34:39,035] A new study created in memory with name: no-name-8df1ac14-ff70-4754-9b9e-6a5cfbbf5dd4



--- LGBM Final OOF Score: 342,762.54 | Coverage: 87.28% ---

--- Tuning XGB with Optuna ---


[I 2025-07-04 15:35:36,119] Trial 0 finished with value: 366768.29808828124 and parameters: {'eta': 0.015096849443248388, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.9308870373132013, 'colsample_bytree': 0.8055747242110547}. Best is trial 0 with value: 366768.29808828124.
[I 2025-07-04 15:36:14,763] Trial 1 finished with value: 361752.86150214844 and parameters: {'eta': 0.015757605043383666, 'max_depth': 4, 'min_child_weight': 9, 'subsample': 0.8471244575557392, 'colsample_bytree': 0.7059182011847058}. Best is trial 1 with value: 361752.86150214844.
[I 2025-07-04 15:37:09,646] Trial 2 finished with value: 343055.5725431641 and parameters: {'eta': 0.061077383236891054, 'max_depth': 4, 'min_child_weight': 7, 'subsample': 0.9662363607709517, 'colsample_bytree': 0.7759612179896368}. Best is trial 2 with value: 343055.5725431641.
[I 2025-07-04 15:37:57,056] Trial 3 finished with value: 351254.9486058594 and parameters: {'eta': 0.017898172599786953, 'max_depth': 8, 'min_child_weigh

Best params for XGB: {'eta': 0.03953045149173268, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.7813690804126607, 'colsample_bytree': 0.7420977055135587}

--- K-Fold Training XGB ---
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

--- XGBoost Final OOF Score: 339,324.54 | Coverage: 87.35% ---

Total training time: 31.83 minutes


In [6]:
# =============================================================================
# BLOCK 6: FINAL ENSEMBLE, CALIBRATION, AND SUBMISSION
# =============================================================================
print("\n--- Starting Block 6: Final Ensembling and Submission ---")

# --- Find Best Ensemble Weights on OOF Predictions ---
print("Finding best ensemble weights...")
best_w = 0.5
best_score = float('inf')

for w in np.arange(0, 1.01, 0.01):
    ensemble_oof = w * oof_lgbm + (1-w) * oof_xgb
    score, _ = winkler_score(y_true, np.expm1(ensemble_oof[:, 0]), np.expm1(ensemble_oof[:, 1]), COMPETITION_ALPHA, return_coverage=True)
    if score < best_score:
        best_score = score
        best_w = w

print(f"Best weight for LGBM: {best_w:.2f}, for XGB: {1-best_w:.2f}")
print(f"Best possible ensemble OOF score: {best_score:,.2f}")

# --- Create Final Ensemble and Calibrate ---
oof_ensemble = best_w * oof_lgbm + (1-best_w) * oof_xgb
test_ensemble = best_w * test_lgbm + (1-best_w) * test_xgb

oof_lower = np.expm1(oof_ensemble[:, 0])
oof_upper = np.expm1(oof_ensemble[:, 1])
oof_upper = np.maximum(oof_lower, oof_upper)

_, coverage = winkler_score(y_true, oof_lower, oof_upper, COMPETITION_ALPHA, return_coverage=True)
print(f"Final Ensemble OOF Coverage (before calib): {coverage:.2%}")

best_factor = 1.0
best_coverage_diff = abs(coverage - 0.90)
for factor in np.arange(0.9, 1.2, 0.001):
    center = (oof_lower + oof_upper) / 2
    width = oof_upper - oof_lower
    _, current_coverage = winkler_score(y_true, center - (width / 2) * factor, center + (width / 2) * factor, alpha=0.1, return_coverage=True)
    if abs(current_coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(current_coverage - 0.90)
        best_factor = factor
print(f"Best calibration factor found: {best_factor:.3f}")

# --- Create Final Submission File ---
test_lower = np.expm1(test_ensemble[:, 0])
test_upper = np.expm1(test_ensemble[:, 1])
test_center = (test_lower + test_upper) / 2
test_width = test_upper - test_lower
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (test_width / 2) * best_factor
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)

submission_df = pd.DataFrame({'id': X_test.index, 'pi_lower': calibrated_lower, 'pi_upper': calibrated_upper})
submission_df.to_csv('submission_final_validated_ensemble.csv', index=False)
print("\n'submission_final_validated_ensemble.csv' created successfully!")
display(submission_df.head())


--- Starting Block 6: Final Ensembling and Submission ---
Finding best ensemble weights...
Best weight for LGBM: 0.41, for XGB: 0.59
Best possible ensemble OOF score: 335,867.16
Final Ensemble OOF Coverage (before calib): 87.72%
Best calibration factor found: 1.068

'submission_final_validated_ensemble.csv' created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,826999.910002,1192618.0
1,200001,553928.390664,803034.3
2,200002,431634.583071,713825.7
3,200003,299586.953551,458795.4
4,200004,387417.277185,707046.8
