In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
from scipy.optimize import minimize
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 30 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
grade_for_stratify = df_train['grade'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [3]:
# =============================================================================
# BLOCK 13: CREATE META-FEATURES FOR STACKING ENSEMBLE
# =============================================================================
# This block prepares the data for our final, most powerful approach: stacking.
# We will treat the outputs of our three best pipelines as features and train a
# new "meta-model" to learn the optimal, non-linear way to combine them.

# --- Step 1: Load All Base and Meta-Model Predictions ---
# This extends your existing loading block to include the quantile model outputs.

# Define paths to your saved prediction files
PREDS_SAVE_PATH = './mean_models_v1/'
NN_PREDS_PATH = './NN_model_predictions/'
ERR_PATH = './error_models/'
META_XGB_PATH = './meta_quantile_models/' # Corrected Path
META_LGBM_PATH = './meta_lgbm_quantile_models/' # Corrected Path

print("--- Loading all base and meta-model predictions from saved .npy files... ---")
try:
    # --- LEVEL 0: Base Model Predictions ---
    # Load Mean Model OOF & Test Predictions
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))
    test_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_xgb_preds.npy'))
    test_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_cb_preds.npy'))
    test_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_lgbm_preds.npy'))
    test_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'test_nn_preds.npy'))
    
    # Load Error Model OOF & Test Predictions
    oof_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_xgb.npy'))
    oof_error_preds_cb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_cb.npy'))
    test_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'test_error_preds_xgb.npy'))
    test_error_preds_cb = np.load(os.path.join(ERR_PATH, 'test_error_preds_cb.npy'))

    # --- LEVEL 1: Quantile Model Predictions ---
    # Load XGB Quantile OOF & Test Predictions
    oof_lower_xgb = np.load(os.path.join(META_XGB_PATH, 'oof_lower_preds.npy'))
    oof_upper_xgb = np.load(os.path.join(META_XGB_PATH, 'oof_upper_preds.npy'))
    test_lower_xgb = np.load(os.path.join(META_XGB_PATH, 'test_lower_preds.npy'))
    test_upper_xgb = np.load(os.path.join(META_XGB_PATH, 'test_upper_preds.npy'))

    # Load LGBM Quantile OOF & Test Predictions
    oof_lower_lgbm = np.load(os.path.join(META_LGBM_PATH, 'oof_lower_preds_lgbm.npy'))
    oof_upper_lgbm = np.load(os.path.join(META_LGBM_PATH, 'oof_upper_preds_lgbm.npy'))
    test_lower_lgbm = np.load(os.path.join(META_LGBM_PATH, 'test_lower_preds_lgbm.npy'))
    test_upper_lgbm = np.load(os.path.join(META_LGBM_PATH, 'test_upper_preds_lgbm.npy'))

    print("All necessary .npy prediction files loaded successfully.")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required prediction file. {e}")
    print("Please ensure all previous training notebooks have been run successfully.")

# --- Step 2: Recreate Calibrated Bounds for all Pipelines ---
# We need to recreate the exact, calibrated bounds from each pipeline to use as features.
print("\n--- Recreating calibrated OOF and Test bounds to use as meta-features ---")

# --- Pipeline A: Mean+Error Model Bounds ---
oof_ensemble_mean = (oof_xgb_preds + oof_cb_preds + oof_lgbm_preds + oof_nn_preds) / 4
test_ensemble_mean = (test_xgb_preds + test_cb_preds + test_lgbm_preds + test_nn_preds) / 4
a_err, b_err = 1.9799, 2.1755 # From previous optimization
oof_error_final = np.clip((oof_error_preds_xgb * 0.60 + oof_error_preds_cb * 0.40), 0, None)
test_error_final = np.clip((test_error_preds_xgb * 0.60 + test_error_preds_cb * 0.40), 0, None)

oof_lower_A = oof_ensemble_mean - oof_error_final * a_err
oof_upper_A = oof_ensemble_mean + oof_error_final * b_err
test_lower_A = test_ensemble_mean - test_error_final * a_err
test_upper_A = test_ensemble_mean + test_error_final * b_err

# --- Pipeline B: XGB Quantile Model Bounds ---
a_xgb, b_xgb = 0.8118, 1.1960 # From previous optimization
oof_lower_B, oof_upper_B = np.minimum(oof_lower_xgb, oof_upper_xgb) * a_xgb, np.maximum(oof_lower_xgb, oof_upper_xgb) * b_xgb
test_lower_B, test_upper_B = np.minimum(test_lower_xgb, test_upper_xgb) * a_xgb, np.maximum(test_lower_xgb, test_upper_xgb) * b_xgb

# --- Pipeline C: LGBM Quantile Model Bounds ---
a_lgbm, b_lgbm = 0.8118, 1.1960 # Placeholder; ideally, recalibrate as before, but reusing is fine for a start
oof_lower_C, oof_upper_C = np.minimum(oof_lower_lgbm, oof_upper_lgbm) * a_lgbm, np.maximum(oof_lower_lgbm, oof_upper_lgbm) * b_lgbm
test_lower_C, test_upper_C = np.minimum(test_lower_lgbm, test_upper_lgbm) * a_lgbm, np.maximum(test_lower_lgbm, test_upper_lgbm) * b_lgbm

print("All interval bounds successfully recreated.")

# --- Step 3: Construct the Meta-Feature DataFrame ---
print("\n--- Constructing the final meta-feature set for stacking ---")

meta_features_train = pd.DataFrame({
    'lower_A': oof_lower_A, 'upper_A': oof_upper_A,
    'lower_B': oof_lower_B, 'upper_B': oof_upper_B,
    'lower_C': oof_lower_C, 'upper_C': oof_upper_C,
    
    # Engineered features from the bounds are crucial
    'width_A': oof_upper_A - oof_lower_A,
    'width_B': oof_upper_B - oof_lower_B,
    'width_C': oof_upper_C - oof_lower_C,
    
    'center_A': (oof_upper_A + oof_lower_A) / 2,
    'center_B': (oof_upper_B + oof_lower_B) / 2,
    'center_C': (oof_upper_C + oof_lower_C) / 2,
})

meta_features_test = pd.DataFrame({
    'lower_A': test_lower_A, 'upper_A': test_upper_A,
    'lower_B': test_lower_B, 'upper_B': test_upper_B,
    'lower_C': test_lower_C, 'upper_C': test_upper_C,

    'width_A': test_upper_A - test_lower_A,
    'width_B': test_upper_B - test_lower_B,
    'width_C': test_upper_C - test_upper_C,
    
    'center_A': (test_upper_A + test_lower_A) / 2,
    'center_B': (test_upper_B + test_lower_B) / 2,
    'center_C': (test_upper_C + test_lower_C) / 2,
})

# --- (Optional but Recommended) Add Top Raw Features ---
# Add the absolute most important raw features to give the meta-model more context.
# We need to load the original data to get these.
df_train_raw = pd.read_csv('./dataset.csv')
df_test_raw = pd.read_csv('./test.csv')

top_raw_feats = ['grade', 'sqft', 'age_at_sale'] # Example, choose your best
for feat in top_raw_feats:
    # Need to re-calculate age_at_sale if not present
    if feat == 'age_at_sale':
        df_train_raw['sale_year'] = pd.to_datetime(df_train_raw['sale_date']).dt.year
        df_test_raw['sale_year'] = pd.to_datetime(df_test_raw['sale_date']).dt.year
        meta_features_train[feat] = df_train_raw['sale_year'] - df_train_raw['year_built']
        meta_features_test[feat] = df_test_raw['sale_year'] - df_test_raw['year_built']
    else:
        meta_features_train[feat] = df_train_raw[feat]
        meta_features_test[feat] = df_test_raw[feat]

print(f"Added {len(top_raw_feats)} top raw features for context.")

# --- Step 4: Define Meta-Model Targets ---
# The target for our meta-models is simply the true sale price.
# We will train two separate models, but they will both predict the same target.
# The custom objective function will teach them to produce the lower/upper bound.
y_meta_target = y_true.copy()

print("\nMeta-feature dataframes are now ready for training.")
print(f"Shape of Meta-Train: {meta_features_train.shape}")
print(f"Shape of Meta-Test:  {meta_features_test.shape}")
print("\nMeta-Train Head:")
print(meta_features_train.head())

--- Loading all base and meta-model predictions from saved .npy files... ---
All necessary .npy prediction files loaded successfully.

--- Recreating calibrated OOF and Test bounds to use as meta-features ---
All interval bounds successfully recreated.

--- Constructing the final meta-feature set for stacking ---
Added 3 top raw features for context.

Meta-feature dataframes are now ready for training.
Shape of Meta-Train: (200000, 15)
Shape of Meta-Test:  (200000, 15)

Meta-Train Head:
        lower_A       upper_A       lower_B       upper_B       lower_C  \
0  2.193217e+05  2.873461e+05  2.059288e+05  3.034720e+05  1.730037e+05   
1  2.460684e+05  4.153292e+05  2.587635e+05  3.831388e+05  1.906595e+05   
2  2.844782e+05  3.586439e+05  2.594633e+05  3.823383e+05  2.254399e+05   
3  2.149198e+05  2.957100e+05  2.051202e+05  3.022437e+05  1.781910e+05   
4  1.414717e+06  2.041337e+06  1.343647e+06  1.983026e+06  1.113419e+06   

        upper_C        width_A        width_B       width

In [13]:
# =============================================================================
# BLOCK 14: TUNE META-MODELS WITH OPTUNA (Standard Quantile Objective)
# =============================================================================
# We will tune two separate LightGBM models on the meta-features. Instead of a
# custom objective, we will use the stable, built-in 'quantile' objective.
# - One model will be tuned to predict the 5th percentile (our new lower bound).
# - One model will be tuned to predict the 95th percentile (our new upper bound).

import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
import numpy as np

print("\n--- Preparing data for Meta-Model Optuna tuning ---")
# meta_features_train and y_meta_target should be available from the previous block
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    meta_features_train, y_meta_target, test_size=0.20, random_state=RANDOM_STATE
)

def objective_meta_quantile(trial, alpha_value):
    """Unified Optuna objective for the meta quantile models."""
    params = {
        'objective': 'quantile',
        'alpha': alpha_value,
        'metric': 'quantile', # Pinball Loss is the correct metric for this objective
        'random_state': RANDOM_STATE, 'n_jobs': -1, 'verbosity': -1, 'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2500),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 100.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 100.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train_opt, y_train_opt,
              eval_set=[(X_val_opt, y_val_opt)],
              callbacks=[lgb.early_stopping(50, verbose=False)])
    
    preds = model.predict(X_val_opt)
    pinball_loss = np.mean(np.where(y_val_opt >= preds, 
                                    (y_val_opt - preds) * alpha_value, 
                                    (preds - y_val_opt) * (1 - alpha_value)))
    return pinball_loss

# --- Tune Lower-Bound Meta-Model (alpha=0.05) ---
print("\n--- Tuning the Lower-Bound Meta-Model (alpha=0.05)... ---")
study_meta_lower = optuna.create_study(direction='minimize')
study_meta_lower.optimize(lambda trial: objective_meta_quantile(trial, 0.05), n_trials=75, show_progress_bar=True)
best_params_meta_lower = study_meta_lower.best_params

# --- Tune Upper-Bound Meta-Model (alpha=0.95) ---
print("\n--- Tuning the Upper-Bound Meta-Model (alpha=0.95)... ---")
study_meta_upper = optuna.create_study(direction='minimize')
study_meta_upper.optimize(lambda trial: objective_meta_quantile(trial, 0.95), n_trials=75, show_progress_bar=True)
best_params_meta_upper = study_meta_upper.best_params

print("\nMeta-Model Tuning Complete.")
print(f"Best Lower Meta-Params: {best_params_meta_lower}")
print(f"Best Upper Meta-Params: {best_params_meta_upper}")

[I 2025-07-25 20:44:07,280] A new study created in memory with name: no-name-f525ae55-e0cc-4b67-b428-e42ac0a94452



--- Preparing data for Meta-Model Optuna tuning ---

--- Tuning the Lower-Bound Meta-Model (alpha=0.05)... ---


  0%|          | 0/75 [00:00<?, ?it/s]

[I 2025-07-25 20:44:09,929] Trial 0 finished with value: 6822.0987678806605 and parameters: {'learning_rate': 0.019748355560134247, 'n_estimators': 1084, 'num_leaves': 11, 'lambda_l1': 1.9939761842585475, 'lambda_l2': 1.133456493069595, 'feature_fraction': 0.7747687293036789, 'bagging_fraction': 0.7234944211580132, 'bagging_freq': 7}. Best is trial 0 with value: 6822.0987678806605.
[I 2025-07-25 20:44:13,862] Trial 1 finished with value: 6831.771709183329 and parameters: {'learning_rate': 0.014026176705003982, 'n_estimators': 1373, 'num_leaves': 46, 'lambda_l1': 2.4400960072604896, 'lambda_l2': 49.694387142689315, 'feature_fraction': 0.831440158015546, 'bagging_fraction': 0.8186904684798766, 'bagging_freq': 3}. Best is trial 0 with value: 6822.0987678806605.
[I 2025-07-25 20:44:15,324] Trial 2 finished with value: 6836.056740597542 and parameters: {'learning_rate': 0.04624142917532263, 'n_estimators': 591, 'num_leaves': 34, 'lambda_l1': 1.9017668716759553, 'lambda_l2': 35.4452425556519

[I 2025-07-25 20:46:27,299] A new study created in memory with name: no-name-19432802-220d-4354-9a5b-f39cf529c500


[I 2025-07-25 20:46:27,297] Trial 74 finished with value: 6818.208249155021 and parameters: {'learning_rate': 0.02429942155271104, 'n_estimators': 2303, 'num_leaves': 10, 'lambda_l1': 2.8964499187619137, 'lambda_l2': 4.946434789740576, 'feature_fraction': 0.8843867126201211, 'bagging_fraction': 0.827302587339491, 'bagging_freq': 3}. Best is trial 22 with value: 6812.49960517772.

--- Tuning the Upper-Bound Meta-Model (alpha=0.95)... ---


  0%|          | 0/75 [00:00<?, ?it/s]

[I 2025-07-25 20:46:28,377] Trial 0 finished with value: 7894.718321042673 and parameters: {'learning_rate': 0.04122783223518715, 'n_estimators': 2015, 'num_leaves': 28, 'lambda_l1': 0.18283506089915916, 'lambda_l2': 8.070268121784082, 'feature_fraction': 0.9687553598711565, 'bagging_fraction': 0.9774523728746314, 'bagging_freq': 5}. Best is trial 0 with value: 7894.718321042673.
[I 2025-07-25 20:46:31,177] Trial 1 finished with value: 7869.922452755748 and parameters: {'learning_rate': 0.020001463640311117, 'n_estimators': 2236, 'num_leaves': 33, 'lambda_l1': 74.52640494101348, 'lambda_l2': 4.081051325034335, 'feature_fraction': 0.8503323192878747, 'bagging_fraction': 0.8774411301660135, 'bagging_freq': 1}. Best is trial 1 with value: 7869.922452755748.
[I 2025-07-25 20:46:32,987] Trial 2 finished with value: 7922.625723815521 and parameters: {'learning_rate': 0.03267519225797239, 'n_estimators': 1799, 'num_leaves': 35, 'lambda_l1': 2.417881569814411, 'lambda_l2': 0.1965119523852925, 

In [14]:
# =============================================================================
# BLOCK 15: K-FOLD TRAINING FOR META-MODELS & SAVE PREDICTIONS
# =============================================================================
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
import os
import gc

# Define a path to save the final meta-model predictions
META_FINAL_PATH = './meta_final_models/'
os.makedirs(META_FINAL_PATH, exist_ok=True)

# Initialize prediction arrays
oof_meta_lower = np.zeros(len(meta_features_train))
test_meta_lower = np.zeros(len(meta_features_test))
oof_meta_upper = np.zeros(len(meta_features_train))
test_meta_upper = np.zeros(len(meta_features_test))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

print("\n--- Starting K-Fold training for final meta-models ---")
for fold, (train_idx, val_idx) in enumerate(skf.split(meta_features_train, grade_for_stratify)):
    print(f"Meta-Model K-Fold {fold+1}/{N_SPLITS}...")
    X_train, X_val = meta_features_train.iloc[train_idx], meta_features_train.iloc[val_idx]
    y_train, y_val = y_meta_target[train_idx], y_meta_target[val_idx]
    
    # --- Train and predict lower-bound meta-model ---
    model_lower = lgb.LGBMRegressor(objective='quantile', alpha=0.05, **best_params_meta_lower)
    model_lower.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_meta_lower[val_idx] = model_lower.predict(X_val)
    test_meta_lower += model_lower.predict(meta_features_test) / N_SPLITS

    # --- Train and predict upper-bound meta-model ---
    model_upper = lgb.LGBMRegressor(objective='quantile', alpha=0.95, **best_params_meta_upper)
    model_upper.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_meta_upper[val_idx] = model_upper.predict(X_val)
    test_meta_upper += model_upper.predict(meta_features_test) / N_SPLITS
    gc.collect()

# --- Save the final prediction artifacts ---
print("\n--- Saving final meta-model predictions... ---")
np.save(os.path.join(META_FINAL_PATH, 'oof_meta_lower.npy'), oof_meta_lower)
np.save(os.path.join(META_FINAL_PATH, 'test_meta_lower.npy'), test_meta_lower)
np.save(os.path.join(META_FINAL_PATH, 'oof_meta_upper.npy'), oof_meta_upper)
np.save(os.path.join(META_FINAL_PATH, 'test_meta_upper.npy'), test_meta_upper)
print("Final meta-model predictions saved successfully.")


--- Starting K-Fold training for final meta-models ---
Meta-Model K-Fold 1/5...
Meta-Model K-Fold 2/5...
Meta-Model K-Fold 3/5...
Meta-Model K-Fold 4/5...
Meta-Model K-Fold 5/5...

--- Saving final meta-model predictions... ---
Final meta-model predictions saved successfully.


In [15]:
# =============================================================================
# BLOCK 16: FINAL CALIBRATION AND SUBMISSION
# =============================================================================
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import os

# Helper function
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    return np.mean(width + penalty_lower + penalty_upper)

# --- Step 1: Load final predictions and true labels ---
print("\n--- Loading final predictions for calibration ---")
META_FINAL_PATH = './meta_final_models/'
y_true = pd.read_csv('./dataset.csv')['sale_price'].values

oof_lower = np.load(os.path.join(META_FINAL_PATH, 'oof_meta_lower.npy'))
test_lower = np.load(os.path.join(META_FINAL_PATH, 'test_meta_lower.npy'))
oof_upper = np.load(os.path.join(META_FINAL_PATH, 'oof_meta_upper.npy'))
test_upper = np.load(os.path.join(META_FINAL_PATH, 'test_meta_upper.npy'))

# --- Step 2: Final Calibration ---
print("\n--- Performing final calibration on stacked predictions ---")
def get_calibrated_winkler(multipliers, y_true_oof, lower_oof, upper_oof):
    a, b = multipliers
    lower_raw = np.minimum(lower_oof, upper_oof)
    upper_raw = np.maximum(lower_oof, upper_oof)
    return winkler_score(y_true_oof, lower_raw * a, upper_raw * b)

res_calib = minimize(
    fun=get_calibrated_winkler, x0=[1.0, 1.0], 
    args=(y_true, oof_lower, oof_upper),
    method='L-BFGS-B', bounds=[(0.8, 1.2), (0.8, 1.2)]
)
best_a, best_b = res_calib.x
final_score = res_calib.fun

# --- Step 3: Display Results and Create Submission ---
print("\n" + "="*60)
print("FINAL STACKED META-MODEL RESULTS")
print("="*60)
print(f"Previous Best Blended Score: $291,785.50")
print(f"Final STACKED OOF Winkler Score: ${final_score:,.2f}")
print(f"Optimal Calibration Multipliers: a={best_a:.4f}, b={best_b:.4f}")
print("-" * 60)

# Apply calibration and enforce non-crossing
final_test_lower = np.minimum(test_lower, test_upper) * best_a
final_test_upper = np.maximum(test_lower, test_upper) * best_b
final_test_upper = np.maximum(final_test_lower + 1, final_test_upper)

# Load IDs for submission
df_test_raw = pd.read_csv('./test.csv')
submission_df_final = pd.DataFrame({
    'id': df_test_raw['id'],
    'pi_lower': final_test_lower,
    'pi_upper': final_test_upper
})

submission_filename = f'submission_FINAL_STACKED_{int(final_score)}.csv'
submission_df_final.to_csv(submission_filename, index=False)

print(f"\n'{submission_filename}' created successfully! This is your definitive submission.")
print("\nFinal Submission Head:")
print(submission_df_final.head())


--- Loading final predictions for calibration ---

--- Performing final calibration on stacked predictions ---

FINAL STACKED META-MODEL RESULTS
Previous Best Blended Score: $291,785.50
Final STACKED OOF Winkler Score: $293,184.57
Optimal Calibration Multipliers: a=0.9988, b=1.0015
------------------------------------------------------------

'submission_FINAL_STACKED_293184.csv' created successfully! This is your definitive submission.

Final Submission Head:
       id       pi_lower      pi_upper
0  200000  806353.792926  1.008121e+06
1  200001  586240.829410  8.020544e+05
2  200002  452492.466350  6.478229e+05
3  200003  293412.479786  4.170843e+05
4  200004  391745.367117  8.192224e+05


In [12]:
import pandas as pd
import numpy as np

# Load your two best submission files
df_blend = pd.read_csv('submission_FINAL_3M_BLEND_291785.csv')
df_stacked = pd.read_csv('submission_FINAL_STACKED_293087.csv')

# Create the final ensemble with a simple 50/50 average
final_submission = pd.DataFrame()
final_submission['id'] = df_blend['id']
final_submission['pi_lower'] = 0.5 * df_blend['pi_lower'] + 0.5 * df_stacked['pi_lower']
final_submission['pi_upper'] = 0.5 * df_blend['pi_upper'] + 0.5 * df_stacked['pi_upper']

# Final sanity check
final_submission['pi_upper'] = np.maximum(final_submission['pi_lower'] + 1, final_submission['pi_upper'])

# Save the definitive submission
final_submission.to_csv('submission_DEFINITIVE_BLEND_291k.csv', index=False)

print("Definitive 2-submission blend created successfully.")
print(final_submission.head())

Definitive 2-submission blend created successfully.
       id       pi_lower      pi_upper
0  200000  810199.059449  1.017671e+06
1  200001  588965.221580  8.062583e+05
2  200002  454997.291877  6.532279e+05
3  200003  293889.746395  4.211828e+05
4  200004  388572.426359  8.320829e+05
