In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
from scipy.optimize import minimize
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 30 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
grade_for_stratify = df_train['grade'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [3]:
# =============================================================================
# BLOCK 3: LOAD ALL PRE-TRAINED MODEL PREDICTIONS
# =============================================================================

# Define paths to your saved prediction files
PREDS_SAVE_PATH = './mean_models_v1/' # For XGB and CatBoost preds
NN_PREDS_PATH = './NN_model_predictions/' # For NN preds
ERR_PATH = './error_models/' # For error preds

print("--- Loading all base model predictions from saved .npy files... ---")
try:
    # Load Mean Model OOF (Out-of-Fold) Predictions
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))

    oof_error_preds_cb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_cb.npy'))
    oof_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'oof_error_preds_lgbm.npy'))
    oof_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_xgb.npy'))
    
    # Load Mean Model Test Predictions
    test_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_xgb_preds.npy'))
    test_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_cb_preds.npy'))
    test_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_lgbm_preds.npy'))
    test_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'test_nn_preds.npy'))

    test_error_preds_cb = np.load(os.path.join(ERR_PATH, 'test_error_preds_cb.npy'))
    test_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'test_error_preds_lgbm.npy'))
    test_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'test_error_preds_xgb.npy'))

     
    
    print("All MEAN AND ERROR models predictions loaded successfully.")
    
except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required prediction file. {e}")
    print("Please ensure you have run all training notebooks and saved their predictions first.")

--- Loading all base model predictions from saved .npy files... ---
All MEAN AND ERROR models predictions loaded successfully.


In [4]:
# =============================================================================
# BLOCK 5.5 (Corrected): ELITE FEATURE SET WITH VOLATILITY FEATURES
# =============================================================================
# This block creates the definitive feature set for the quantile models.
#
# NEW IMPROVEMENT: We are now engineering features specifically designed to
# capture price VOLATILITY. This gives the quantile models a direct signal
# about which groups of houses have a wider or narrower price distribution,
# which is exactly what they need to predict the tails accurately.
#
# We also drastically reduce the number of raw features to N=25 to combat
# overfitting, forcing the model to rely on these powerful new signals.

# --- Step 1: Generate Feature Importance Ranking (Same as before) ---
print("\n--- Training a simple model to determine feature importance... ---")
dtrain_importance = xgb.DMatrix(X, label=y_true)
params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'seed': RANDOM_STATE, 'n_jobs': -1}
bst_for_importance = xgb.train(params, dtrain_importance, num_boost_round=500, verbose_eval=False)
importance_scores = bst_for_importance.get_score(importance_type='gain')
feature_importance = pd.DataFrame({
    'Feature': importance_scores.keys(),
    'Importance': importance_scores.values()
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)
print("Feature importance ranking created.")

# --- Step 2: Build the ELITE Feature Set ---
print("\n--- Building an ELITE feature set with reduced features and stacked predictions ---")

# Drastically reduce feature count to combat overfitting.
N_TOP_FEATURES = 25
elite_raw_features = feature_importance['Feature'].head(N_TOP_FEATURES).tolist()
print(f"Selected the top {N_TOP_FEATURES} raw features to reduce noise.")

# Create the base dataframes
X_for_quantile = X[elite_raw_features].copy()
X_test_for_quantile = X_test[elite_raw_features].copy()

# Add the stacked predictions (mean and error models)
# These are still the most powerful features.
# (Code to add oof_mean_*, oof_error_*, test_mean_*, test_error_* preds)
for pred_name, oof_pred, test_pred in [
    ('oof_mean_xgb', oof_xgb_preds, test_xgb_preds), ('oof_mean_cb', oof_cb_preds, test_cb_preds),
    ('oof_mean_lgbm', oof_lgbm_preds, test_lgbm_preds), ('oof_mean_nn', oof_nn_preds, test_nn_preds),
    ('oof_error_xgb', oof_error_preds_xgb, test_error_preds_xgb), ('oof_error_cb', oof_error_preds_cb, test_error_preds_cb),
    ('oof_error_lgbm', oof_error_preds_lgbm, test_error_preds_lgbm)
]:
    X_for_quantile[pred_name] = oof_pred
    X_test_for_quantile[pred_name] = test_pred

# --- Step 3: Create and Add Volatility Features (The Leakage-Proof Way) ---
# This is the key improvement. We must create these features inside a CV loop.
print("\n--- Engineering and adding VOLATILITY features (leakage-proof method) ---")

# We will create OOF features for the train set and a single set for the test set.
# Initialize new feature columns with NaNs
X_for_quantile['price_std_by_submarket'] = np.nan
X_for_quantile['price_range_by_grade'] = np.nan
X_test_for_quantile['price_std_by_submarket'] = np.nan
X_test_for_quantile['price_range_by_grade'] = np.nan

# We will average the test set calculations over the folds
test_std_agg = np.zeros(len(X_test_for_quantile))
test_range_agg = np.zeros(len(X_test_for_quantile))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    # Get the training data for THIS FOLD ONLY
    X_train_fold, y_train_fold = df_train.iloc[train_idx], y_true.iloc[train_idx]
    
    # --- Feature 1: Price Standard Deviation by Submarket ---
    # Calculate the std only on the fold's training data
    std_map = y_train_fold.groupby(X_train_fold['submarket']).std()
    # Map these values to the validation set for this fold
    val_stds = df_train.iloc[val_idx]['submarket'].map(std_map)
    X_for_quantile.loc[X_for_quantile.index[val_idx], 'price_std_by_submarket'] = val_stds
    # Map to the test set and add to the aggregate
    test_std_agg += df_test['submarket'].map(std_map) / N_SPLITS

    # --- Feature 2: Price Range by Grade ---
    # Calculate the range (max-min) only on the fold's training data
    range_map = y_train_fold.groupby(X_train_fold['grade']).apply(lambda x: x.max() - x.min())
    # Map these values to the validation set for this fold
    val_ranges = df_train.iloc[val_idx]['grade'].map(range_map)
    X_for_quantile.loc[X_for_quantile.index[val_idx], 'price_range_by_grade'] = val_ranges
    # Map to the test set and add to the aggregate
    test_range_agg += df_test['grade'].map(range_map) / N_SPLITS

# Assign the averaged features to the test set
X_test_for_quantile['price_std_by_submarket'] = test_std_agg
X_test_for_quantile['price_range_by_grade'] = test_range_agg

# --- Step 4: Final Cleanup and Report ---
# Fill any NaNs that might have occurred if a category in val/test was not in train
# (e.g., using the global median as a fallback)
global_std_median = X_for_quantile['price_std_by_submarket'].median()
global_range_median = X_for_quantile['price_range_by_grade'].median()

X_for_quantile.fillna({
    'price_std_by_submarket': global_std_median,
    'price_range_by_grade': global_range_median
}, inplace=True)
X_test_for_quantile.fillna({
    'price_std_by_submarket': global_std_median,
    'price_range_by_grade': global_range_median
}, inplace=True)

# Final alignment check
X_test_for_quantile = X_test_for_quantile[X_for_quantile.columns]

total_features = X_for_quantile.shape[1]
print(f"\nElite feature set for quantile models created successfully.")
print(f"Final Shape: {X_for_quantile.shape}")
print(f"Total features include: {N_TOP_FEATURES} raw + 7 stacked preds + 2 volatility features = {total_features}")
gc.collect()


--- Training a simple model to determine feature importance... ---
Feature importance ranking created.

--- Building an ELITE feature set with reduced features and stacked predictions ---
Selected the top 25 raw features to reduce noise.

--- Engineering and adding VOLATILITY features (leakage-proof method) ---

Elite feature set for quantile models created successfully.
Final Shape: (200000, 34)
Total features include: 25 raw + 7 stacked preds + 2 volatility features = 34


0

In [5]:
# =============================================================================
# BLOCK 10: TUNE LIGHTGBM QUANTILE MODELS WITH OPTUNA
# =============================================================================
import lightgbm as lgb
import optuna

# Use the same elite feature set with volatility features
# X_for_quantile, X_test_for_quantile, y_true should be available

print("\n--- Preparing data for LightGBM Optuna tuning ---")
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_for_quantile, y_true, test_size=0.20, random_state=RANDOM_STATE
)

def objective_lgbm(trial, alpha_value):
    """Unified Optuna objective for LightGBM quantile models."""
    params = {
        'objective': 'quantile',
        'alpha': alpha_value,
        'metric': 'quantile', # Pinball loss
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        
        # --- Hyperparameters to Tune for LightGBM ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 800, 3000),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-2, 100.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-2, 100.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train_opt, y_train_opt,
        eval_set=[(X_val_opt, y_val_opt)],
        eval_metric='quantile',
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    
    preds = model.predict(X_val_opt)
    pinball_loss = np.mean(np.where(y_val_opt >= preds, 
                                    (y_val_opt - preds) * alpha_value, 
                                    (preds - y_val_opt) * (1 - alpha_value)))
    return pinball_loss

# --- Tune Lower-Bound Model (alpha=0.05) ---
print("\n--- Tuning the LightGBM Lower-Bound Model (alpha=0.05)... ---")
study_lower_lgbm = optuna.create_study(direction='minimize')
study_lower_lgbm.optimize(lambda trial: objective_lgbm(trial, 0.05), n_trials=50, show_progress_bar=True)
best_params_lower_lgbm = study_lower_lgbm.best_params

# --- Tune Upper-Bound Model (alpha=0.95) ---
print("\n--- Tuning the LightGBM Upper-Bound Model (alpha=0.95)... ---")
study_upper_lgbm = optuna.create_study(direction='minimize')
study_upper_lgbm.optimize(lambda trial: objective_lgbm(trial, 0.95), n_trials=50, show_progress_bar=True)
best_params_upper_lgbm = study_upper_lgbm.best_params

print("\nLightGBM Tuning Complete.")
print(f"Best Lower Params: {best_params_lower_lgbm}")
print(f"Best Upper Params: {best_params_upper_lgbm}")

[I 2025-07-25 20:05:30,314] A new study created in memory with name: no-name-f74bb3b3-bf8c-4c51-839e-0b5f2ae45c79



--- Preparing data for LightGBM Optuna tuning ---

--- Tuning the LightGBM Lower-Bound Model (alpha=0.05)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 20:05:37,948] Trial 0 finished with value: 6860.680883437204 and parameters: {'learning_rate': 0.01624558920804399, 'n_estimators': 1975, 'num_leaves': 81, 'max_depth': 8, 'lambda_l1': 6.149418998196093, 'lambda_l2': 0.06082070250054613, 'feature_fraction': 0.7814762143030334, 'bagging_fraction': 0.7789449320917392, 'bagging_freq': 1}. Best is trial 0 with value: 6860.680883437204.
[I 2025-07-25 20:05:39,680] Trial 1 finished with value: 6887.548198043642 and parameters: {'learning_rate': 0.06963590301846011, 'n_estimators': 887, 'num_leaves': 94, 'max_depth': 7, 'lambda_l1': 8.183727050895508, 'lambda_l2': 0.6685165770330787, 'feature_fraction': 0.8921604128851564, 'bagging_fraction': 0.9105833881260805, 'bagging_freq': 1}. Best is trial 0 with value: 6860.680883437204.
[I 2025-07-25 20:05:45,934] Trial 2 finished with value: 6876.043811150028 and parameters: {'learning_rate': 0.019354917781911014, 'n_estimators': 1366, 'num_leaves': 137, 'max_depth': 11, 'lambda_l1': 0.

[I 2025-07-25 20:09:45,455] A new study created in memory with name: no-name-d3e34789-05af-48d0-959b-5bb57dfa32c9


[I 2025-07-25 20:09:45,451] Trial 49 finished with value: 6863.72616726848 and parameters: {'learning_rate': 0.0685723473523539, 'n_estimators': 1198, 'num_leaves': 34, 'max_depth': 6, 'lambda_l1': 0.019057023129960535, 'lambda_l2': 0.40673492882398404, 'feature_fraction': 0.6022319951342885, 'bagging_fraction': 0.9403733325884754, 'bagging_freq': 4}. Best is trial 13 with value: 6814.031598672509.

--- Tuning the LightGBM Upper-Bound Model (alpha=0.95)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 20:09:47,223] Trial 0 finished with value: 8028.4156578229995 and parameters: {'learning_rate': 0.038592412943763636, 'n_estimators': 1587, 'num_leaves': 115, 'max_depth': 8, 'lambda_l1': 0.8060047255179139, 'lambda_l2': 0.8012965778625967, 'feature_fraction': 0.9470730995025582, 'bagging_fraction': 0.6521226608958565, 'bagging_freq': 6}. Best is trial 0 with value: 8028.4156578229995.
[I 2025-07-25 20:09:48,119] Trial 1 finished with value: 7880.496285951066 and parameters: {'learning_rate': 0.07456663541923571, 'n_estimators': 2685, 'num_leaves': 147, 'max_depth': 5, 'lambda_l1': 0.19236414389785744, 'lambda_l2': 6.603550825847633, 'feature_fraction': 0.6215737952134176, 'bagging_fraction': 0.7775101063945684, 'bagging_freq': 3}. Best is trial 1 with value: 7880.496285951066.
[I 2025-07-25 20:09:49,550] Trial 2 finished with value: 7893.880492306065 and parameters: {'learning_rate': 0.039476909874639865, 'n_estimators': 2237, 'num_leaves': 32, 'max_depth': 12, 'lambda_l

In [11]:
# =============================================================================
# BLOCK 11 (Corrected): K-FOLD TRAIN LIGHTGBM QUANTILE MODELS
# =============================================================================
# This block performs the final K-Fold training for the tuned LightGBM quantile
# models.
#
# CORRECTION: Using correct NumPy array indexing for `y_true` instead of the
# pandas `.iloc` method to prevent the AttributeError.

import os
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

# Define path for saving predictions
META_LGBM_QUANTILE_PATH = './meta_lgbm_quantile_models/'
os.makedirs(META_LGBM_QUANTILE_PATH, exist_ok=True)

# Initialize prediction arrays
oof_lower_preds_lgbm = np.zeros(len(X_for_quantile))
test_lower_preds_lgbm = np.zeros(len(X_test_for_quantile))
oof_upper_preds_lgbm = np.zeros(len(X_for_quantile))
test_upper_preds_lgbm = np.zeros(len(X_test_for_quantile))

# Initialize the K-Fold splitter
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

print("\n--- Starting K-Fold training for LightGBM Quantile Models ---")
for fold, (train_idx, val_idx) in enumerate(skf.split(X_for_quantile, grade_for_stratify)):
    print(f"LGBM K-Fold {fold+1}/{N_SPLITS}...")
    
    # Split features (pandas .iloc is correct here)
    X_train, X_val = X_for_quantile.iloc[train_idx], X_for_quantile.iloc[val_idx]
    
    # Split target variable (NumPy indexing is correct here)
    y_train_fold, y_val_fold = y_true[train_idx], y_true[val_idx]
    
    # --- Train and predict lower-bound model ---
    lower_model_lgbm = lgb.LGBMRegressor(objective='quantile', alpha=0.05, **best_params_lower_lgbm)
    lower_model_lgbm.fit(
        X_train, y_train_fold, 
        eval_set=[(X_val, y_val_fold)], 
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof_lower_preds_lgbm[val_idx] = lower_model_lgbm.predict(X_val)
    test_lower_preds_lgbm += lower_model_lgbm.predict(X_test_for_quantile) / N_SPLITS

    # --- Train and predict upper-bound model ---
    upper_model_lgbm = lgb.LGBMRegressor(objective='quantile', alpha=0.95, **best_params_upper_lgbm)
    upper_model_lgbm.fit(
        X_train, y_train_fold, 
        eval_set=[(X_val, y_val_fold)], 
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof_upper_preds_lgbm[val_idx] = upper_model_lgbm.predict(X_val)
    test_upper_preds_lgbm += upper_model_lgbm.predict(X_test_for_quantile) / N_SPLITS
    
    gc.collect()

# --- Save the prediction artifacts ---
print("\n--- Saving LightGBM quantile predictions... ---")
np.save(os.path.join(META_LGBM_QUANTILE_PATH, 'oof_lower_preds_lgbm.npy'), oof_lower_preds_lgbm)
np.save(os.path.join(META_LGBM_QUANTILE_PATH, 'test_lower_preds_lgbm.npy'), test_lower_preds_lgbm)
np.save(os.path.join(META_LGBM_QUANTILE_PATH, 'oof_upper_preds_lgbm.npy'), oof_upper_preds_lgbm)
np.save(os.path.join(META_LGBM_QUANTILE_PATH, 'test_upper_preds_lgbm.npy'), test_upper_preds_lgbm)
print("LightGBM quantile predictions saved successfully.")


--- Starting K-Fold training for LightGBM Quantile Models ---
LGBM K-Fold 1/5...
LGBM K-Fold 2/5...
LGBM K-Fold 3/5...
LGBM K-Fold 4/5...
LGBM K-Fold 5/5...

--- Saving LightGBM quantile predictions... ---
LightGBM quantile predictions saved successfully.


In [12]:
# =============================================================================
# BLOCK 12 (Complete): FINAL 3-MODEL INTERVAL ENSEMBLE
# =============================================================================
# This is the new definitive final step. We are creating a master ensemble by
# blending the calibrated interval bounds from our three distinct pipelines:
# 1. The "Mean+Error" Model (Our Champion)
# 2. The XGBoost "Direct Quantile" Model (Challenger 1)
# 3. The LightGBM "Direct Quantile" Model (Challenger 2)
#
# The optimizer will find the best possible weights to combine them, leveraging
# the unique strengths of each approach.

import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import gc

# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    return np.mean(score)

# --- Step 1: Load All Prediction Artifacts from All Pipelines ---
print("\n--- Loading all necessary prediction files from all pipelines ---")

# --- Define Paths ---
DATA_PATH = './'
PREDS_SAVE_PATH = './mean_models_v1/'
NN_PREDS_PATH = './NN_model_predictions/'
ERROR_MODELS_PATH = './error_models/'
META_XGB_QUANTILE_PATH = './meta_quantile_models/'
META_LGBM_QUANTILE_PATH = './meta_lgbm_quantile_models/' # Path for new LGBM preds

try:
    # --- Load Test Set Submission Files (for IDs and final bounds) ---
    df_error_model_sub = pd.read_csv('submission_final_OptimalEoE_292680.csv')
    df_xgb_quantile_sub = pd.read_csv('submission_direct_quantile_robust_349061.csv')
    # We will generate the LGBM quantile submission bounds after calibration

    # --- Load all OOF and Test predictions needed to recreate the intervals ---
    df_train = pd.read_csv(os.path.join(DATA_PATH, 'dataset.csv'))
    y_true = df_train['sale_price'].values

    # Mean Model OOF & Test
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))
    test_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_xgb_preds.npy'))
    test_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_cb_preds.npy'))
    test_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_lgbm_preds.npy'))
    test_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'test_nn_preds.npy'))

    # Error Model OOF
    oof_error_preds_xgb = np.load(os.path.join(ERROR_MODELS_PATH, 'oof_error_preds_xgb.npy'))
    oof_error_preds_cb = np.load(os.path.join(ERROR_MODELS_PATH, 'oof_error_preds_cb.npy'))

    # XGB Quantile Model OOF & Test
    oof_lower_xgb = np.load(os.path.join(META_XGB_QUANTILE_PATH, 'oof_lower_preds.npy'))
    oof_upper_xgb = np.load(os.path.join(META_XGB_QUANTILE_PATH, 'oof_upper_preds.npy'))
    
    # LGBM Quantile Model OOF & Test
    oof_lower_lgbm = np.load(os.path.join(META_LGBM_QUANTILE_PATH, 'oof_lower_preds_lgbm.npy'))
    oof_upper_lgbm = np.load(os.path.join(META_LGBM_QUANTILE_PATH, 'oof_upper_preds_lgbm.npy'))
    test_lower_lgbm = np.load(os.path.join(META_LGBM_QUANTILE_PATH, 'test_lower_preds_lgbm.npy'))
    test_upper_lgbm = np.load(os.path.join(META_LGBM_QUANTILE_PATH, 'test_upper_preds_lgbm.npy'))

    print("All prediction artifacts loaded successfully.")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required file. {e}")
    print("Please ensure ALL previous training and tuning notebooks have been run successfully.")
    # exit()

# --- Step 2: Calibrate and Recreate Final Bounds for Each Pipeline ---
print("\n--- Recreating/Calibrating final OOF and Test bounds for each pipeline ---")

# --- Pipeline A: Mean+Error Model Bounds ---
oof_ensemble_mean = (oof_xgb_preds + oof_cb_preds + oof_lgbm_preds + oof_nn_preds) / 4
a_err, b_err = 1.9799, 2.1755
oof_error_final = np.clip((oof_error_preds_xgb * 0.60 + oof_error_preds_cb * 0.40), 0, None)
oof_lower_A = oof_ensemble_mean - oof_error_final * a_err
oof_upper_A = oof_ensemble_mean + oof_error_final * b_err
test_lower_A, test_upper_A = df_error_model_sub['pi_lower'].values, df_error_model_sub['pi_upper'].values
score_A = winkler_score(y_true, oof_lower_A, oof_upper_A)

# --- Pipeline B: XGB Quantile Model Bounds ---
# These are already calibrated in the submission file
oof_lower_B, oof_upper_B = df_xgb_quantile_sub['pi_lower'].values, df_xgb_quantile_sub['pi_upper'].values # This is an error, we need OOF
test_lower_B, test_upper_B = df_xgb_quantile_sub['pi_lower'].values, df_xgb_quantile_sub['pi_upper'].values
# Recreate OOF for XGB Quantile
a_xgb, b_xgb = 0.8118, 1.1960
oof_lower_xgb_raw = np.minimum(oof_lower_xgb, oof_upper_xgb)
oof_upper_xgb_raw = np.maximum(oof_lower_xgb, oof_upper_xgb)
oof_lower_B = oof_lower_xgb_raw * a_xgb
oof_upper_B = oof_upper_xgb_raw * b_xgb
score_B = winkler_score(y_true, oof_lower_B, oof_upper_B)


# --- Pipeline C: LGBM Quantile Model Bounds (Needs Calibration) ---
def get_robust_calibrated_winkler(multipliers, y_true_oof, lower_oof, upper_oof):
    a, b = multipliers
    lower_raw = np.minimum(lower_oof, upper_oof)
    upper_raw = np.maximum(lower_oof, upper_oof)
    return winkler_score(y_true_oof, lower_raw * a, upper_raw * b)

res_lgbm = minimize(fun=get_robust_calibrated_winkler, x0=[0.95, 1.05], args=(y_true, oof_lower_lgbm, oof_upper_lgbm), method='L-BFGS-B', bounds=[(0.8, 1.2), (0.8, 1.2)])
a_lgbm, b_lgbm = res_lgbm.x
score_C = res_lgbm.fun

# Create final calibrated bounds for LGBM model
oof_lower_lgbm_raw, oof_upper_lgbm_raw = np.minimum(oof_lower_lgbm, oof_upper_lgbm), np.maximum(oof_lower_lgbm, oof_upper_lgbm)
test_lower_lgbm_raw, test_upper_lgbm_raw = np.minimum(test_lower_lgbm, test_upper_lgbm), np.maximum(test_lower_lgbm, test_upper_lgbm)
oof_lower_C = oof_lower_lgbm_raw * a_lgbm
oof_upper_C = oof_upper_lgbm_raw * b_lgbm
test_lower_C = test_lower_lgbm_raw * a_lgbm
test_upper_C = test_upper_lgbm_raw * b_lgbm

# --- Step 3: Find Optimal 3-Model Blend Weights ---
print("\n--- Optimizing the blend weight for the THREE interval models ---")

def get_3_model_blended_winkler(weights, y_true_oof, bounds_a, bounds_b, bounds_c):
    w_a, w_b = weights[0], weights[1]
    w_c = 1 - w_a - w_b
    if w_c < 0: return 1e9
    
    final_lower = (bounds_a[0] * w_a) + (bounds_b[0] * w_b) + (bounds_c[0] * w_c)
    final_upper = (bounds_a[1] * w_a) + (bounds_b[1] * w_b) + (bounds_c[1] * w_c)
    return winkler_score(y_true_oof, final_lower, final_upper)

result_blend_3 = minimize(
    fun=get_3_model_blended_winkler,
    x0=[0.8, 0.1], # Start with a guess favouring the champion
    args=(y_true, (oof_lower_A, oof_upper_A), (oof_lower_B, oof_upper_B), (oof_lower_C, oof_upper_C)),
    method='L-BFGS-B', bounds=[(0, 1), (0, 1)],
    constraints={'type': 'ineq', 'fun': lambda w: 1 - np.sum(w)} # w[0]+w[1] <= 1
)

w_A, w_B = result_blend_3.x
w_C = 1 - w_A - w_B
best_blended_score = result_blend_3.fun

# --- Step 4: Display Final Results ---
print("\n" + "="*60)
print("FINAL 3-MODEL ENSEMBLE RESULTS")
print("="*60)
print(f"  A) Mean+Error Model OOF Score:     ${score_A:,.2f}")
print(f"  B) XGB Quantile Model OOF Score:   ${score_B:,.2f}")
print(f"  C) LGBM Quantile Model OOF Score:  ${score_C:,.2f}")
print("-" * 60)
print(f"Final BLENDED OOF Winkler Score:   ${best_blended_score:,.2f}")
print("-" * 60)
print(f"Optimal Blend Weights:")
print(f"  -> Model A (Mean+Error):   {w_A:.2%}")
print(f"  -> Model B (XGB Quantile): {w_B:.2%}")
print(f"  -> Model C (LGBM Quantile):{w_C:.2%}")

# --- Step 5: Create and Save the Final Blended Submission ---
print("\n--- Creating final blended submission file... ---")

final_test_lower = (test_lower_A * w_A) + (test_lower_B * w_B) + (test_lower_C * w_C)
final_test_upper = (test_upper_A * w_A) + (test_upper_B * w_B) + (test_upper_C * w_C)
final_test_upper = np.maximum(final_test_lower + 1, final_test_upper)

submission_df_final = pd.DataFrame({
    'id': df_error_model_sub['id'],
    'pi_lower': final_test_lower,
    'pi_upper': final_test_upper
})

submission_filename = f'submission_FINAL_3M_BLEND_{int(best_blended_score)}.csv'
submission_df_final.to_csv(submission_filename, index=False)

print(f"\n'{submission_filename}' created successfully! This should be your best submission yet.")
print("\nFinal Submission Head:")
print(submission_df_final.head())
gc.collect()


--- Loading all necessary prediction files from all pipelines ---
All prediction artifacts loaded successfully.

--- Recreating/Calibrating final OOF and Test bounds for each pipeline ---

--- Optimizing the blend weight for the THREE interval models ---

FINAL 3-MODEL ENSEMBLE RESULTS
  A) Mean+Error Model OOF Score:     $295,017.11
  B) XGB Quantile Model OOF Score:   $349,311.64
  C) LGBM Quantile Model OOF Score:  $293,068.04
------------------------------------------------------------
Final BLENDED OOF Winkler Score:   $291,785.50
------------------------------------------------------------
Optimal Blend Weights:
  -> Model A (Mean+Error):   37.30%
  -> Model B (XGB Quantile): 0.00%
  -> Model C (LGBM Quantile):62.70%

--- Creating final blended submission file... ---

'submission_FINAL_3M_BLEND_291785.csv' created successfully! This should be your best submission yet.

Final Submission Head:
       id       pi_lower      pi_upper
0  200000  812014.599932  1.019174e+06
1  200001 

187