In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
from scipy.optimize import minimize
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 30 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
grade_for_stratify = df_train['grade'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [3]:
# =============================================================================
# BLOCK 3: LOAD ALL PRE-TRAINED MODEL PREDICTIONS
# =============================================================================

# Define paths to your saved prediction files
PREDS_SAVE_PATH = './mean_models_v1/' # For XGB and CatBoost preds
NN_PREDS_PATH = './NN_model_predictions/' # For NN preds
ERR_PATH = './error_models/' # For error preds

print("--- Loading all base model predictions from saved .npy files... ---")
try:
    # Load Mean Model OOF (Out-of-Fold) Predictions
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))

    oof_error_preds_cb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_cb.npy'))
    oof_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'oof_error_preds_lgbm.npy'))
    oof_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_xgb.npy'))
    
    # Load Mean Model Test Predictions
    test_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_xgb_preds.npy'))
    test_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_cb_preds.npy'))
    test_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_lgbm_preds.npy'))
    test_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'test_nn_preds.npy'))

    test_error_preds_cb = np.load(os.path.join(ERR_PATH, 'test_error_preds_cb.npy'))
    test_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'test_error_preds_lgbm.npy'))
    test_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'test_error_preds_xgb.npy'))

     
    
    print("All MEAN AND ERROR models predictions loaded successfully.")
    
except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required prediction file. {e}")
    print("Please ensure you have run all training notebooks and saved their predictions first.")

--- Loading all base model predictions from saved .npy files... ---
All MEAN AND ERROR models predictions loaded successfully.


In [8]:
# =============================================================================
# BLOCK 5.5 (Corrected): ELITE FEATURE SET WITH VOLATILITY FEATURES
# =============================================================================
# This block creates the definitive feature set for the quantile models.
#
# NEW IMPROVEMENT: We are now engineering features specifically designed to
# capture price VOLATILITY. This gives the quantile models a direct signal
# about which groups of houses have a wider or narrower price distribution,
# which is exactly what they need to predict the tails accurately.
#
# We also drastically reduce the number of raw features to N=25 to combat
# overfitting, forcing the model to rely on these powerful new signals.

# --- Step 1: Generate Feature Importance Ranking (Same as before) ---
print("\n--- Training a simple model to determine feature importance... ---")
dtrain_importance = xgb.DMatrix(X, label=y_true)
params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'seed': RANDOM_STATE, 'n_jobs': -1}
bst_for_importance = xgb.train(params, dtrain_importance, num_boost_round=500, verbose_eval=False)
importance_scores = bst_for_importance.get_score(importance_type='gain')
feature_importance = pd.DataFrame({
    'Feature': importance_scores.keys(),
    'Importance': importance_scores.values()
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)
print("Feature importance ranking created.")

# --- Step 2: Build the ELITE Feature Set ---
print("\n--- Building an ELITE feature set with reduced features and stacked predictions ---")

# Drastically reduce feature count to combat overfitting.
N_TOP_FEATURES = 25
elite_raw_features = feature_importance['Feature'].head(N_TOP_FEATURES).tolist()
print(f"Selected the top {N_TOP_FEATURES} raw features to reduce noise.")

# Create the base dataframes
X_for_quantile = X[elite_raw_features].copy()
X_test_for_quantile = X_test[elite_raw_features].copy()

# Add the stacked predictions (mean and error models)
# These are still the most powerful features.
# (Code to add oof_mean_*, oof_error_*, test_mean_*, test_error_* preds)
for pred_name, oof_pred, test_pred in [
    ('oof_mean_xgb', oof_xgb_preds, test_xgb_preds), ('oof_mean_cb', oof_cb_preds, test_cb_preds),
    ('oof_mean_lgbm', oof_lgbm_preds, test_lgbm_preds), ('oof_mean_nn', oof_nn_preds, test_nn_preds),
    ('oof_error_xgb', oof_error_preds_xgb, test_error_preds_xgb), ('oof_error_cb', oof_error_preds_cb, test_error_preds_cb),
    ('oof_error_lgbm', oof_error_preds_lgbm, test_error_preds_lgbm)
]:
    X_for_quantile[pred_name] = oof_pred
    X_test_for_quantile[pred_name] = test_pred

# --- Step 3: Create and Add Volatility Features (The Leakage-Proof Way) ---
# This is the key improvement. We must create these features inside a CV loop.
print("\n--- Engineering and adding VOLATILITY features (leakage-proof method) ---")

# We will create OOF features for the train set and a single set for the test set.
# Initialize new feature columns with NaNs
X_for_quantile['price_std_by_submarket'] = np.nan
X_for_quantile['price_range_by_grade'] = np.nan
X_test_for_quantile['price_std_by_submarket'] = np.nan
X_test_for_quantile['price_range_by_grade'] = np.nan

# We will average the test set calculations over the folds
test_std_agg = np.zeros(len(X_test_for_quantile))
test_range_agg = np.zeros(len(X_test_for_quantile))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    # Get the training data for THIS FOLD ONLY
    X_train_fold, y_train_fold = df_train.iloc[train_idx], y_true.iloc[train_idx]
    
    # --- Feature 1: Price Standard Deviation by Submarket ---
    # Calculate the std only on the fold's training data
    std_map = y_train_fold.groupby(X_train_fold['submarket']).std()
    # Map these values to the validation set for this fold
    val_stds = df_train.iloc[val_idx]['submarket'].map(std_map)
    X_for_quantile.loc[X_for_quantile.index[val_idx], 'price_std_by_submarket'] = val_stds
    # Map to the test set and add to the aggregate
    test_std_agg += df_test['submarket'].map(std_map) / N_SPLITS

    # --- Feature 2: Price Range by Grade ---
    # Calculate the range (max-min) only on the fold's training data
    range_map = y_train_fold.groupby(X_train_fold['grade']).apply(lambda x: x.max() - x.min())
    # Map these values to the validation set for this fold
    val_ranges = df_train.iloc[val_idx]['grade'].map(range_map)
    X_for_quantile.loc[X_for_quantile.index[val_idx], 'price_range_by_grade'] = val_ranges
    # Map to the test set and add to the aggregate
    test_range_agg += df_test['grade'].map(range_map) / N_SPLITS

# Assign the averaged features to the test set
X_test_for_quantile['price_std_by_submarket'] = test_std_agg
X_test_for_quantile['price_range_by_grade'] = test_range_agg

# --- Step 4: Final Cleanup and Report ---
# Fill any NaNs that might have occurred if a category in val/test was not in train
# (e.g., using the global median as a fallback)
global_std_median = X_for_quantile['price_std_by_submarket'].median()
global_range_median = X_for_quantile['price_range_by_grade'].median()

X_for_quantile.fillna({
    'price_std_by_submarket': global_std_median,
    'price_range_by_grade': global_range_median
}, inplace=True)
X_test_for_quantile.fillna({
    'price_std_by_submarket': global_std_median,
    'price_range_by_grade': global_range_median
}, inplace=True)

# Final alignment check
X_test_for_quantile = X_test_for_quantile[X_for_quantile.columns]

total_features = X_for_quantile.shape[1]
print(f"\nElite feature set for quantile models created successfully.")
print(f"Final Shape: {X_for_quantile.shape}")
print(f"Total features include: {N_TOP_FEATURES} raw + 7 stacked preds + 2 volatility features = {total_features}")
gc.collect()


--- Training a simple model to determine feature importance... ---
Feature importance ranking created.

--- Building an ELITE feature set with reduced features and stacked predictions ---
Selected the top 25 raw features to reduce noise.

--- Engineering and adding VOLATILITY features (leakage-proof method) ---

Elite feature set for quantile models created successfully.
Final Shape: (200000, 34)
Total features include: 25 raw + 7 stacked preds + 2 volatility features = 34


1243

In [9]:
# =============================================================================
# BLOCK 6: HYPERPARAMETER TUNING FOR QUANTILE MODELS WITH OPTUNA
# =============================================================================
# We will now use Optuna to find the best hyperparameters for our two specialist
# quantile models. We tune them independently because the optimal parameters for
# predicting the lower tail of the distribution might differ from those for the
# upper tail.
#
# We use the native XGBoost API (`xgb.train`) because it allows us to use
# callbacks like `early_stopping`, which significantly speeds up the tuning
# process by not training for a fixed, excessive number of rounds.

# --- Step 1: Create a Holdout Set for Tuning ---
# For hyperparameter tuning, we need a single, consistent validation set to
# evaluate each trial's performance. We'll split off 20% of our elite
# training data for this purpose.
print("\n--- Preparing data for Optuna tuning ---")
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_for_quantile,
    y_true,
    test_size=0.20,
    random_state=RANDOM_STATE
)

# The native XGBoost API is most efficient with its internal DMatrix format.
dtrain_opt = xgb.DMatrix(X_train_opt, label=y_train_opt)
dval_opt = xgb.DMatrix(X_val_opt, label=y_val_opt)

print(f"Data split for tuning: {len(X_train_opt)} train, {len(X_val_opt)} validation samples.")

# --- Step 2: Tune the Lower-Bound Model (alpha=0.05) ---

def objective_lower(trial):
    """Optuna objective function for the lower-bound quantile model."""
    params = {
        'objective': 'reg:quantileerror',
        'quantile_alpha': 0.05,
        # The 'quantile' eval_metric is the pinball loss, which is exactly
        # what we want to minimize for this objective.
        'eval_metric': 'quantile',
        'tree_method': 'hist',
        'seed': RANDOM_STATE,
        'n_jobs': -1,

        # --- Hyperparameters to Tune ---
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'lambda': trial.suggest_float('lambda', 1e-2, 100.0, log=True), # L2 Reg
        'alpha': trial.suggest_float('alpha', 1e-2, 100.0, log=True),   # L1 Reg
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }

    # Train the model with early stopping
    bst = xgb.train(
        params,
        dtrain_opt,
        num_boost_round=2000,  # High number, early stopping will find the best
        evals=[(dval_opt, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Store the best number of boosting rounds in the trial attributes
    trial.set_user_attr('best_iteration', bst.best_iteration)

    # Return the score to be minimized (pinball loss on the validation set)
    return bst.best_score

print("\n--- Tuning the XGBoost Lower-Bound Model (alpha=0.05)... ---")
N_TRIALS = 50 # A good number for a solid search
study_lower = optuna.create_study(direction='minimize')
study_lower.optimize(objective_lower, n_trials=N_TRIALS, show_progress_bar=True)

# Extract best parameters and add the optimal number of estimators
best_params_lower = study_lower.best_params
best_params_lower['n_estimators'] = study_lower.best_trial.user_attrs['best_iteration']

print(f"\nLower-Bound Model Tuning Complete.")
print(f"Best Validation Score (Pinball Loss): {study_lower.best_value:,.4f}")
print(f"Best Parameters Found: {best_params_lower}")

# --- Step 3: Tune the Upper-Bound Model (alpha=0.95) ---

def objective_upper(trial):
    """Optuna objective function for the upper-bound quantile model."""
    # The parameter space is identical, only quantile_alpha changes.
    params = {
        'objective': 'reg:quantileerror',
        'quantile_alpha': 0.95, # The only key difference from the lower model
        'eval_metric': 'quantile',
        'tree_method': 'hist',
        'seed': RANDOM_STATE,
        'n_jobs': -1,

        # --- Hyperparameters to Tune ---
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'lambda': trial.suggest_float('lambda', 1e-2, 100.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 100.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }

    bst = xgb.train(
        params,
        dtrain_opt,
        num_boost_round=2000,
        evals=[(dval_opt, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    trial.set_user_attr('best_iteration', bst.best_iteration)
    return bst.best_score

print("\n--- Tuning the XGBoost Upper-Bound Model (alpha=0.95)... ---")
study_upper = optuna.create_study(direction='minimize')
study_upper.optimize(objective_upper, n_trials=N_TRIALS, show_progress_bar=True)

# Extract best parameters
best_params_upper = study_upper.best_params
best_params_upper['n_estimators'] = study_upper.best_trial.user_attrs['best_iteration']

print(f"\nUpper-Bound Model Tuning Complete.")
print(f"Best Validation Score (Pinball Loss): {study_upper.best_value:,.4f}")
print(f"Best Parameters Found: {best_params_upper}")

# Clean up to free memory
del dtrain_opt, dval_opt, X_train_opt, X_val_opt, y_train_opt, y_val_opt
gc.collect()

[I 2025-07-25 19:38:40,748] A new study created in memory with name: no-name-c02667e7-78c3-437c-8014-392048230280



--- Preparing data for Optuna tuning ---
Data split for tuning: 160000 train, 40000 validation samples.

--- Tuning the XGBoost Lower-Bound Model (alpha=0.05)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 19:38:48,639] Trial 0 finished with value: 6870.290654460782 and parameters: {'eta': 0.057970079583532425, 'max_depth': 6, 'subsample': 0.7577973532493475, 'colsample_bytree': 0.624271877564692, 'lambda': 83.14652712195401, 'alpha': 48.845046642256136, 'min_child_weight': 16}. Best is trial 0 with value: 6870.290654460782.
[I 2025-07-25 19:38:54,532] Trial 1 finished with value: 6876.1039571235415 and parameters: {'eta': 0.024796089161968602, 'max_depth': 8, 'subsample': 0.6893402596863362, 'colsample_bytree': 0.6556373561581808, 'lambda': 0.5750763896207203, 'alpha': 0.11041057928325171, 'min_child_weight': 1}. Best is trial 0 with value: 6870.290654460782.
[I 2025-07-25 19:38:59,731] Trial 2 finished with value: 6879.449708960122 and parameters: {'eta': 0.038716278735454024, 'max_depth': 8, 'subsample': 0.8908824932144612, 'colsample_bytree': 0.6231163179512852, 'lambda': 14.784432457891823, 'alpha': 2.83797347548028, 'min_child_weight': 2}. Best is trial 0 with value: 

[I 2025-07-25 19:50:08,271] A new study created in memory with name: no-name-33d8e5fd-1ba5-4955-bc8b-492d1e652188


[I 2025-07-25 19:50:08,267] Trial 49 finished with value: 6821.570076007746 and parameters: {'eta': 0.013875012454619884, 'max_depth': 5, 'subsample': 0.7656751480273651, 'colsample_bytree': 0.8239448453442719, 'lambda': 0.14779033285446974, 'alpha': 0.0253047428184783, 'min_child_weight': 9}. Best is trial 28 with value: 6810.6867049341545.

Lower-Bound Model Tuning Complete.
Best Validation Score (Pinball Loss): 6,810.6867
Best Parameters Found: {'eta': 0.01262701763251539, 'max_depth': 4, 'subsample': 0.6998800758505612, 'colsample_bytree': 0.7904960107771442, 'lambda': 0.04535729555486087, 'alpha': 0.18059976328847985, 'min_child_weight': 9, 'n_estimators': 1680}

--- Tuning the XGBoost Upper-Bound Model (alpha=0.95)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 19:50:17,150] Trial 0 finished with value: 7866.627735773041 and parameters: {'eta': 0.02259648069763278, 'max_depth': 6, 'subsample': 0.7426364997144287, 'colsample_bytree': 0.6877151408915609, 'lambda': 0.03888545030675885, 'alpha': 0.06458208136736243, 'min_child_weight': 1}. Best is trial 0 with value: 7866.627735773041.
[I 2025-07-25 19:50:22,628] Trial 1 finished with value: 7871.151835758187 and parameters: {'eta': 0.041573920656684575, 'max_depth': 7, 'subsample': 0.730446736232013, 'colsample_bytree': 0.8389441984720529, 'lambda': 92.8852846914973, 'alpha': 34.82656730971083, 'min_child_weight': 20}. Best is trial 0 with value: 7866.627735773041.
[I 2025-07-25 19:50:25,728] Trial 2 finished with value: 7875.780349157314 and parameters: {'eta': 0.08401760616923908, 'max_depth': 5, 'subsample': 0.7431994070292813, 'colsample_bytree': 0.9470967973824076, 'lambda': 0.10740313852214445, 'alpha': 17.79435332401801, 'min_child_weight': 11}. Best is trial 0 with value: 7

60

In [10]:
# =============================================================================
# BLOCK 7: K-FOLD TRAINING & PREDICTION FOR QUANTILE MODELS (Corrected)
# =============================================================================
# With our tuned hyperparameters, we now perform the final model training using
# a 5-Fold cross-validation strategy.
#
# THIS IS THE CORRECTED VERSION using the native `xgb.train` API to properly
# support early stopping during the final K-Fold training loop.

# --- Step 1: Setup and Initialization ---
print("\n--- Initializing K-Fold training for tuned quantile models ---")

# Define and create the directory to save our new prediction artifacts
META_QUANTILE_PATH = './meta_quantile_models/'
os.makedirs(META_QUANTILE_PATH, exist_ok=True)
print(f"Prediction artifacts will be saved to: '{META_QUANTILE_PATH}'")

# Initialize arrays to store the predictions
oof_lower_preds = np.zeros(len(X_for_quantile))
oof_upper_preds = np.zeros(len(X_for_quantile))
test_lower_preds = np.zeros(len(X_test_for_quantile))
test_upper_preds = np.zeros(len(X_test_for_quantile))

# Initialize the consistent StratifiedKFold splitter
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Prepare the full test data as a DMatrix once, for efficiency
dtest = xgb.DMatrix(X_test_for_quantile)

# --- Step 2: K-Fold Training Loop with Native API ---
for fold, (train_idx, val_idx) in enumerate(skf.split(X_for_quantile, grade_for_stratify)):
    print(f"\n===== FOLD {fold+1}/{N_SPLITS} =====")

    # Split the data for this fold
    X_train, X_val = X_for_quantile.iloc[train_idx], X_for_quantile.iloc[val_idx]
    y_train, y_val = y_true.iloc[train_idx], y_true.iloc[val_idx]

    # Convert fold data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # --- Train and Predict Lower-Bound Model ---
    print("  -> Training lower-bound model (alpha=0.05)...")
    # We remove 'n_estimators' as it's not a valid param for xgb.train
    # It's controlled by num_boost_round instead.
    num_boost_round_lower = best_params_lower.pop('n_estimators')
    
    lower_model = xgb.train(
        params=best_params_lower,
        dtrain=dtrain,
        num_boost_round=num_boost_round_lower,
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Restore n_estimators for the next fold to avoid a pop error
    best_params_lower['n_estimators'] = num_boost_round_lower
    
    # Predict on the validation set to generate OOF predictions
    oof_lower_preds[val_idx] = lower_model.predict(dval, iteration_range=(0, lower_model.best_iteration))
    # Predict on the test set and add to the running average
    test_lower_preds += lower_model.predict(dtest, iteration_range=(0, lower_model.best_iteration)) / N_SPLITS

    # --- Train and Predict Upper-Bound Model ---
    print("  -> Training upper-bound model (alpha=0.95)...")
    num_boost_round_upper = best_params_upper.pop('n_estimators')

    upper_model = xgb.train(
        params=best_params_upper,
        dtrain=dtrain,
        num_boost_round=num_boost_round_upper,
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    best_params_upper['n_estimators'] = num_boost_round_upper
    
    # Predict on the validation set for OOF
    oof_upper_preds[val_idx] = upper_model.predict(dval, iteration_range=(0, upper_model.best_iteration))
    # Predict on the test set and add to the running average
    test_upper_preds += upper_model.predict(dtest, iteration_range=(0, upper_model.best_iteration)) / N_SPLITS

    gc.collect()

print("\n--- K-Fold training complete. ---")

# --- Step 3: Save the Prediction Artifacts ---
print("\n--- Saving quantile prediction arrays to disk... ---")

np.save(os.path.join(META_QUANTILE_PATH, 'oof_lower_preds.npy'), oof_lower_preds)
print(f"Saved 'oof_lower_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'test_lower_preds.npy'), test_lower_preds)
print(f"Saved 'test_lower_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'oof_upper_preds.npy'), oof_upper_preds)
print(f"Saved 'oof_upper_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'test_upper_preds.npy'), test_upper_preds)
print(f"Saved 'test_upper_preds.npy' successfully.")

print("\nAll prediction artifacts are now ready for the final calibration and submission step.")


--- Initializing K-Fold training for tuned quantile models ---
Prediction artifacts will be saved to: './meta_quantile_models/'

===== FOLD 1/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 2/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 3/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 4/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 5/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

--- K-Fold training complete. ---

--- Saving quantile prediction arrays to disk... ---
Saved 'oof_lower_preds.npy' successfully.
Saved 'test_lower_preds.npy' successfully.
Saved 'oof_upper_preds.npy' successfully.
Saved 'test_upper_preds.npy' successfully.

All prediction artifa

In [13]:
# =============================================================================
# BLOCK 9 (Corrected): FINAL ENSEMBLE OF INTERVALS (Self-Contained)
# =============================================================================
# This is the definitive final step. We will blend the bounds from our two
# best pipelines to create a final, robust submission.
#
# CORRECTION: This version now explicitly loads all required OOF prediction
# arrays from disk to ensure it can run independently without NameErrors.

import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize

# --- Helper Function for Winkler Score (to make block self-contained) ---
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    return np.mean(score)

# --- Step 1: Load All Required Predictions from Disk ---
print("\n--- Loading all necessary prediction files ---")

# --- Define Paths ---
DATA_PATH = './'
PREDS_SAVE_PATH = './mean_models_v1/'
NN_PREDS_PATH = './NN_model_predictions/'
ERROR_MODELS_PATH = './error_models/'
META_QUANTILE_PATH = './meta_quantile_models/'
ERROR_MODEL_SUB_FILE = 'submission_final_OptimalEoE_292680.csv'
QUANTILE_MODEL_SUB_FILE = 'submission_direct_quantile_robust_349061.csv'

try:
    # --- Load Test Set Bounds (from submission files) ---
    df_error_model = pd.read_csv(ERROR_MODEL_SUB_FILE)
    df_quantile_model = pd.read_csv(QUANTILE_MODEL_SUB_FILE)
    test_lower_error = df_error_model['pi_lower'].values
    test_upper_error = df_error_model['pi_upper'].values
    test_lower_quantile = df_quantile_model['pi_lower'].values
    test_upper_quantile = df_quantile_model['pi_upper'].values

    # --- Load All OOF Predictions Needed for Recreation ---
    # True labels
    df_train = pd.read_csv(os.path.join(DATA_PATH, 'dataset.csv'))
    y_true = df_train['sale_price'].values

    # OOF Mean model predictions
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))

    # OOF Error model predictions
    oof_error_preds_xgb = np.load(os.path.join(ERROR_MODELS_PATH, 'oof_error_preds_xgb.npy'))
    oof_error_preds_cb = np.load(os.path.join(ERROR_MODELS_PATH, 'oof_error_preds_cb.npy'))

    # OOF Quantile model predictions
    oof_lower_preds = np.load(os.path.join(META_QUANTILE_PATH, 'oof_lower_preds.npy'))
    oof_upper_preds = np.load(os.path.join(META_QUANTILE_PATH, 'oof_upper_preds.npy'))

    print("All prediction artifacts loaded successfully.")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required file. {e}")
    print("Please ensure all previous training notebooks have been run successfully.")
    # exit()


# --- Step 2: Recreate OOF Bounds for Both Pipelines ---
print("\n--- Recreating OOF interval bounds from base predictions ---")

# --- Pipeline 1: Mean+Error Model OOF Bounds ---
oof_ensemble_mean = (oof_xgb_preds + oof_cb_preds + oof_lgbm_preds + oof_nn_preds) / 4
best_a_error_model, best_b_error_model = 1.9799, 2.1755
# Using the 60/40 blend weights from the original notebook
oof_error_final = np.clip((oof_error_preds_xgb * 0.60 + oof_error_preds_cb * 0.40), 0, None)
oof_lower_error = oof_ensemble_mean - oof_error_final * best_a_error_model
oof_upper_error = oof_ensemble_mean + oof_error_final * best_b_error_model

# --- Pipeline 2: Direct Quantile Model OOF Bounds ---
best_a_quantile, best_b_quantile = 0.8118, 1.1960
# Enforce non-crossing, as was done in the robust version
oof_lower_quantile_raw = np.minimum(oof_lower_preds, oof_upper_preds)
oof_upper_quantile_raw = np.maximum(oof_lower_preds, oof_upper_preds)
oof_lower_quantile = oof_lower_quantile_raw * best_a_quantile
oof_upper_quantile = oof_upper_quantile_raw * best_b_quantile


# --- Step 3: Find the Optimal Blend Weight ---
print("\n--- Optimizing the blend weight for the two interval models ---")

def get_blended_winkler(weights, y_true_oof, lower_a, upper_a, lower_b, upper_b):
    weight_a = weights[0]
    weight_b = 1 - weight_a
    final_lower = (lower_a * weight_a) + (lower_b * weight_b)
    final_upper = (upper_a * weight_a) + (upper_b * weight_b)
    return winkler_score(y_true_oof, final_lower, final_upper)

initial_guess, bounds = [0.5], [(0, 1)]
result_blend = minimize(
    fun=get_blended_winkler,
    x0=initial_guess,
    args=(y_true, oof_lower_error, oof_upper_error, oof_lower_quantile, oof_upper_quantile),
    method='L-BFGS-B',
    bounds=bounds
)

best_weight_error_model = result_blend.x[0]
best_weight_quantile_model = 1 - best_weight_error_model
best_blended_score = result_blend.fun


# --- Step 4: Display Final Results ---
print("\n" + "="*60)
print("FINAL ENSEMBLE RESULTS")
print("="*60)
print(f"Original Error Model OOF Score:     $292,680.00") # From filename
print(f"Direct Quantile Model OOF Score:  $349,061.10") # From filename
print(f"Final BLENDED OOF Winkler Score:    ${best_blended_score:,.2f}")
print("-" * 60)
print(f"Optimal Blend Weights:")
print(f"  -> Mean+Error Model:      {best_weight_error_model:.2%}")
print(f"  -> Direct Quantile Model: {best_weight_quantile_model:.2%}")


# --- Step 5: Create and Save the Final Blended Submission ---
print("\n--- Creating final blended submission file... ---")

final_test_lower = (test_lower_error * best_weight_error_model) + (test_lower_quantile * best_weight_quantile_model)
final_test_upper = (test_upper_error * best_weight_error_model) + (test_upper_quantile * best_weight_quantile_model)
final_test_upper = np.maximum(final_test_lower + 1, final_test_upper)

submission_df_final = pd.DataFrame({
    'id': df_error_model['id'],
    'pi_lower': final_test_lower,
    'pi_upper': final_test_upper
})

submission_filename = f'submission_FINAL_BLEND_{int(best_blended_score)}.csv'
submission_df_final.to_csv(submission_filename, index=False)

print(f"\n'{submission_filename}' created successfully! This is your best shot!")
print("\nFinal Submission Head:")
print(submission_df_final.head())


--- Loading all necessary prediction files ---
All prediction artifacts loaded successfully.

--- Recreating OOF interval bounds from base predictions ---

--- Optimizing the blend weight for the two interval models ---

FINAL ENSEMBLE RESULTS
Original Error Model OOF Score:     $292,680.00
Direct Quantile Model OOF Score:  $349,061.10
Final BLENDED OOF Winkler Score:    $294,662.90
------------------------------------------------------------
Optimal Blend Weights:
  -> Mean+Error Model:      93.15%
  -> Direct Quantile Model: 6.85%

--- Creating final blended submission file... ---

'submission_FINAL_BLEND_294662.csv' created successfully! This is your best shot!

Final Submission Head:
       id       pi_lower      pi_upper
0  200000  812841.025447  1.016490e+06
1  200001  576953.492400  7.999085e+05
2  200002  450015.580139  6.540748e+05
3  200003  294411.784474  4.243998e+05
4  200004  354569.920838  7.901194e+05
