In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
from scipy.optimize import minimize
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 30 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
grade_for_stratify = df_train['grade'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [5]:
# =============================================================================
# BLOCK 3: LOAD ALL PRE-TRAINED MODEL PREDICTIONS
# =============================================================================

# Define paths to your saved prediction files
PREDS_SAVE_PATH = './mean_models_v1/' # For XGB and CatBoost preds
NN_PREDS_PATH = './NN_model_predictions/' # For NN preds
ERR_PATH = './error_models/' # For error preds

print("--- Loading all base model predictions from saved .npy files... ---")
try:
    # Load Mean Model OOF (Out-of-Fold) Predictions
    oof_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_xgb_preds.npy'))
    oof_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_cb_preds.npy'))
    oof_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'oof_lgbm_preds.npy'))
    oof_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'oof_nn_preds.npy'))

    oof_error_preds_cb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_cb.npy'))
    oof_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'oof_error_preds_lgbm.npy'))
    oof_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'oof_error_preds_xgb.npy'))
    
    # Load Mean Model Test Predictions
    test_xgb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_xgb_preds.npy'))
    test_cb_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_cb_preds.npy'))
    test_lgbm_preds = np.load(os.path.join(PREDS_SAVE_PATH, 'test_lgbm_preds.npy'))
    test_nn_preds = np.load(os.path.join(NN_PREDS_PATH, 'test_nn_preds.npy'))

    test_error_preds_cb = np.load(os.path.join(ERR_PATH, 'test_error_preds_cb.npy'))
    test_error_preds_lgbm = np.load(os.path.join(ERR_PATH, 'test_error_preds_lgbm.npy'))
    test_error_preds_xgb = np.load(os.path.join(ERR_PATH, 'test_error_preds_xgb.npy'))

     
    
    print("All MEAN AND ERROR models predictions loaded successfully.")
    
except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required prediction file. {e}")
    print("Please ensure you have run all training notebooks and saved their predictions first.")

--- Loading all base model predictions from saved .npy files... ---
All MEAN AND ERROR models predictions loaded successfully.


In [6]:
# =============================================================================
# BLOCK 5.5: BUILD ELITE FEATURE SET FOR QUANTILE MODELS
# =============================================================================
# This block creates the definitive feature set that will be used to train the
# new quantile models. It's a "stacked" feature set, combining:
# 1. The most important raw features (to reduce noise).
# 2. The predictions from the Stage 1 Mean Ensemble (to know the central tendency).
# 3. The predictions from the Stage 2 Error Ensemble (to understand the uncertainty).
# This gives the quantile models maximum context.

# --- Step 1: Generate Feature Importance Ranking ---
# We'll train a quick XGBoost model on the full raw feature set ('X') to get a
# robust ranking of feature importance based on 'gain'.
print("\n--- Training a simple model to determine feature importance... ---")

# For performance, we can use a subset of data for importance calculation if needed
# X_sample, _, y_sample, _ = train_test_split(X, y_true, test_size=0.5, random_state=RANDOM_STATE)
# dtrain_importance = xgb.DMatrix(X_sample, label=y_sample)
dtrain_importance = xgb.DMatrix(X, label=y_true) # Using full data for best accuracy

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': RANDOM_STATE,
    'n_jobs': -1
}
bst_for_importance = xgb.train(
    params,
    dtrain_importance,
    num_boost_round=500, # A reasonable number of rounds
    verbose_eval=False
)

importance_scores = bst_for_importance.get_score(importance_type='gain')
feature_importance = pd.DataFrame({
    'Feature': importance_scores.keys(),
    'Importance': importance_scores.values()
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

print("Feature importance ranking created.")
print("Top 5 features:")
print(feature_importance.head())


# --- Step 2: Build the ELITE Feature Set ---
# Now, we construct the final feature set for our quantile models.
print("\n--- Building an ELITE feature set to reduce noise and stack predictions ---")

# Define how many of the top raw features to include
N_TOP_FEATURES = 50
elite_raw_features = feature_importance['Feature'].head(N_TOP_FEATURES).tolist()
print(f"Selected the top {N_TOP_FEATURES} raw features.")

# --- Create the training set for the quantile models (X_for_quantile) ---
# Start with the elite raw features
X_for_quantile = X[elite_raw_features].copy()

# Add the Stage 1 MEAN model predictions (OOF)
X_for_quantile['oof_mean_xgb'] = oof_xgb_preds
X_for_quantile['oof_mean_cb'] = oof_cb_preds
X_for_quantile['oof_mean_lgbm'] = oof_lgbm_preds
X_for_quantile['oof_mean_nn'] = oof_nn_preds

# Add the Stage 2 ERROR model predictions (OOF) - THIS IS THE NEW ADDITION
X_for_quantile['oof_error_xgb'] = oof_error_preds_xgb
X_for_quantile['oof_error_cb'] = oof_error_preds_cb
X_for_quantile['oof_error_lgbm'] = oof_error_preds_lgbm

# --- Create the test set for the quantile models (X_test_for_quantile) ---
# It is CRUCIAL that the test set has the exact same features in the same order.
X_test_for_quantile = X_test[elite_raw_features].copy()

# Add the Stage 1 MEAN model predictions (test)
X_test_for_quantile['oof_mean_xgb'] = test_xgb_preds
X_test_for_quantile['oof_mean_cb'] = test_cb_preds
X_test_for_quantile['oof_mean_lgbm'] = test_lgbm_preds
X_test_for_quantile['oof_mean_nn'] = test_nn_preds

# Add the Stage 2 ERROR model predictions (test)
X_test_for_quantile['oof_error_xgb'] = test_error_preds_xgb
X_test_for_quantile['oof_error_cb'] = test_error_preds_cb
X_test_for_quantile['oof_error_lgbm'] = test_error_preds_lgbm


# Final check and report
# Ensure columns are aligned
X_test_for_quantile = X_test_for_quantile[X_for_quantile.columns]

print(f"\nElite feature set for quantile models created.")
print(f"Final Shape: {X_for_quantile.shape}")
print(f"Total features include: {N_TOP_FEATURES} raw + 4 mean preds + 3 error preds = {X_for_quantile.shape[1]}")
gc.collect()


--- Training a simple model to determine feature importance... ---
Feature importance ranking created.
Top 5 features:
                   Feature    Importance
0  submarket_land_val_mean  5.867908e+13
1                total_val  4.830099e+13
2                    grade  2.531556e+13
3                join_year  2.396559e+13
4     submarket_grade_mean  1.996464e+13

--- Building an ELITE feature set to reduce noise and stack predictions ---
Selected the top 50 raw features.

Elite feature set for quantile models created.
Final Shape: (200000, 57)
Total features include: 50 raw + 4 mean preds + 3 error preds = 57


781

In [7]:
# =============================================================================
# BLOCK 6: HYPERPARAMETER TUNING FOR QUANTILE MODELS WITH OPTUNA
# =============================================================================
# We will now use Optuna to find the best hyperparameters for our two specialist
# quantile models. We tune them independently because the optimal parameters for
# predicting the lower tail of the distribution might differ from those for the
# upper tail.
#
# We use the native XGBoost API (`xgb.train`) because it allows us to use
# callbacks like `early_stopping`, which significantly speeds up the tuning
# process by not training for a fixed, excessive number of rounds.

# --- Step 1: Create a Holdout Set for Tuning ---
# For hyperparameter tuning, we need a single, consistent validation set to
# evaluate each trial's performance. We'll split off 20% of our elite
# training data for this purpose.
print("\n--- Preparing data for Optuna tuning ---")
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_for_quantile,
    y_true,
    test_size=0.20,
    random_state=RANDOM_STATE
)

# The native XGBoost API is most efficient with its internal DMatrix format.
dtrain_opt = xgb.DMatrix(X_train_opt, label=y_train_opt)
dval_opt = xgb.DMatrix(X_val_opt, label=y_val_opt)

print(f"Data split for tuning: {len(X_train_opt)} train, {len(X_val_opt)} validation samples.")

# --- Step 2: Tune the Lower-Bound Model (alpha=0.05) ---

def objective_lower(trial):
    """Optuna objective function for the lower-bound quantile model."""
    params = {
        'objective': 'reg:quantileerror',
        'quantile_alpha': 0.05,
        # The 'quantile' eval_metric is the pinball loss, which is exactly
        # what we want to minimize for this objective.
        'eval_metric': 'quantile',
        'tree_method': 'hist',
        'seed': RANDOM_STATE,
        'n_jobs': -1,

        # --- Hyperparameters to Tune ---
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'lambda': trial.suggest_float('lambda', 1e-2, 100.0, log=True), # L2 Reg
        'alpha': trial.suggest_float('alpha', 1e-2, 100.0, log=True),   # L1 Reg
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }

    # Train the model with early stopping
    bst = xgb.train(
        params,
        dtrain_opt,
        num_boost_round=2000,  # High number, early stopping will find the best
        evals=[(dval_opt, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Store the best number of boosting rounds in the trial attributes
    trial.set_user_attr('best_iteration', bst.best_iteration)

    # Return the score to be minimized (pinball loss on the validation set)
    return bst.best_score

print("\n--- Tuning the XGBoost Lower-Bound Model (alpha=0.05)... ---")
N_TRIALS = 50 # A good number for a solid search
study_lower = optuna.create_study(direction='minimize')
study_lower.optimize(objective_lower, n_trials=N_TRIALS, show_progress_bar=True)

# Extract best parameters and add the optimal number of estimators
best_params_lower = study_lower.best_params
best_params_lower['n_estimators'] = study_lower.best_trial.user_attrs['best_iteration']

print(f"\nLower-Bound Model Tuning Complete.")
print(f"Best Validation Score (Pinball Loss): {study_lower.best_value:,.4f}")
print(f"Best Parameters Found: {best_params_lower}")

# --- Step 3: Tune the Upper-Bound Model (alpha=0.95) ---

def objective_upper(trial):
    """Optuna objective function for the upper-bound quantile model."""
    # The parameter space is identical, only quantile_alpha changes.
    params = {
        'objective': 'reg:quantileerror',
        'quantile_alpha': 0.95, # The only key difference from the lower model
        'eval_metric': 'quantile',
        'tree_method': 'hist',
        'seed': RANDOM_STATE,
        'n_jobs': -1,

        # --- Hyperparameters to Tune ---
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'lambda': trial.suggest_float('lambda', 1e-2, 100.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 100.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }

    bst = xgb.train(
        params,
        dtrain_opt,
        num_boost_round=2000,
        evals=[(dval_opt, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    trial.set_user_attr('best_iteration', bst.best_iteration)
    return bst.best_score

print("\n--- Tuning the XGBoost Upper-Bound Model (alpha=0.95)... ---")
study_upper = optuna.create_study(direction='minimize')
study_upper.optimize(objective_upper, n_trials=N_TRIALS, show_progress_bar=True)

# Extract best parameters
best_params_upper = study_upper.best_params
best_params_upper['n_estimators'] = study_upper.best_trial.user_attrs['best_iteration']

print(f"\nUpper-Bound Model Tuning Complete.")
print(f"Best Validation Score (Pinball Loss): {study_upper.best_value:,.4f}")
print(f"Best Parameters Found: {best_params_upper}")

# Clean up to free memory
del dtrain_opt, dval_opt, X_train_opt, X_val_opt, y_train_opt, y_val_opt
gc.collect()

[I 2025-07-25 18:54:00,688] A new study created in memory with name: no-name-5041c12b-3691-4643-9384-faba761566df



--- Preparing data for Optuna tuning ---
Data split for tuning: 160000 train, 40000 validation samples.

--- Tuning the XGBoost Lower-Bound Model (alpha=0.05)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 18:54:06,948] Trial 0 finished with value: 6853.092056820176 and parameters: {'eta': 0.06774242455930102, 'max_depth': 5, 'subsample': 0.9297907401975716, 'colsample_bytree': 0.723331642560421, 'lambda': 0.015026612570275745, 'alpha': 17.44138634257978, 'min_child_weight': 3}. Best is trial 0 with value: 6853.092056820176.
[I 2025-07-25 18:54:16,767] Trial 1 finished with value: 6876.910690848608 and parameters: {'eta': 0.032023353865626505, 'max_depth': 8, 'subsample': 0.9221149321683481, 'colsample_bytree': 0.6807156313368463, 'lambda': 2.1625499967817516, 'alpha': 0.08197922179369121, 'min_child_weight': 15}. Best is trial 0 with value: 6853.092056820176.
[I 2025-07-25 18:54:22,697] Trial 2 finished with value: 6895.8986685040745 and parameters: {'eta': 0.03702676635881892, 'max_depth': 8, 'subsample': 0.7522119784199633, 'colsample_bytree': 0.948023536009083, 'lambda': 31.22100518074038, 'alpha': 4.184244987671205, 'min_child_weight': 2}. Best is trial 0 with value: 6

[I 2025-07-25 19:07:06,193] A new study created in memory with name: no-name-1dc32a53-e3f3-400f-9a79-026bd205e049


[I 2025-07-25 19:07:06,191] Trial 49 finished with value: 6815.43512410467 and parameters: {'eta': 0.01257153833124896, 'max_depth': 5, 'subsample': 0.7230896669246231, 'colsample_bytree': 0.9393850097154499, 'lambda': 4.316110182703192, 'alpha': 0.010588962831465362, 'min_child_weight': 19}. Best is trial 35 with value: 6802.089641840101.

Lower-Bound Model Tuning Complete.
Best Validation Score (Pinball Loss): 6,802.0896
Best Parameters Found: {'eta': 0.011179706525759989, 'max_depth': 5, 'subsample': 0.7020252586633959, 'colsample_bytree': 0.7187717643779563, 'lambda': 0.010738093929645162, 'alpha': 0.061669941714552284, 'min_child_weight': 16, 'n_estimators': 1520}

--- Tuning the XGBoost Upper-Bound Model (alpha=0.95)... ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-25 19:07:11,555] Trial 0 finished with value: 8042.0927096222385 and parameters: {'eta': 0.039920613647788455, 'max_depth': 8, 'subsample': 0.7839958704975047, 'colsample_bytree': 0.6505382733097559, 'lambda': 0.13588375646265957, 'alpha': 0.05233827925678519, 'min_child_weight': 12}. Best is trial 0 with value: 8042.0927096222385.
[I 2025-07-25 19:07:18,835] Trial 1 finished with value: 7915.0715451055985 and parameters: {'eta': 0.0364348602768857, 'max_depth': 7, 'subsample': 0.8606925696521479, 'colsample_bytree': 0.7756860117110942, 'lambda': 69.0869988765632, 'alpha': 0.016257085516729344, 'min_child_weight': 5}. Best is trial 1 with value: 7915.0715451055985.
[I 2025-07-25 19:07:25,126] Trial 2 finished with value: 7845.656902966944 and parameters: {'eta': 0.053626963192890155, 'max_depth': 5, 'subsample': 0.8823081253252743, 'colsample_bytree': 0.6964156923408096, 'lambda': 76.64140923615967, 'alpha': 0.28194921119293476, 'min_child_weight': 13}. Best is trial 2 with 

32

In [9]:
# =============================================================================
# BLOCK 7: K-FOLD TRAINING & PREDICTION FOR QUANTILE MODELS (Corrected)
# =============================================================================
# With our tuned hyperparameters, we now perform the final model training using
# a 5-Fold cross-validation strategy.
#
# THIS IS THE CORRECTED VERSION using the native `xgb.train` API to properly
# support early stopping during the final K-Fold training loop.

# --- Step 1: Setup and Initialization ---
print("\n--- Initializing K-Fold training for tuned quantile models ---")

# Define and create the directory to save our new prediction artifacts
META_QUANTILE_PATH = './meta_quantile_models/'
os.makedirs(META_QUANTILE_PATH, exist_ok=True)
print(f"Prediction artifacts will be saved to: '{META_QUANTILE_PATH}'")

# Initialize arrays to store the predictions
oof_lower_preds = np.zeros(len(X_for_quantile))
oof_upper_preds = np.zeros(len(X_for_quantile))
test_lower_preds = np.zeros(len(X_test_for_quantile))
test_upper_preds = np.zeros(len(X_test_for_quantile))

# Initialize the consistent StratifiedKFold splitter
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Prepare the full test data as a DMatrix once, for efficiency
dtest = xgb.DMatrix(X_test_for_quantile)

# --- Step 2: K-Fold Training Loop with Native API ---
for fold, (train_idx, val_idx) in enumerate(skf.split(X_for_quantile, grade_for_stratify)):
    print(f"\n===== FOLD {fold+1}/{N_SPLITS} =====")

    # Split the data for this fold
    X_train, X_val = X_for_quantile.iloc[train_idx], X_for_quantile.iloc[val_idx]
    y_train, y_val = y_true.iloc[train_idx], y_true.iloc[val_idx]

    # Convert fold data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # --- Train and Predict Lower-Bound Model ---
    print("  -> Training lower-bound model (alpha=0.05)...")
    # We remove 'n_estimators' as it's not a valid param for xgb.train
    # It's controlled by num_boost_round instead.
    num_boost_round_lower = best_params_lower.pop('n_estimators')
    
    lower_model = xgb.train(
        params=best_params_lower,
        dtrain=dtrain,
        num_boost_round=num_boost_round_lower,
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Restore n_estimators for the next fold to avoid a pop error
    best_params_lower['n_estimators'] = num_boost_round_lower
    
    # Predict on the validation set to generate OOF predictions
    oof_lower_preds[val_idx] = lower_model.predict(dval, iteration_range=(0, lower_model.best_iteration))
    # Predict on the test set and add to the running average
    test_lower_preds += lower_model.predict(dtest, iteration_range=(0, lower_model.best_iteration)) / N_SPLITS

    # --- Train and Predict Upper-Bound Model ---
    print("  -> Training upper-bound model (alpha=0.95)...")
    num_boost_round_upper = best_params_upper.pop('n_estimators')

    upper_model = xgb.train(
        params=best_params_upper,
        dtrain=dtrain,
        num_boost_round=num_boost_round_upper,
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    best_params_upper['n_estimators'] = num_boost_round_upper
    
    # Predict on the validation set for OOF
    oof_upper_preds[val_idx] = upper_model.predict(dval, iteration_range=(0, upper_model.best_iteration))
    # Predict on the test set and add to the running average
    test_upper_preds += upper_model.predict(dtest, iteration_range=(0, upper_model.best_iteration)) / N_SPLITS

    gc.collect()

print("\n--- K-Fold training complete. ---")

# --- Step 3: Save the Prediction Artifacts ---
print("\n--- Saving quantile prediction arrays to disk... ---")

np.save(os.path.join(META_QUANTILE_PATH, 'oof_lower_preds.npy'), oof_lower_preds)
print(f"Saved 'oof_lower_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'test_lower_preds.npy'), test_lower_preds)
print(f"Saved 'test_lower_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'oof_upper_preds.npy'), oof_upper_preds)
print(f"Saved 'oof_upper_preds.npy' successfully.")

np.save(os.path.join(META_QUANTILE_PATH, 'test_upper_preds.npy'), test_upper_preds)
print(f"Saved 'test_upper_preds.npy' successfully.")

print("\nAll prediction artifacts are now ready for the final calibration and submission step.")


--- Initializing K-Fold training for tuned quantile models ---
Prediction artifacts will be saved to: './meta_quantile_models/'

===== FOLD 1/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 2/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 3/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 4/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

===== FOLD 5/5 =====
  -> Training lower-bound model (alpha=0.05)...
  -> Training upper-bound model (alpha=0.95)...

--- K-Fold training complete. ---

--- Saving quantile prediction arrays to disk... ---
Saved 'oof_lower_preds.npy' successfully.
Saved 'test_lower_preds.npy' successfully.
Saved 'oof_upper_preds.npy' successfully.
Saved 'test_upper_preds.npy' successfully.

All prediction artifa

In [12]:
# =============================================================================
# BLOCK 9: FINAL ENSEMBLE OF INTERVALS
# =============================================================================
# This is the definitive final step. We have two high-quality but different
# interval predictions:
# 1. The original "Mean + Error Model" method (which is our current best).
# 2. The new "Direct Quantile Model" method (our second-best, robust alternative).
#
# By blending these two sets of interval bounds, we can create a final submission
# that is more robust and likely more accurate than either method alone.

# --- Step 1: Load All Final Prediction Bounds ---
print("\n--- Loading final bounds from both pipelines ---")

# Define paths and file names - ADJUST THESE IF YOUR FILENAMES ARE DIFFERENT
ERROR_MODEL_SUB_FILE = 'submission_final_OptimalEoE_292680.csv' # From your original best pipeline
QUANTILE_MODEL_SUB_FILE = 'submission_direct_quantile_robust_349061.csv' # From our new pipeline

try:
    # Load the submission files which contain the final, calibrated bounds
    df_error_model = pd.read_csv(ERROR_MODEL_SUB_FILE)
    df_quantile_model = pd.read_csv(QUANTILE_MODEL_SUB_FILE)

    # For blending the test set predictions
    test_lower_error = df_error_model['pi_lower'].values
    test_upper_error = df_error_model['pi_upper'].values
    test_lower_quantile = df_quantile_model['pi_lower'].values
    test_upper_quantile = df_quantile_model['pi_upper'].values

    # To find the optimal blend weight, we need the OOF predictions that
    # CORRESPOND to these submission files. We will recreate them.
    # NOTE: This assumes the variables from the previous blocks are available.
    # If not, they would need to be loaded from .npy files.
    
    # Recreate the OOF bounds from the "Mean+Error" model
    # These values were found in your `submission_final_OptimalEoE_292680.csv` run
    best_a_error_model = 1.9799
    best_b_error_model = 2.1755
    oof_error_final = np.clip((oof_error_preds_xgb * 0.60 + oof_error_preds_cb * 0.40), 0, None)
    oof_lower_error = oof_ensemble_mean - oof_error_final * best_a_error_model
    oof_upper_error = oof_ensemble_mean + oof_error_final * best_b_error_model
    
    # Recreate the OOF bounds from the "Direct Quantile" model
    best_a_quantile = 0.8118
    best_b_quantile = 1.1960
    oof_lower_quantile_raw = np.minimum(oof_lower_preds, oof_upper_preds)
    oof_upper_quantile_raw = np.maximum(oof_lower_preds, oof_upper_preds)
    oof_lower_quantile = oof_lower_quantile_raw * best_a_quantile
    oof_upper_quantile = oof_upper_quantile_raw * best_b_quantile

    print("All necessary prediction bounds loaded/recreated successfully.")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a required submission file. {e}")
    print("Please ensure both pipelines have been run and their submissions are present.")
    # exit()


# --- Step 2: Find the Optimal Blend Weight ---
print("\n--- Optimizing the blend weight for the two interval models ---")

def get_blended_winkler(weights, y_true_oof, lower_a, upper_a, lower_b, upper_b):
    """
    Calculates Winkler score for a weighted blend of two intervals.
    `weights[0]` is the weight for model A (error model).
    Weight for model B (quantile model) is `1 - weights[0]`.
    """
    weight_error_model = weights[0]
    weight_quantile_model = 1 - weight_error_model

    # Calculate the blended interval bounds
    final_lower = (lower_a * weight_error_model) + (lower_b * weight_quantile_model)
    final_upper = (upper_a * weight_error_model) + (upper_b * weight_quantile_model)

    return winkler_score(y_true_oof, final_lower, final_upper)

# We are only optimizing one parameter: the weight.
# The constraint ensures the weights for the two models sum to 1.
initial_guess = [0.5] # Start with a 50/50 blend
bounds = [(0, 1)]     # Weight must be between 0 and 1

# Run the optimizer
result_blend = minimize(
    fun=get_blended_winkler,
    x0=initial_guess,
    args=(y_true, oof_lower_error, oof_upper_error, oof_lower_quantile, oof_upper_quantile),
    method='L-BFGS-B', # A good method for single-variable, bounded problems
    bounds=bounds
)

best_weight_error_model = result_blend.x[0]
best_weight_quantile_model = 1 - best_weight_error_model
best_blended_score = result_blend.fun

# --- Step 3: Display Final Results ---
print("\n" + "="*60)
print("FINAL ENSEMBLE RESULTS")
print("="*60)
print(f"Original Error Model OOF Score:     $292,680.00") # From your filename
print(f"Direct Quantile Model OOF Score:  $349,061.10")
print(f"Final BLENDED OOF Winkler Score:    ${best_blended_score:,.2f}")
print("-" * 60)
print(f"Optimal Blend Weights:")
print(f"  -> Mean+Error Model:   {best_weight_error_model:.2%}")
print(f"  -> Direct Quantile Model: {best_weight_quantile_model:.2%}")


# --- Step 4: Create and Save the Final Blended Submission ---
print("\n--- Creating final blended submission file... ---")

# Apply the optimal blend weights to the test set predictions
final_test_lower = (test_lower_error * best_weight_error_model) + (test_lower_quantile * best_weight_quantile_model)
final_test_upper = (test_upper_error * best_weight_error_model) + (test_upper_quantile * best_weight_quantile_model)

# Final sanity check: ensure pi_upper is always greater than pi_lower
final_test_upper = np.maximum(final_test_lower + 1, final_test_upper)

# Create the submission DataFrame using the original IDs
submission_df_final = pd.DataFrame({
    'id': df_error_model['id'],
    'pi_lower': final_test_lower,
    'pi_upper': final_test_upper
})

# Save the submission file
submission_filename = f'submission_FINAL_BLEND_{int(best_blended_score)}.csv'
submission_df_final.to_csv(submission_filename, index=False)

print(f"\n'{submission_filename}' created successfully! This is your best shot!")
print("\nFinal Submission Head:")
print(submission_df_final.head())


--- Loading final bounds from both pipelines ---
All necessary prediction bounds loaded/recreated successfully.

--- Optimizing the blend weight for the two interval models ---

FINAL ENSEMBLE RESULTS
Original Error Model OOF Score:     $292,680.00
Direct Quantile Model OOF Score:  $349,061.10
Final BLENDED OOF Winkler Score:    $294,673.93
------------------------------------------------------------
Optimal Blend Weights:
  -> Mean+Error Model:   93.23%
  -> Direct Quantile Model: 6.77%

--- Creating final blended submission file... ---

'submission_FINAL_BLEND_294673.csv' created successfully! This is your best shot!

Final Submission Head:
       id       pi_lower      pi_upper
0  200000  812909.311068  1.016441e+06
1  200001  576970.144618  7.998915e+05
2  200002  450018.505119  6.540708e+05
3  200003  294414.465019  4.243956e+05
4  200004  354485.328737  7.902132e+05
