In [6]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time

# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna

print("Libraries imported successfully.")

# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    """Mean Winkler interval score for prediction intervals ``[lower, upper]``.

    Every observation is charged the interval width, plus ``2 / alpha`` times
    the distance by which the true value falls outside the interval.

    Args:
        y_true: Observed values (array-like supporting element-wise arithmetic).
        lower: Lower interval bounds, element-aligned with ``y_true``.
        upper: Upper interval bounds, element-aligned with ``y_true``.
        alpha: Miscoverage level; scales the penalty for misses.
        return_coverage: When true, also return the fraction of observations
            satisfying ``lower <= y_true <= upper``.

    Returns:
        The mean score, or a ``(mean score, coverage)`` tuple when
        ``return_coverage`` is true.
    """
    miss_weight = 2 / alpha
    undershoot = np.where(y_true < lower, miss_weight * (lower - y_true), 0)
    overshoot = np.where(y_true > upper, miss_weight * (y_true - upper), 0)
    mean_score = np.mean((upper - lower) + undershoot + overshoot)

    if not return_coverage:
        return mean_score

    inside = (y_true >= lower) & (y_true <= upper)
    return mean_score, np.mean(inside)

# --- Global Constants ---
# Number of folds for the StratifiedKFold used by both the mean and error models.
N_SPLITS = 5
# Shared seed passed to train_test_split, StratifiedKFold, TruncatedSVD and XGBoost.
RANDOM_STATE = 42
# Directory prefix prepended (by string concatenation) to the CSV file names.
DATA_PATH = './'
# Number of Optuna trials run for each of the two hyper-parameter studies.
N_OPTUNA_TRIALS = 30 # A strong number for a comprehensive search
# Miscoverage level passed as `alpha` to winkler_score during calibration.
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
# Columns removed immediately after reading: the row identifier plus the flags
# previously identified as low-variance.
drop_cols = ['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm', 'view_otherwater', 'view_other']
try:
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook exit() ends the session,
    # whereas an exception stops "Run All" here and shows a normal traceback.
    raise FileNotFoundError(
        f"Could not find 'dataset.csv' or 'test.csv' under DATA_PATH={DATA_PATH!r}."
    ) from err
print("Raw data loaded successfully.")

# --- Prepare Target Variable ---
# Independent copy of the target so later edits to df_train cannot alter it.
y_true = df_train['sale_price'].copy()
# The mean-error model works best when predicting the raw price directly,
# so the target is NOT log-transformed here.
# 'sale_price' is deliberately left inside df_train at this point; the feature
# engineering function drops it when it builds the feature matrices.

print("Setup complete.")

Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [7]:
# =============================================================================
# BLOCK 2: SYNTHESIZED FEATURE ENGINEERING (CORRECTED)
# =============================================================================
# Progress banner marking the start of the feature-engineering cell.
print("--- Starting Block 2: Synthesized Feature Engineering ---")

def create_synthesized_features(df_train, df_test):
    """Build aligned train/test feature matrices from the raw frames.

    Both frames are stacked so every transformation is applied identically to
    train and test rows, then split apart again. The input frames are not
    modified.

    Note: the TF-IDF vocabularies and SVD projections are fitted on the
    combined train+test text (no target information is involved). Some
    hand-written interactions (e.g. ``grade_x_sqft``) duplicate a brute-force
    pair (``sqft_x_grade``); both are kept so the feature set is unchanged.

    Args:
        df_train: Raw training frame; must contain ``sale_price`` and the
            columns referenced below.
        df_test: Raw test frame with the same feature columns.

    Returns:
        ``(X, X_test)``: numeric feature frames with identical column order,
        indexed like ``df_train`` and ``df_test`` respectively. ``is_train``
        and ``sale_price`` are removed and remaining NaNs are filled with 0.
    """
    # Tag row origin on copies (assign returns new frames) so the caller's
    # DataFrames are left untouched, then stack with a clean 0-based index.
    all_data = pd.concat(
        [df_train.assign(is_train=1), df_test.assign(is_train=0)], axis=0
    ).reset_index(drop=True)

    # --- A) Advanced Date Features ---
    print("Creating advanced date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    # Cyclical encoding so December and January end up adjacent.
    month_angle = 2 * np.pi * all_data['sale_month'] / 12
    all_data['month_sin'] = np.sin(month_angle)
    all_data['month_cos'] = np.cos(month_angle)

    # --- B) Domain-Specific Ratio and Interaction Features ---
    print("Creating domain-specific ratio and interaction features...")

    # Property age at sale time, floored at 0 (vectorized; NaNs are preserved).
    all_data['age'] = (all_data['sale_year'] - all_data['year_built']).clip(lower=0)

    # Value ratios (land vs. improvement value); epsilon avoids division by zero.
    epsilon = 1e-6
    all_data['imp_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['val_per_sqft'] = all_data['imp_val'] / (all_data['sqft'] + epsilon)

    # Space ratios.
    all_data['lot_to_house_ratio'] = all_data['sqft_lot'] / (all_data['sqft'] + epsilon)
    all_data['sqft_vs_sqft1'] = all_data['sqft'] - all_data['sqft_1']  # Difference in size measurement

    # Grade interactions.
    all_data['grade_x_sqft'] = all_data['grade'] * all_data['sqft']
    all_data['grade_x_age'] = all_data['grade'] * all_data['age']
    all_data['grade_x_imp_val'] = all_data['grade'] * all_data['imp_val']

    # --- C) Brute-Force Numerical Interactions ---
    print("Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Every unordered pair (i < j), named '<first>_x_<second>' in NUMS order.
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # --- D) Polynomial Features for Key Predictors ---
    print("Creating polynomial features for key predictors...")
    for col in ['area', 'grade', 'age', 'sqft', 'imp_val']:
        all_data[f'{col}_cub'] = all_data[col]**3

    # --- E) TF-IDF Text Features ---
    print("Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)

    n_svd_components = 8
    svd_frames = []
    for col in text_cols:
        # Character n-gram TF-IDF compressed to a few dense SVD components.
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        svd = TruncatedSVD(n_components=n_svd_components, random_state=RANDOM_STATE)
        svd_frames.append(
            pd.DataFrame(
                svd.fit_transform(tfidf_matrix),
                columns=[f'{col}_tfidf_svd_{i}' for i in range(n_svd_components)],
                index=all_data.index,
            )
        )
    # Single concat (instead of one per text column) keeps the same column order.
    all_data = pd.concat([all_data, *svd_frames], axis=1)

    # --- F) Log transform some of the new interaction features ---
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            # log1p computes log(1 + x), so zero products map to 0.
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- G) Final Cleanup ---
    print("Finalizing feature set...")
    all_data = all_data.drop(columns=['sale_date'] + text_cols)
    all_data = all_data.fillna(0)

    # Separate final datasets; the target and the origin flag are not features.
    non_feature_cols = ['is_train', 'sale_price']
    X = all_data[all_data['is_train'] == 1].drop(columns=non_feature_cols)
    X_test = all_data[all_data['is_train'] == 0].drop(columns=non_feature_cols)

    # Restore each frame's original index (the 'id' column itself was dropped
    # when the raw data was loaded, so this is the positional index).
    X.index = df_train.index
    X_test.index = df_test.index

    # Guarantee identical column order between train and test.
    X_test = X_test[X.columns]

    return X, X_test

# Build the train/test feature matrices from the raw frames loaded in Block 1.
# We need to re-run this from the original dataframes
X, X_test = create_synthesized_features(df_train, df_test)

print(f"\nSynthesized FE complete. Total features: {X.shape[1]}")
# gc.collect() is the last expression of the cell, so its return value is
# echoed as the cell output.
gc.collect()

--- Starting Block 2: Synthesized Feature Engineering ---
Creating advanced date features...
Creating domain-specific ratio and interaction features...
Creating brute-force numerical interaction features...
Creating polynomial features for key predictors...
Creating TF-IDF features for text columns...
Finalizing feature set...

Synthesized FE complete. Total features: 126


14

In [8]:
# =============================================================================
# BLOCK 3: TWO-STAGE TUNING, TRAINING, AND SUBMISSION
# =============================================================================

# Progress banner for the modeling cell.
print("\n--- Starting Block 3: Two-Stage Modeling Pipeline ---")

# --- STAGE 1, PART 1: Tuning Mean Prediction Model ---
# The objective defined next is minimized by an Optuna study.
print("\n# STAGE 1, PART 1: Tuning Mean Prediction Model...")
def objective_mean(trial):
    """Optuna objective for the stage-1 mean-price model.

    Trains an XGBoost regressor with trial-suggested hyper-parameters on a
    fixed 80/20 split of ``X`` / ``y_true`` (same seed every trial) and returns
    the hold-out RMSE. The hold-out set is also used for early stopping.

    Args:
        trial: The Optuna trial supplying hyper-parameter suggestions.

    Returns:
        RMSE of the fitted model on the hold-out split.
    """
    train_x, val_x, train_y, val_y = train_test_split(
        X, y_true, test_size=0.2, random_state=RANDOM_STATE
    )

    # Settings held constant across trials.
    fixed_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
    }
    # Search space explored by the study.
    searched_params = {
        'eta': trial.suggest_float('eta', 0.02, 0.05),
        'max_depth': trial.suggest_int('max_depth', 8, 10),
        'subsample': trial.suggest_float('subsample', 0.72, 0.89),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.74, 0.89),
        'lambda': trial.suggest_float('lambda', 2.1, 4.5),
        'alpha': trial.suggest_float('alpha', 1e-5, 5e-4, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
    }

    regressor = xgb.XGBRegressor(
        **fixed_params,
        **searched_params,
        n_estimators=2500,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        early_stopping_rounds=100,
    )
    regressor.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=False)

    holdout_preds = regressor.predict(val_x)
    return np.sqrt(mean_squared_error(val_y, holdout_preds))

# Seed the sampler so the hyper-parameter search is repeatable; without it the
# study draws different trials on every run even though the data split is seeded.
study_mean = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_mean.optimize(objective_mean, n_trials=N_OPTUNA_TRIALS)
best_params_mean = study_mean.best_params
print(f"# Mean Model Tuning Complete. Best Validation RMSE: ${study_mean.best_value:,.2f}")

# --- STAGE 1, PART 2: K-Fold Training of Mean Model ---
print("\n# STAGE 1, PART 2: K-Fold Training of Mean Model...")
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_mean_preds = np.zeros(len(X))       # out-of-fold prediction for every training row
test_mean_preds = np.zeros(len(X_test)) # average of the per-fold test predictions
# Stratify folds on 'grade'. The column is already part of X (row-aligned by
# construction), so the CSV does not need to be read a second time.
grade_for_stratify = X['grade']
for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    print(f"  Mean Model - Fold {fold+1}/{N_SPLITS}...")
    model = xgb.XGBRegressor(
        **best_params_mean,
        n_estimators=2500,
        objective='reg:squarederror',
        eval_metric='rmse',
        tree_method='hist',
        random_state=RANDOM_STATE,
        n_jobs=-1,
        early_stopping_rounds=100,
    )
    # NOTE(review): the validation fold drives early stopping and is then
    # predicted for the OOF vector, so OOF errors may be slightly optimistic.
    model.fit(X.iloc[train_idx], y_true.iloc[train_idx], eval_set=[(X.iloc[val_idx], y_true.iloc[val_idx])], verbose=False)
    oof_mean_preds[val_idx] = model.predict(X.iloc[val_idx])
    test_mean_preds += model.predict(X_test) / N_SPLITS

# --- NEW: CALCULATE AND PRINT FINAL OOF RMSE ---
final_mean_rmse = np.sqrt(mean_squared_error(y_true, oof_mean_preds))
print(f"\n# Mean model K-Fold training complete.")
print(f"# Final OOF RMSE for Mean Model: ${final_mean_rmse:,.2f}")
print("-" * 50)

[I 2025-07-08 21:30:34,304] A new study created in memory with name: no-name-14016501-8c0a-4e2a-bed7-3a7ae299cbf1



--- Starting Block 3: Two-Stage Modeling Pipeline ---

# STAGE 1, PART 1: Tuning Mean Prediction Model...


[I 2025-07-08 21:31:40,169] Trial 0 finished with value: 100470.74607068468 and parameters: {'eta': 0.021137828486356834, 'max_depth': 10, 'subsample': 0.8394733332069202, 'colsample_bytree': 0.8676485885208814, 'lambda': 2.448142734214261, 'alpha': 3.764944983255964e-05, 'min_child_weight': 3}. Best is trial 0 with value: 100470.74607068468.
[I 2025-07-08 21:32:32,316] Trial 1 finished with value: 100119.33743288556 and parameters: {'eta': 0.02881540129765727, 'max_depth': 9, 'subsample': 0.72873596380745, 'colsample_bytree': 0.8092045551351673, 'lambda': 2.908707509797906, 'alpha': 0.0002839704281298098, 'min_child_weight': 1}. Best is trial 1 with value: 100119.33743288556.
[I 2025-07-08 21:33:21,988] Trial 2 finished with value: 99993.28233436485 and parameters: {'eta': 0.020469782665619496, 'max_depth': 9, 'subsample': 0.7672850375627377, 'colsample_bytree': 0.8720928514378324, 'lambda': 4.458338077762658, 'alpha': 3.2738100807219774e-05, 'min_child_weight': 3}. Best is trial 2 wi

# Mean Model Tuning Complete. Best Validation RMSE: $99,545.97

# STAGE 1, PART 2: K-Fold Training of Mean Model...
  Mean Model - Fold 1/5...
  Mean Model - Fold 2/5...
  Mean Model - Fold 3/5...
  Mean Model - Fold 4/5...
  Mean Model - Fold 5/5...

# Mean model K-Fold training complete.
# Final OOF RMSE for Mean Model: $100,467.33
--------------------------------------------------


In [9]:
# =============================================================================
# STAGE 2: ERROR MODEL FEATURE ENGINEERING, TUNING, AND TRAINING
# =============================================================================

# --- Define the Error Target ---
# Absolute out-of-fold residual of the mean model; this is what stage 2 predicts.
error_target = np.abs(y_true - oof_mean_preds)


# =============================================================================
# --- PART 1A: FEATURE ENGINEERING FOR THE ERROR MODEL ---
# =============================================================================
print("\n# Performing feature engineering for the error model...")

# Work on copies so the stage-1 feature matrices stay unchanged.
X_for_error = X.copy()
X_test_for_error = X_test.copy()

# Feature 1: The mean prediction itself (OOF for train, fold-average for test).
print("  Adding 'mean_pred_oof' feature...")
X_for_error['mean_pred_oof'] = oof_mean_preds
X_test_for_error['mean_pred_oof'] = test_mean_preds

# Feature 2: Deviation from the average prediction. The train OOF mean is the
# reference point for both frames.
print("  Adding 'pred_deviation' feature...")
avg_oof_pred = oof_mean_preds.mean()
X_for_error['pred_deviation'] = np.abs(X_for_error['mean_pred_oof'] - avg_oof_pred)
X_test_for_error['pred_deviation'] = np.abs(X_test_for_error['mean_pred_oof'] - avg_oof_pred)

# Feature 3: Binned predictions (deciles learned on the train OOF predictions).
print("  Adding 'pred_bin' feature...")
X_for_error['pred_bin'], bin_edges = pd.qcut(X_for_error['mean_pred_oof'],
                                             q=10,
                                             labels=False,
                                             retbins=True,
                                             duplicates='drop')
# A test prediction below the first edge or above the last edge falls outside
# every bin, which pd.cut reports as NaN; it would then be coded -1, a value the
# error model never sees during training. Clip into the edge range first so such
# rows land in the lowest / highest bin instead.
test_pred_in_range = X_test_for_error['mean_pred_oof'].clip(lower=bin_edges[0], upper=bin_edges[-1])
X_test_for_error['pred_bin'] = pd.cut(test_pred_in_range,
                                      bins=bin_edges,
                                      labels=False,
                                      include_lowest=True).fillna(-1).astype(int)  # -1 now only for NaN predictions

print(f"\n# Feature engineering complete. Total features for error model: {X_for_error.shape[1]}")
# =============================================================================


# --- PART 1B: Tuning the Error Model on the NEW Feature Set ---
print("\n# STAGE 2, PART 1: Tuning Error Prediction Model (on new features)...")

def objective_error(trial):
    """Optuna objective for the stage-2 error model.

    Trains an XGBoost regressor with trial-suggested hyper-parameters on a
    fixed 80/20 split of ``X_for_error`` / ``error_target`` (same seed every
    trial) and returns the hold-out RMSE. The hold-out set is also used for
    early stopping.

    Args:
        trial: The Optuna trial supplying hyper-parameter suggestions.

    Returns:
        RMSE of the fitted model on the hold-out split.
    """
    # X_for_error carries the extra prediction-derived features.
    train_x, val_x, train_y, val_y = train_test_split(
        X_for_error, error_target, test_size=0.2, random_state=RANDOM_STATE
    )

    # Settings held constant across trials.
    fixed_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
    }
    # Search space explored by the study.
    searched_params = {
        'eta': trial.suggest_float('eta', 0.01, 0.043),
        'max_depth': trial.suggest_int('max_depth', 7, 10),
        'subsample': trial.suggest_float('subsample', 0.8, 0.99),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.692, 0.75),
        'lambda': trial.suggest_float('lambda', 0.345, 0.765, log=True),
        'alpha': trial.suggest_float('alpha', 0.455, 0.69),
    }

    regressor = xgb.XGBRegressor(
        **fixed_params,
        **searched_params,
        n_estimators=2500,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        early_stopping_rounds=100,
    )
    regressor.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=False)

    holdout_preds = regressor.predict(val_x)
    return np.sqrt(mean_squared_error(val_y, holdout_preds))

# Seed the sampler so the hyper-parameter search is repeatable; without it the
# study draws different trials on every run even though the data split is seeded.
study_error = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_error.optimize(objective_error, n_trials=N_OPTUNA_TRIALS)
best_params_error = study_error.best_params
print(f"\n# Error Model Tuning Complete. Best Validation RMSE: ${study_error.best_value:,.2f}")


# --- PART 2: K-Fold Training of Error Model with Best Params ---
print("\n# STAGE 2, PART 2: K-Fold Training of Error Model...")
oof_error_preds = np.zeros(len(X))        # out-of-fold error prediction per training row
test_error_preds = np.zeros(len(X_test))  # average of the per-fold test predictions

# Fixed XGBoost settings merged with the tuned values (tuned keys are unpacked
# last, so they would take precedence on any overlap).
final_params_error = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    **best_params_error
}

# Same splitter object and stratification labels as the mean model, applied to
# the error-model feature matrix.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_for_error, grade_for_stratify)):
    print(f"  Error Model - Fold {fold+1}/{N_SPLITS}...")
    model = xgb.XGBRegressor(**final_params_error, n_estimators=2500, early_stopping_rounds=100)

    X_train, X_val = X_for_error.iloc[train_idx], X_for_error.iloc[val_idx]
    y_train, y_val = error_target.iloc[train_idx], error_target.iloc[val_idx]

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    oof_error_preds[val_idx] = model.predict(X_val)
    test_error_preds += model.predict(X_test_for_error) / N_SPLITS

# --- Calculate and Print Final OOF RMSE ---
final_error_rmse = np.sqrt(mean_squared_error(error_target, oof_error_preds))
print(f"\n# Error model K-Fold training complete.")
print(f"# Final OOF RMSE for Error Model: ${final_error_rmse:,.2f}")
print("-" * 50)


# Performing feature engineering for the error model...
  Adding 'mean_pred_oof' feature...
  Adding 'pred_deviation' feature...
  Adding 'pred_bin' feature...


[I 2025-07-08 21:55:59,796] A new study created in memory with name: no-name-2f5ee471-46c4-4d48-8df0-4743eb928ed7



# Feature engineering complete. Total features for error model: 129

# STAGE 2, PART 1: Tuning Error Prediction Model (on new features)...


[I 2025-07-08 21:56:11,347] Trial 0 finished with value: 63832.8877184405 and parameters: {'eta': 0.024369832803053784, 'max_depth': 8, 'subsample': 0.9254193579648227, 'colsample_bytree': 0.7480713045559889, 'lambda': 0.401579908612991, 'alpha': 0.5170024596156164}. Best is trial 0 with value: 63832.8877184405.
[I 2025-07-08 21:56:20,586] Trial 1 finished with value: 64284.18612174746 and parameters: {'eta': 0.04037040659197692, 'max_depth': 10, 'subsample': 0.873090966105899, 'colsample_bytree': 0.7486149531632349, 'lambda': 0.6068511622257085, 'alpha': 0.46954794597295985}. Best is trial 0 with value: 63832.8877184405.
[I 2025-07-08 21:56:31,945] Trial 2 finished with value: 63987.25912455223 and parameters: {'eta': 0.026008751416083643, 'max_depth': 7, 'subsample': 0.9834624471078448, 'colsample_bytree': 0.7227054935595059, 'lambda': 0.40523964074614344, 'alpha': 0.5193664261515867}. Best is trial 0 with value: 63832.8877184405.
[I 2025-07-08 21:56:50,421] Trial 3 finished with val


# Error Model Tuning Complete. Best Validation RMSE: $63,747.02

# STAGE 2, PART 2: K-Fold Training of Error Model...
  Error Model - Fold 1/5...
  Error Model - Fold 2/5...
  Error Model - Fold 3/5...
  Error Model - Fold 4/5...
  Error Model - Fold 5/5...

# Error model K-Fold training complete.
# Final OOF RMSE for Error Model: $63,560.57
--------------------------------------------------


In [10]:
# =============================================================================
# FINAL ASYMMETRIC CALIBRATION AND SUBMISSION (ULTIMATE ROBUST VERSION)
# =============================================================================

print("\n--- Final Asymmetric Calibration ---")

# --- Safely reload y_true to ensure it's available ---
# Read through DATA_PATH so this cell follows the same data location as the
# other loads in the notebook instead of a hard-coded './'.
y_true = pd.read_csv(DATA_PATH + 'dataset.csv')['sale_price']

# Predicted absolute errors cannot be negative; clip before scaling them.
oof_error_final = np.clip(oof_error_preds, 0, None)
best_a, best_b, best_metric = 2.0, 2.0, float('inf')

# Grid-search separate multipliers for the lower (a) and upper (b) half-widths,
# minimizing the OOF Winkler score.
for a in np.arange(1.90, 2.31, 0.01):
    # The lower bound depends only on `a`, so compute it once per outer step.
    low = oof_mean_preds - oof_error_final * a
    for b in np.arange(2.10, 2.51, 0.01):
        high = oof_mean_preds + oof_error_final * b
        metric, coverage = winkler_score(y_true, low, high, alpha=COMPETITION_ALPHA, return_coverage=True)
        if metric < best_metric:
            best_metric = metric
            best_a, best_b = a, b
            print(f"New Best! a={best_a:.2f}, b={best_b:.2f}, Score={best_metric:,.2f}, Cov={coverage:.2%}")
print(f"\nGrid search complete. Final OOF Score: {best_metric:,.2f}. Best multipliers: a={best_a:.2f}, b={best_b:.2f}")


# --- Create Final Submission ---
print("\nCreating final submission file...")
test_error_final = np.clip(test_error_preds, 0, None)
final_lower = test_mean_preds - test_error_final * best_a
final_upper = test_mean_preds + test_error_final * best_b
# Guard: never emit an interval whose upper bound is below its lower bound.
final_upper = np.maximum(final_lower, final_upper)

# 'id' was dropped from df_test when the data was loaded, so read it back from
# the test file (via DATA_PATH, like the other loads).
test_ids = pd.read_csv(DATA_PATH + 'test.csv', usecols=['id'])['id']
submission_df = pd.DataFrame({
    'id': test_ids,
    'pi_lower': final_lower,
    'pi_upper': final_upper
})

submission_df.to_csv('submission_final_v6.csv', index=False)
print("\n'submission_final_v6.csv' created successfully!")
display(submission_df.head())


--- Final Asymmetric Calibration ---
New Best! a=1.90, b=2.10, Score=305,861.00, Cov=88.89%
New Best! a=1.90, b=2.11, Score=305,815.11, Cov=88.95%
New Best! a=1.90, b=2.12, Score=305,775.35, Cov=89.01%
New Best! a=1.90, b=2.13, Score=305,742.42, Cov=89.09%
New Best! a=1.90, b=2.14, Score=305,716.01, Cov=89.15%
New Best! a=1.90, b=2.15, Score=305,696.73, Cov=89.21%
New Best! a=1.90, b=2.16, Score=305,683.98, Cov=89.27%
New Best! a=1.90, b=2.17, Score=305,677.79, Cov=89.33%
New Best! a=1.91, b=2.14, Score=305,666.67, Cov=89.24%
New Best! a=1.91, b=2.15, Score=305,647.39, Cov=89.30%
New Best! a=1.91, b=2.16, Score=305,634.64, Cov=89.36%
New Best! a=1.91, b=2.17, Score=305,628.45, Cov=89.42%
New Best! a=1.92, b=2.14, Score=305,627.43, Cov=89.33%
New Best! a=1.92, b=2.15, Score=305,608.15, Cov=89.39%
New Best! a=1.92, b=2.16, Score=305,595.40, Cov=89.45%
New Best! a=1.92, b=2.17, Score=305,589.21, Cov=89.51%
New Best! a=1.93, b=2.15, Score=305,578.97, Cov=89.49%
New Best! a=1.93, b=2.16, S

Unnamed: 0,id,pi_lower,pi_upper
0,200000,842937.644678,1058807.0
1,200001,583226.265234,849875.2
2,200002,459273.298828,686147.1
3,200003,283237.266577,408661.7
4,200004,370852.20293,874285.3
