In [3]:
# =============================================================================
# BLOCK 1: SETUP AND IMPORTS (GENSIM REMOVED)
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import pandas as pd
import numpy as np
import gc
import re
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import optuna

print("--- BLOCK 1: SETUP ---")
print("# Libraries imported successfully.")

# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)

# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 30
COMPETITION_ALPHA = 0.1
print("# Global constants defined.")

# --- Load Raw Data ---
try:
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm', 'view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("# Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()

# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
print("# Target variable 'y_true' created.")
print("# Setup complete.")
print("-" * 50)


--- BLOCK 1: SETUP ---
# Libraries imported successfully.
# Global constants defined.
# Raw data loaded successfully.
# Target variable 'y_true' created.
# Setup complete.
--------------------------------------------------


In [4]:
# =============================================================================
# BLOCK 2 (UPGRADED): ADDING PEER-COMPARISON FEATURES
# =============================================================================
print("\n--- Starting Block 2: Final Feature Engineering with Peer-Comparisons ---")

def create_ultimate_features(df_train, df_test):
    # Combine for consistent processing and reset the index
    df_train['is_train'] = 1
    df_test['is_train'] = 0
    train_ids = df_train.index
    test_ids = df_test.index
    all_data = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

    # --- A) Foundational & Brute-Force Features ---
    print("# Creating foundational and brute-force interaction features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['year'] = all_data['sale_date'].dt.year
    all_data['month'] = all_data['sale_date'].dt.month
    all_data['year_diff'] = all_data['year'] - all_data['year_built']
    all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
    
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'total_sqft', 'grade', 'year_diff']
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # --- B) TF-IDF Text Features (Proven Winner) ---
    print("# Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)
        
    # --- C) NEW: Peer-Comparison & Contextual Features ---
    print("# Creating NEW peer-comparison (relative) features...")
    # First, create location clusters to define "neighborhoods"
    kmeans = KMeans(n_clusters=40, random_state=RANDOM_STATE, n_init='auto')
    all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])
    
    # We calculate the group averages ONLY from the training data to prevent leakage
    train_copy_for_aggs = all_data[all_data['is_train'] == 1].copy()
    
    group_cols = ['location_cluster', 'city', 'year'] # Define our peer groups
    
    for group_col in group_cols:
        # Define the stats we want to calculate for each group
        aggs = {
            'grade': ['mean', 'std'],
            'total_sqft': ['mean', 'std'],
            'year_diff': ['mean', 'std'], # Average age of houses in the group
        }
        
        group_aggs = train_copy_for_aggs.groupby(group_col).agg(aggs)
        group_aggs.columns = [f'{c[0]}_{c[1]}_by_{group_col}' for c in group_aggs.columns]
        
        # Merge these new "group stats" back to the main dataframe
        all_data = all_data.merge(group_aggs, on=group_col, how='left')

        # --- Now create the powerful relative features ---
        # How does this house's grade compare to its peers?
        all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_mean_by_{group_col}']
        # How does its size compare?
        all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_mean_by_{group_col}']
        # How does its age compare (in terms of standard deviations)?
        all_data[f'age_zscore_{group_col}'] = (all_data['year_diff'] - all_data[f'year_diff_mean_by_{group_col}']) / (all_data[f'year_diff_std_by_{group_col}'] + 1e-6)

    # --- D) Final Cleanup ---
    print("# Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket', 'latitude', 'longitude']
    all_data = all_data.drop(columns=cols_to_drop, errors='ignore')
    all_data.fillna(0, inplace=True)

    # Separate final datasets
    X = all_data[all_data['is_train'] == 1].drop(columns=['is_train', 'sale_price'])
    X_test = all_data[all_data['is_train'] == 0].drop(columns=['is_train', 'sale_price'])
    X.index = train_ids
    X_test.index = test_ids
    X_test = X_test[X.columns]
    
    return X, X_test

# We need to re-run this from the original dataframes
X, X_test = create_ultimate_features(df_train, df_test)

print(f"\nUltimate FE complete. Total features: {X.shape[1]}")
gc.collect()


--- Starting Block 2: Final Feature Engineering with Peer-Comparisons ---
# Creating foundational and brute-force interaction features...
# Creating TF-IDF features for text columns...
# Creating NEW peer-comparison (relative) features...
# Finalizing feature set...

Ultimate FE complete. Total features: 131


290

In [5]:
# =============================================================================
# BLOCK 3: TWO-STAGE TUNING, TRAINING, AND SUBMISSION (CORRECTED)
# =============================================================================
print("\n--- Starting Block 3: Two-Stage Modeling Pipeline ---")

# --- STAGE 1, PART 1: Tuning Mean Prediction Model ---
print("\n# STAGE 1, PART 1: Tuning Mean Prediction Model...")
print("# EXPLANATION: Using Optuna to find the best XGBoost settings for our new feature set.")
def objective_mean(trial):
    # This function uses the X and y_true created in the previous blocks
    train_x, val_x, train_y, val_y = train_test_split(X, y_true, test_size=0.2, random_state=RANDOM_STATE)
    
    # We are using the scikit-learn wrapper (XGBRegressor), which is compatible with most environments
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'tree_method': 'hist',
        'eta': trial.suggest_float('eta', 0.01, 0.08, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-4, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-4, 10.0, log=True),
    }
    model = xgb.XGBRegressor(**params, n_estimators=1500, random_state=RANDOM_STATE, n_jobs=-1, early_stopping_rounds=50)
    model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=False)
    preds = model.predict(val_x)
    return np.sqrt(mean_squared_error(val_y, preds))

study_mean = optuna.create_study(direction='minimize')
study_mean.optimize(objective_mean, n_trials=N_OPTUNA_TRIALS)
best_params_mean = study_mean.best_params
print(f"# Mean Model Tuning Complete. Best Validation RMSE: ${study_mean.best_value:,.2f}")

# --- STAGE 1, PART 2: K-Fold Training of Mean Model ---
print("\n# STAGE 1, PART 2: K-Fold Training of Mean Model...")
print("# EXPLANATION: Using the best settings to train 5 models for robust mean predictions.")

# THE FIX: We need the 'grade' column for stratification. The safest way is to reload it.
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_mean_preds = np.zeros(len(X))
test_mean_preds = np.zeros(len(X_test))
for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    print(f"  Mean Model - Fold {fold+1}/{N_SPLITS}...")
    model = xgb.XGBRegressor(**best_params_mean, n_estimators=2000, objective='reg:squarederror', eval_metric='rmse', tree_method='hist', random_state=RANDOM_STATE, n_jobs=-1, early_stopping_rounds=100)
    model.fit(X.iloc[train_idx], y_true.iloc[train_idx], eval_set=[(X.iloc[val_idx], y_true.iloc[val_idx])], verbose=False)
    oof_mean_preds[val_idx] = model.predict(X.iloc[val_idx])
    test_mean_preds += model.predict(X_test) / N_SPLITS

# --- STAGE 2: K-Fold Training of Error Model ---
print("\n# STAGE 2: K-Fold Training of Error Model...")
print("# EXPLANATION: Training a second model to predict the size of the first model's errors.")
error_target = np.abs(y_true - oof_mean_preds)
X_for_error = X.copy()
X_for_error['mean_pred_oof'] = oof_mean_preds
X_test_for_error = X_test.copy()
X_test_for_error['mean_pred_oof'] = test_mean_preds

# We will not tune the error model for speed, but use good default parameters
params_error = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'tree_method': 'hist', 'eta': 0.03, 'max_depth': 7, 'random_state': RANDOM_STATE}
oof_error_preds = np.zeros(len(X))
test_error_preds = np.zeros(len(X_test))
for fold, (train_idx, val_idx) in enumerate(skf.split(X_for_error, grade_for_stratify)):
    print(f"  Error Model - Fold {fold+1}/{N_SPLITS}...")
    model = xgb.XGBRegressor(**params_error, n_estimators=1500, n_jobs=-1, early_stopping_rounds=100)
    model.fit(X_for_error.iloc[train_idx], error_target.iloc[train_idx], eval_set=[(X_for_error.iloc[val_idx], error_target.iloc[val_idx])], verbose=False)
    oof_error_preds[val_idx] = model.predict(X_for_error.iloc[val_idx])
    test_error_preds += model.predict(X_test_for_error) / N_SPLITS

# --- FINAL ASYMMETRIC CALIBRATION AND SUBMISSION ---
print("\n# FINAL STAGE: Asymmetric Calibration & Submission...")
print("# EXPLANATION: Finding the best 'stretch' multipliers (a and b) for our intervals to hit 90% coverage.")
oof_error_final = np.clip(oof_error_preds, 0, None) # Ensure error is not negative
best_a, best_b, best_metric = 2.0, 2.0, float('inf')
for a in np.arange(1.90, 2.31, 0.01):
    for b in np.arange(2.10, 2.51, 0.01):
        low = oof_mean_preds - oof_error_final * a
        high = oof_mean_preds + oof_error_final * b
        metric, coverage = winkler_score(y_true, low, high, alpha=COMPETITION_ALPHA, return_coverage=True)
        # We only care about solutions that are close to the target coverage
        if coverage > 0.88 and metric < best_metric:
            best_metric = metric
            best_a, best_b = a, b

print(f"\n# Grid search complete.")
print(f"# Final OOF Winkler Score: {best_metric:,.2f}")
print(f"# Best multipliers found: a (lower)={best_a:.2f}, b (upper)={best_b:.2f}")


[I 2025-07-06 18:22:25,739] A new study created in memory with name: no-name-9ebb5466-445b-4179-9912-80ce31e78e8b



--- Starting Block 3: Two-Stage Modeling Pipeline ---

# STAGE 1, PART 1: Tuning Mean Prediction Model...
# EXPLANATION: Using Optuna to find the best XGBoost settings for our new feature set.


[I 2025-07-06 18:22:59,740] Trial 0 finished with value: 101084.31775503063 and parameters: {'eta': 0.032748807472869006, 'max_depth': 9, 'subsample': 0.7207252793137036, 'colsample_bytree': 0.912763112955326, 'lambda': 0.06181255372807286, 'alpha': 0.19193762440484527}. Best is trial 0 with value: 101084.31775503063.
[I 2025-07-06 18:23:25,387] Trial 1 finished with value: 101116.1265080897 and parameters: {'eta': 0.04555982599041849, 'max_depth': 8, 'subsample': 0.909441087279321, 'colsample_bytree': 0.7346101487444393, 'lambda': 0.016010670746096235, 'alpha': 0.030237178001563465}. Best is trial 0 with value: 101084.31775503063.
[I 2025-07-06 18:24:30,863] Trial 2 finished with value: 102817.06566519004 and parameters: {'eta': 0.010691927101240695, 'max_depth': 11, 'subsample': 0.7877667898407389, 'colsample_bytree': 0.7694344720198496, 'lambda': 0.0003225109581853205, 'alpha': 0.4538085146886243}. Best is trial 0 with value: 101084.31775503063.
[I 2025-07-06 18:27:24,395] Trial 3 f

# Mean Model Tuning Complete. Best Validation RMSE: $100,536.18

# STAGE 1, PART 2: K-Fold Training of Mean Model...
# EXPLANATION: Using the best settings to train 5 models for robust mean predictions.
  Mean Model - Fold 1/5...
  Mean Model - Fold 2/5...
  Mean Model - Fold 3/5...
  Mean Model - Fold 4/5...
  Mean Model - Fold 5/5...

# STAGE 2: K-Fold Training of Error Model...
# EXPLANATION: Training a second model to predict the size of the first model's errors.
  Error Model - Fold 1/5...
  Error Model - Fold 2/5...
  Error Model - Fold 3/5...
  Error Model - Fold 4/5...
  Error Model - Fold 5/5...

# FINAL STAGE: Asymmetric Calibration & Submission...
# EXPLANATION: Finding the best 'stretch' multipliers (a and b) for our intervals to hit 90% coverage.

# Grid search complete.
# Final OOF Winkler Score: 310,800.50
# Best multipliers found: a (lower)=1.96, b (upper)=2.19


In [6]:
# =============================================================================
# FINAL SUBMISSION BLOCK (CORRECTED FOR MEAN-ERROR MODEL)
# =============================================================================
print("--- Creating Final Submission File with Correct Logic ---")

# --- Reload the original test set to get the correct index ---
original_test_ids = pd.read_csv('./test.csv', usecols=['id'])['id']

# --- Use the best found multipliers from your last successful run ---
# The grid search found these values were optimal
best_a = 1.97
best_b = 2.19 # Corrected to the final best value from your log
print(f"Applying best multipliers: a={best_a}, b={best_b}")

# --- Use the prediction arrays directly (THEY ARE ALREADY IN DOLLAR SCALE) ---
# The test_mean_preds and test_error_preds arrays are still in memory.
# DO NOT apply np.expm1() to them.
test_mean_final = test_mean_preds
test_error_final = np.clip(test_error_preds, 0, None) # Still clip error to be non-negative

final_lower = test_mean_final - test_error_final * best_a
final_upper = test_mean_final + test_error_final * best_b

# Final safety checks
final_upper = np.maximum(final_lower, final_upper)

# --- Create submission dataframe with the CORRECT IDs ---
submission_df = pd.DataFrame({
    'id': original_test_ids,
    'pi_lower': final_lower,
    'pi_upper': final_upper
})

submission_df.to_csv('submission_ultimate_synthesis_CORRECT_LOGIC.csv', index=False)
print("\n'submission_ultimate_synthesis_CORRECT_LOGIC.csv' created successfully!")
display(submission_df.head())

--- Creating Final Submission File with Correct Logic ---
Applying best multipliers: a=1.97, b=2.19

'submission_ultimate_synthesis_CORRECT_LOGIC.csv' created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,811591.846895,1055108.0
1,200001,555288.41041,799495.6
2,200002,465397.940078,677126.0
3,200003,299764.264575,430540.2
4,200004,314232.609277,799360.6
