In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')

# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.cluster import KMeans
import lightgbm as lgb
import xgboost as xgb
import optuna

print("Libraries imported successfully.")

# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'

# --- Load Raw Data ---
try:
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv', index_col="id", parse_dates=["sale_date"])
    df_test = pd.read_csv(DATA_PATH + 'test.csv', index_col="id", parse_dates=["sale_date"])
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()

# --- Prepare Target Variable ---
y_log = np.log1p(df_train['sale_price'])
# Add target to df_train for feature engineering, then drop sale_price
df_train['sale_price_log'] = y_log
df_train.drop('sale_price', axis=1, inplace=True)

print("Setup complete.")

Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [5]:
# =============================================================================
# BLOCK 2 (V4): GOD-TIER FEATURE ENGINEERING
# =============================================================================
print("--- Starting Block 2 (V4): God-Tier Feature Engineering ---")

def create_god_tier_features(train_df, test_df):
    """
    Builds upon the successful V3 feature set with even more advanced interactions.
    """
    # 1. Combine for consistent processing
    train_df['is_train'] = 1
    test_df['is_train'] = 0
    all_data = pd.concat([train_df, test_df], axis=0)

    # 2. Foundational Feature Creation (Keep all the successful features from before)
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']
    all_data['total_bathrooms'] = all_data['bath_full'] + 0.5 * all_data['bath_half'] + 0.75 * all_data['bath_3qtr']
    all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['time_since_reno'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], all_data['age_at_sale'])
    
    # 3. ADVANCED FEATURE CREATION (Keep all successful features)

    # --- A) Location Clusters ---
    kmeans = KMeans(n_clusters=40, random_state=RANDOM_STATE, n_init='auto') # Tuned: More clusters
    all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])
    
    # --- B) Peer-Comparison & Target-Encoded Features ---
    train_copy_for_aggs = all_data[all_data['is_train'] == 1].copy()
    group_cols_to_agg = ['location_cluster', 'city', 'submarket']
    if 'zipcode' in all_data.columns: group_cols_to_agg.append('zipcode')
    for group_col in group_cols_to_agg:
        aggs = {'grade': ['mean', 'std'], 'age_at_sale': ['mean', 'std'], 'total_sqft': ['mean', 'std'], 'sale_price_log': ['mean']}
        group_aggs = train_copy_for_aggs.groupby(group_col).agg(aggs)
        group_aggs.columns = [f'{c[0]}_agg_{c[1]}_by_{group_col}' for c in group_aggs.columns]
        all_data = all_data.merge(group_aggs, on=group_col, how='left')
        all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_agg_mean_by_{group_col}']
        all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_agg_mean_by_{group_col}']
        all_data[f'age_zscore_{group_col}'] = (all_data['age_at_sale'] - all_data[f'age_at_sale_agg_mean_by_{group_col}']) / (all_data[f'age_at_sale_agg_std_by_{group_col}'] + 1e-6)

    # --- C) Time-based Trend Feature ---
    all_data['market_trend'] = all_data.groupby('sale_year')['sale_price_log'].transform('mean')
    
    # --- D) 'sale_warning' Handling ---
    # (Keeping this the same as it was effective)
    sale_warnings_dummies = all_data['sale_warning'].fillna('').str.get_dummies(sep=' ')
    top_warnings = sale_warnings_dummies.sum().sort_values(ascending=False).head(15).index
    sale_warnings_dummies = sale_warnings_dummies[top_warnings]
    sale_warnings_dummies.columns = [f'warning_{col}' for col in top_warnings]
    all_data = all_data.join(sale_warnings_dummies)

    # --- E) Deeper Interaction & Advanced Ratio Features ---
    all_data['imp_val_to_total_val'] = all_data['imp_val'] / (all_data['land_val'] + all_data['imp_val'] + 1e-6)
    all_data['land_val_per_sqft_lot'] = all_data['land_val'] / (all_data['sqft_lot'] + 1e-6)
    all_data['grade_x_sqft_vs_mean_cluster'] = all_data['grade'] * all_data['sqft_vs_mean_location_cluster']
    all_data['age_x_grade'] = all_data['age_at_sale'] * all_data['grade']
    all_data['age_at_sale_sq'] = all_data['age_at_sale']**2
    all_data['sin_sale_month'] = np.sin(2 * np.pi * all_data['sale_month']/12)
    all_data['cos_sale_month'] = np.cos(2 * np.pi * all_data['sale_month']/12)

    # --- F) NEW: God-Tier Interaction Features ---
    print("Creating god-tier interaction features...")
    # Does being 'better than your neighbors' matter more in expensive neighborhoods?
    all_data['grade_vs_mean_x_location_price'] = all_data['grade_vs_mean_location_cluster'] * all_data['sale_price_log_agg_mean_by_location_cluster']
    # Interaction of age and renovation status with location value
    all_data['age_x_location_price'] = all_data['age_at_sale'] * all_data['sale_price_log_agg_mean_by_location_cluster']
    all_data['reno_x_location_price'] = all_data['was_renovated'] * all_data['sale_price_log_agg_mean_by_location_cluster']
    # How much of the lot is used?
    all_data['lot_utilization'] = all_data['total_sqft'] / (all_data['sqft_lot'] + 1e-6)

    # 4. Final Cleanup and Encoding
    print("Finalizing dataset...")
    for col in all_data.select_dtypes(include='object').columns:
        all_data[col] = pd.Categorical(all_data[col]).codes
        
    cols_to_drop = [
        'sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_half',
        'bath_3qtr', 'sqft', 'sqft_fbsmt', 'latitude', 'longitude', 'sale_price_log'
    ]
    all_data = all_data.drop(columns=cols_to_drop)
    all_data.fillna(0, inplace=True)

    # 5. Separate back into train and test
    train_processed = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
    test_processed = all_data[all_data['is_train'] == 0].drop(columns=['is_train'])
    
    train_cols = train_processed.columns
    test_processed = test_processed[train_cols]
    
    return train_processed, test_processed

# Run the new feature engineering pipeline
X, X_test = create_god_tier_features(df_train, df_test)

print("Hyper-aggressive feature engineering (V3) complete.")
print(f"Final training features shape: {X.shape}")

# Clean up memory
del df_train, df_test
gc.collect()


--- Starting Block 2 (V4): God-Tier Feature Engineering ---
Creating god-tier interaction features...
Finalizing dataset...
Hyper-aggressive feature engineering (V3) complete.
Final training features shape: (200000, 100)


34

In [6]:
# =============================================================================
# BLOCK 3: SMART FEATURE SELECTION
# =============================================================================
print("--- Starting Block 3: Smart Feature Selection ---")

# We will train a single, fast LightGBM model on the full training data
# to get a ranked list of feature importances.

# Define the model. We use simple parameters as we only care about feature ranking.
fs_model = lgb.LGBMRegressor(
    random_state=RANDOM_STATE,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
)

print("Training a temporary LightGBM model to find feature importances...")
# We use 'y_log' which was defined in the first cell
fs_model.fit(X, y_log)

# Create a DataFrame of feature importances
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': fs_model.feature_importances_
}).sort_values('importance', ascending=False)

# --- Define a threshold for dropping features ---
# We will drop features that have zero importance. This is a safe and effective first step.
ZERO_IMPORTANCE_THRESHOLD = 0
useless_features = importances[importances['importance'] <= ZERO_IMPORTANCE_THRESHOLD]['feature'].tolist()

print(f"\nFound {len(useless_features)} features with zero importance.")
if len(useless_features) > 0:
    print("Useless features to be dropped:", useless_features)

# --- Drop the useless features from our datasets ---
X_selected = X.drop(columns=useless_features)
X_test_selected = X_test.drop(columns=useless_features)

# It's also good practice to align columns after dropping
X_test_selected = X_test_selected[X_selected.columns]

print(f"\nFeature selection complete.")
print(f"Original number of features: {X.shape[1]}")
print(f"Number of features after selection: {X_selected.shape[1]}")

# --- Clean up memory ---
del X, X_test, fs_model
gc.collect()

# Display the top 30 most important features for our review
print("\nTop 30 most important features:")
display(importances.head(30))

--- Starting Block 3: Smart Feature Selection ---
Training a temporary LightGBM model to find feature importances...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7877
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 100
[LightGBM] [Info] Start training from score 13.078327

Found 2 features with zero importance.

Feature selection complete.
Original number of features: 100
Number of features after selection: 98

Top 30 most important features:


Unnamed: 0,feature,importance
35,sale_year,1583
9,land_val,1065
10,imp_val,874
4,area,523
2,join_status,484
11,sqft_lot,455
39,total_sqft,346
6,zoning,338
12,sqft_1,335
41,time_since_reno,320


In [8]:
# =============================================================================
# BLOCK 4 (V3-FINAL): AGGRESSIVE OPTUNA TUNING & K-FOLD TRAINING
# =============================================================================
print("--- Starting Block 4: Aggressive Tuning and Final XGBoost Training ---")

# --- Helper Function defined within this block's scope ---
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    return np.mean(score)

# --- 1. Optuna Objective Function ---
def objective(trial):
    train_x, val_x, train_y, val_y = train_test_split(X_selected, y_log, test_size=0.2, random_state=RANDOM_STATE)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dval = xgb.DMatrix(val_x, label=val_y)
    
    params = {
        'objective': 'reg:quantileerror', 'eval_metric': 'rmse', 'tree_method': 'hist',
        'eta': trial.suggest_float('eta', 0.008, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-8, 5.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 5.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 5.0, log=True),
    }

    preds_lower, preds_upper = np.zeros(len(val_y)), np.zeros(len(val_y))
    
    # Lower model
    params_lower = params.copy()
    params_lower['quantile_alpha'] = 0.05
    model_lower = xgb.train(params_lower, dtrain, num_boost_round=1500,
                            evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=False)
    preds_lower = model_lower.predict(dval, iteration_range=(0, model_lower.best_iteration))

    # Upper model
    params_upper = params.copy()
    params_upper['quantile_alpha'] = 0.95
    model_upper = xgb.train(params_upper, dtrain, num_boost_round=1500,
                            evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=False)
    preds_upper = model_upper.predict(dval, iteration_range=(0, model_upper.best_iteration))
    
    true_vals = np.expm1(val_y)
    lower_vals = np.expm1(preds_lower)
    upper_vals = np.expm1(preds_upper)
    upper_vals = np.maximum(lower_vals, upper_vals)
    
    # Now the objective function can see winkler_score
    return winkler_score(true_vals, lower_vals, upper_vals)

# --- 2. Run the Aggressive Optuna Study ---
print("Starting AGGRESSIVE Optuna hyperparameter search (50 trials)...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, n_jobs=1) 
best_params = study.best_params
print("Optuna search complete. Best parameters found.")
print(best_params)

# --- 3. Final K-Fold Training with Best Parameters ---
print("\nStarting final K-Fold training with optimized parameters...")
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_preds = np.zeros((len(X_selected), 2))
test_preds = np.zeros((len(X_test_selected), 2))
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']
dtest = xgb.DMatrix(X_test_selected)

final_xgb_params = {
    'objective': 'reg:quantileerror', 'eval_metric': 'rmse', 'tree_method': 'hist',
    'random_state': RANDOM_STATE, 'n_jobs': -1, **best_params
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, grade_for_stratify)):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")
    X_train_fold, y_train_fold = X_selected.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X_selected.iloc[val_idx], y_log.iloc[val_idx]
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    
    for i, alpha in enumerate([0.05, 0.95]):
        print(f"Training alpha={alpha}...")
        current_params = final_xgb_params.copy()
        current_params['quantile_alpha'] = alpha
        model = xgb.train(
            params=current_params, dtrain=dtrain, num_boost_round=2000,
            evals=[(dval, 'validation')], early_stopping_rounds=100, verbose_eval=False
        )
        oof_preds[val_idx, i] = model.predict(dval, iteration_range=(0, model.best_iteration))
        test_preds[:, i] += model.predict(dtest, iteration_range=(0, model.best_iteration)) / N_SPLITS

print("\nFinal K-Fold training complete.")

# --- 4. Final Calibration and Submission ---
print("\nCalibrating and creating final submission file...")
oof_lower = np.expm1(oof_preds[:, 0])
oof_upper = np.expm1(oof_preds[:, 1])
oof_upper = np.maximum(oof_lower, oof_upper)
score, coverage = winkler_score(y_true.values, oof_lower, oof_upper, alpha=COMPETITION_ALPHA, return_coverage=True)
print(f"Final OOF Winkler Score (before calib): {score:,.2f} | Coverage: {coverage:.2%}")

best_factor = 1.0
best_coverage_diff = abs(coverage - 0.90)
for factor in np.arange(0.9, 1.2, 0.001):
    center = (oof_lower + oof_upper) / 2
    width = oof_upper - oof_lower
    _, current_coverage = winkler_score(y_true.values, center - (width / 2) * factor, center + (width / 2) * factor, alpha=COMPETITION_ALPHA, return_coverage=True)
    if abs(current_coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(current_coverage - 0.90)
        best_factor = factor
print(f"Best calibration factor found: {best_factor:.3f}")

test_lower = np.expm1(test_preds[:, 0])
test_upper = np.expm1(test_preds[:, 1])
test_center = (test_lower + test_upper) / 2
test_width = test_upper - test_lower
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (width / 2) * best_factor
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)

submission_df = pd.DataFrame({
    'id': X_test_selected.index,
    'pi_lower': calibrated_lower,
    'pi_upper': calibrated_upper
})
submission_df.to_csv('submission_god_tier.csv', index=False)
print("\n'submission_god_tier.csv' created successfully!")
display(submission_df.head())

[I 2025-07-05 15:28:13,928] A new study created in memory with name: no-name-fafd74a3-b430-45aa-9c76-6fedec407da1


--- Starting Block 4: Aggressive Tuning and Final XGBoost Training ---
Starting AGGRESSIVE Optuna hyperparameter search (50 trials)...


[I 2025-07-05 15:29:14,638] Trial 0 finished with value: 366112.99016230466 and parameters: {'eta': 0.04426402247976999, 'max_depth': 9, 'subsample': 0.8314579891443503, 'colsample_bytree': 0.5549591570016461, 'min_child_weight': 11, 'gamma': 5.580926678261224e-06, 'lambda': 3.619454111588163e-05, 'alpha': 0.0003817032591822216}. Best is trial 0 with value: 366112.99016230466.
[I 2025-07-05 15:29:54,404] Trial 1 finished with value: 368237.65300371096 and parameters: {'eta': 0.008319384613803637, 'max_depth': 7, 'subsample': 0.5778812975263543, 'colsample_bytree': 0.6319768821167984, 'min_child_weight': 8, 'gamma': 0.641140482357029, 'lambda': 0.004980868856917993, 'alpha': 8.271982630359596e-08}. Best is trial 0 with value: 366112.99016230466.
[I 2025-07-05 15:30:49,027] Trial 2 finished with value: 338425.602528125 and parameters: {'eta': 0.0511811421569362, 'max_depth': 5, 'subsample': 0.829349425517965, 'colsample_bytree': 0.9267797958750108, 'min_child_weight': 15, 'gamma': 2.7708

Optuna search complete. Best parameters found.
{'eta': 0.04902562075576043, 'max_depth': 5, 'subsample': 0.7164661796431884, 'colsample_bytree': 0.5202536986175267, 'min_child_weight': 17, 'gamma': 7.633184418883976e-07, 'lambda': 4.964839984645649e-08, 'alpha': 0.0014344253987733778}

Starting final K-Fold training with optimized parameters...
===== FOLD 1/5 =====
Training alpha=0.05...
Training alpha=0.95...
===== FOLD 2/5 =====
Training alpha=0.05...
Training alpha=0.95...
===== FOLD 3/5 =====
Training alpha=0.05...
Training alpha=0.95...
===== FOLD 4/5 =====
Training alpha=0.05...
Training alpha=0.95...
===== FOLD 5/5 =====
Training alpha=0.05...
Training alpha=0.95...

Final K-Fold training complete.

Calibrating and creating final submission file...


NameError: name 'y_true' is not defined

In [9]:
# =============================================================================
# FINAL BLOCK: CALIBRATION AND SUBMISSION (STANDALONE)
# =============================================================================
print("--- Starting Final Calibration and Submission ---")

# --- 1. Redefine Helper Function and Reload Necessary Data ---

# Define the Correct Winkler Score function
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    
    if return_coverage:
        inside = (y_true >= lower) & (y_true <= upper)
        coverage = np.mean(inside)
        return np.mean(score), coverage
        
    return np.mean(score)

# Reload the true target values, just in case
y_true = pd.read_csv('./dataset.csv')['sale_price']
print("True target values reloaded.")


# --- 2. Evaluate the Out-of-Fold (OOF) Predictions ---
# The oof_preds and test_preds arrays should still be in memory from the last run.
oof_lower = np.expm1(oof_preds[:, 0])
oof_upper = np.expm1(oof_preds[:, 1])
oof_upper = np.maximum(oof_lower, oof_upper)

# Calculate the final validation score and coverage
final_score, final_coverage = winkler_score(y_true.values, oof_lower, oof_upper, alpha=0.1, return_coverage=True)
print(f"\nFinal OOF Winkler Score (before calibration): {final_score:,.2f}")
print(f"Final OOF Coverage (before calibration):    {final_coverage:.2%}")


# --- 3. Find the Best Calibration Factor ---
print("\nSearching for best calibration factor to target 90% coverage...")
best_factor = 1.0
best_coverage_diff = abs(final_coverage - 0.90)

for factor in np.arange(0.9, 1.2, 0.001):
    center = (oof_lower + oof_upper) / 2
    width = oof_upper - oof_lower
    
    new_lower = center - (width / 2) * factor
    new_upper = center + (width / 2) * factor
    
    _, current_coverage = winkler_score(y_true.values, new_lower, new_upper, alpha=0.1, return_coverage=True)
    
    if abs(current_coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(current_coverage - 0.90)
        best_factor = factor

print(f"Best calibration factor found: {best_factor:.3f}")


# --- 4. Create Final Submission File ---
print("\nCreating final submission file...")
# Inverse transform the test set predictions
test_lower = np.expm1(test_preds[:, 0])
test_upper = np.expm1(test_preds[:, 1])

# Apply the learned calibration factor
print(f"Applying calibration factor ({best_factor:.3f}) to test predictions...")
test_center = (test_lower + test_upper) / 2
test_width = test_upper - test_lower
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (test_width / 2) * best_factor

# Final safety checks
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)
# Re-load X_test_selected's index just in case
X_test_index = pd.read_csv('./test.csv', index_col="id").index


# Create submission dataframe
submission_df = pd.DataFrame({
    'id': X_test_index,
    'pi_lower': calibrated_lower,
    'pi_upper': calibrated_upper
})

submission_df.to_csv('submission_god_tier_final.csv', index=False)
print("\n'submission_god_tier_final.csv' created successfully!")
display(submission_df.head())

--- Starting Final Calibration and Submission ---
True target values reloaded.

Final OOF Winkler Score (before calibration): 334,758.51
Final OOF Coverage (before calibration):    86.70%

Searching for best calibration factor to target 90% coverage...
Best calibration factor found: 1.102

Creating final submission file...
Applying calibration factor (1.102) to test predictions...

'submission_god_tier_final.csv' created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,786732.700299,1099599.0
1,200001,530422.909941,801095.4
2,200002,450283.454019,703026.6
3,200003,292081.892586,450115.9
4,200004,356081.5163,737649.5
