In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 75 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
grade_for_stratify = df_train['grade'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [3]:
# =============================================================================
# BLOCK 3: TUNE XGBOOST MEAN MODEL (NATIVE API)
# =============================================================================
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

# --- 1. Prepare Data for Tuning ---
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X, y_true, test_size=0.2, random_state=RANDOM_STATE
)

# Convert data to DMatrix format for the native API
dtrain_opt = xgb.DMatrix(X_train_opt, label=y_train_opt)
dval_opt = xgb.DMatrix(X_val_opt, label=y_val_opt)

# --- 2. Define the Optuna Objective Function for XGBoost ---
def objective_xgboost(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'random_state': RANDOM_STATE, 'n_jobs': -1, 'tree_method': 'hist'
    }

    model = xgb.train(
        params=params,
        dtrain=dtrain_opt,
        num_boost_round=3000,
        evals=[(dval_opt, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    
    preds = model.predict(dval_opt, iteration_range=(0, model.best_iteration))
    rmse = np.sqrt(mean_squared_error(y_val_opt, preds))
    trial.set_user_attr("best_iteration", model.best_iteration)
    return rmse

# --- 3. Create and Run the Optuna Study ---
study_xgboost = optuna.create_study(direction='minimize')
print("--- Starting XGBoost Hyperparameter Tuning... ---")
study_xgboost.optimize(objective_xgboost, n_trials=N_OPTUNA_TRIALS)

# --- 4. Print and Store the Best Results ---
print("\n--- XGBoost Tuning Complete ---")
print(f"Best trial validation RMSE: ${study_xgboost.best_value:,.2f}")
print("Best hyperparameters found for XGBoost:")

best_params_xgboost = study_xgboost.best_params
best_params_xgboost['n_estimators'] = study_xgboost.best_trial.user_attrs['best_iteration']

for key, value in best_params_xgboost.items():
    print(f"  '{key}': {value},")

[I 2025-07-24 12:55:54,236] A new study created in memory with name: no-name-fcab0ed8-99b2-43ad-9fc6-16471ba7d77f


--- Starting XGBoost Hyperparameter Tuning... ---


[I 2025-07-24 12:56:41,062] Trial 0 finished with value: 100074.45292381068 and parameters: {'learning_rate': 0.08443487752486299, 'max_depth': 9, 'lambda': 3.340441544565785, 'alpha': 0.004250127879555757, 'subsample': 0.9879905530373351, 'colsample_bytree': 0.9975926226984286}. Best is trial 0 with value: 100074.45292381068.
[I 2025-07-24 12:57:32,012] Trial 1 finished with value: 98936.7604078484 and parameters: {'learning_rate': 0.061070515835173894, 'max_depth': 6, 'lambda': 0.008098523552529477, 'alpha': 0.0034115202023448886, 'subsample': 0.9617865579778011, 'colsample_bytree': 0.7724826615844702}. Best is trial 1 with value: 98936.7604078484.
[I 2025-07-24 12:58:44,709] Trial 2 finished with value: 98057.25804855039 and parameters: {'learning_rate': 0.04113171607826967, 'max_depth': 8, 'lambda': 0.010519921186851675, 'alpha': 1.7075068460265435, 'subsample': 0.8519866462663147, 'colsample_bytree': 0.7826855118657211}. Best is trial 2 with value: 98057.25804855039.
[I 2025-07-24


--- XGBoost Tuning Complete ---
Best trial validation RMSE: $97,326.88
Best hyperparameters found for XGBoost:
  'learning_rate': 0.03552717465641824,
  'max_depth': 7,
  'lambda': 3.7544787161366617,
  'alpha': 0.019811345139349127,
  'subsample': 0.8907068186135738,
  'colsample_bytree': 0.7265925353844132,
  'n_estimators': 2999,


In [4]:
# =============================================================================
# BLOCK 4: K-FOLD TRAINING & SAVING WITH OPTIMAL XGBOOST PARAMETERS
# =============================================================================
from sklearn.model_selection import StratifiedKFold
import gc

print("\n" + "="*80)
print("--- Step 4: K-Fold Cross-Validation with Optimal Hyperparameters ---")
print("="*80)

# Initialize arrays to store the predictions
oof_xgb_preds = np.zeros(len(X))
test_xgb_preds = np.zeros(len(X_test))

# Use StratifiedKFold for the final training, as per the winning strategy
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    print(f"\n--- Training Fold {fold+1}/{N_SPLITS} ---")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y_true.iloc[train_idx], y_true.iloc[val_idx]

    # Initialize and train the model for this fold using the best parameters
    # The number of estimators was already determined by early stopping during tuning
    model = xgb.XGBRegressor(**best_params_xgboost)
    model.fit(X_train, y_train_fold)
    
    # Generate predictions for the validation set (this fold's OOF part)
    oof_preds_fold = model.predict(X_val)
    oof_xgb_preds[val_idx] = np.clip(oof_preds_fold, 0, None)
    
    # Generate predictions for the test set and accumulate them
    test_xgb_preds += np.clip(model.predict(X_test), 0, None) / N_SPLITS
    
    fold_rmse = np.sqrt(mean_squared_error(y_val_fold, oof_preds_fold))
    print(f"  Fold {fold+1} Validation RMSE: ${fold_rmse:,.2f}")
    del model, X_train, X_val, y_train_fold, y_val_fold
    gc.collect()

# --- Final Evaluation and Saving ---
print("\n" + "="*80)
print("--- Step 5: Final Evaluation and Saving Predictions ---")
print("="*80)

final_oof_rmse = np.sqrt(mean_squared_error(y_true, oof_xgb_preds))
print(f"Final XGBoost OOF RMSE across all {N_SPLITS} folds: ${final_oof_rmse:,.2f}")




--- Step 4: K-Fold Cross-Validation with Optimal Hyperparameters ---

--- Training Fold 1/5 ---
  Fold 1 Validation RMSE: $97,274.25

--- Training Fold 2/5 ---
  Fold 2 Validation RMSE: $96,974.68

--- Training Fold 3/5 ---
  Fold 3 Validation RMSE: $97,263.04

--- Training Fold 4/5 ---
  Fold 4 Validation RMSE: $97,126.81

--- Training Fold 5/5 ---
  Fold 5 Validation RMSE: $96,735.00

--- Step 5: Final Evaluation and Saving Predictions ---
Final XGBoost OOF RMSE across all 5 folds: $97,074.97


In [5]:
# Define the save path and save the final prediction arrays
SAVE_PATH = './mean_models_v1/'
os.makedirs(SAVE_PATH, exist_ok=True)
np.save(os.path.join(SAVE_PATH, 'oof_xgb_preds.npy'), oof_xgb_preds)
np.save(os.path.join(SAVE_PATH, 'test_xgb_preds.npy'), test_xgb_preds)
print("\n'oof_xgb_preds.npy' and 'test_xgb_preds.npy' saved successfully.")


'oof_xgb_preds.npy' and 'test_xgb_preds.npy' saved successfully.
