In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')
import time
import os
# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import optuna
print("Libraries imported successfully.")
# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)
# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
N_OPTUNA_TRIALS = 75 # A strong number for a comprehensive search
COMPETITION_ALPHA = 0.1

# --- Load Raw Data ---
try:
    # We drop the low-variance columns they identified right away
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm','view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()
# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
# The mean-error model works best when predicting the raw price directly
# So, we will NOT log-transform the target this time.
# df_train.drop('sale_price', axis=1, inplace=True) # We keep sale_price for FE
print("Setup complete.")


Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [2]:
# Make sure to have these libraries installed
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gc

# Define a random state for reproducibility
RANDOM_STATE = 42

def create_comprehensive_features(df_train, df_test):
    """
    Combines original and new advanced feature engineering steps into a single pipeline.
    """
    print("--- Starting Comprehensive Feature Engineering ---")

    # Store original indices and target variable
    train_ids = df_train.index
    test_ids = df_test.index
    y_train = df_train['sale_price'].copy() # Keep the target separate

    # Combine for consistent processing
    df_train_temp = df_train.drop(columns=['sale_price'])
    all_data = pd.concat([df_train_temp, df_test], axis=0, ignore_index=True)

    # --- Original Feature Engineering ---

    # A) Brute-Force Numerical Interactions
    print("Step 1: Creating brute-force numerical interaction features...")
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    # Ensure all columns exist and are numeric, fill missing with 0 for safety
    for col in NUMS:
        if col not in all_data.columns:
            all_data[col] = 0
        else:
            all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)
            
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    print("Step 2: Creating date features...")
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['sale_dayofyear'] = all_data['sale_date'].dt.dayofyear
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']

    # C) TF-IDF Text Features
    print("Step 3: Creating TF-IDF features for text columns...")
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform some interaction features
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # --- New Feature Engineering Ideas ---

    # F) Group-By Aggregation Features
    print("Step 4: Creating group-by aggregation features...")
    group_cols = ['submarket', 'city', 'zoning']
    num_cols_for_agg = ['grade', 'sqft', 'imp_val', 'land_val', 'age_at_sale']

    for group_col in group_cols:
        for num_col in num_cols_for_agg:
            agg_stats = all_data.groupby(group_col)[num_col].agg(['mean', 'std', 'max', 'min']).reset_index()
            agg_stats.columns = [group_col] + [f'{group_col}_{num_col}_{stat}' for stat in ['mean', 'std', 'max', 'min']]
            all_data = pd.merge(all_data, agg_stats, on=group_col, how='left')
            all_data[f'{num_col}_minus_{group_col}_mean'] = all_data[num_col] - all_data[f'{group_col}_{num_col}_mean']

    # G) Ratio Features
    print("Step 5: Creating ratio features...")
    # Add a small epsilon to prevent division by zero
    epsilon = 1e-6 
    all_data['total_val'] = all_data['imp_val'] + all_data['land_val']
    all_data['imp_val_to_land_val_ratio'] = all_data['imp_val'] / (all_data['land_val'] + epsilon)
    all_data['land_val_ratio'] = all_data['land_val'] / (all_data['total_val'] + epsilon)
    all_data['sqft_to_lot_ratio'] = all_data['sqft'] / (all_data['sqft_lot'] + epsilon)
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['reno_age_at_sale'] = np.where(all_data['was_renovated'] == 1, all_data['sale_year'] - all_data['year_reno'], -1)

    # H) Geospatial Clustering Features
    print("Step 6: Creating geospatial clustering features...")
    coords = all_data[['latitude', 'longitude']].copy()
    coords.fillna(coords.median(), inplace=True) # Simple imputation

    # KMeans is sensitive to feature scaling, but for lat/lon it's often okay without it.
    kmeans = KMeans(n_clusters=20, random_state=RANDOM_STATE, n_init=10) 
    all_data['location_cluster'] = kmeans.fit_predict(coords)
    
    # Calculate distance to each cluster center
    cluster_centers = kmeans.cluster_centers_
    for i in range(len(cluster_centers)):
        center = cluster_centers[i]
        all_data[f'dist_to_cluster_{i}'] = np.sqrt((coords['latitude'] - center[0])**2 + (coords['longitude'] - center[1])**2)

    # --- Final Cleanup ---
    print("Step 7: Finalizing feature set...")
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)

    # One-hot encode the new cluster feature
    all_data = pd.get_dummies(all_data, columns=['location_cluster'], prefix='loc_cluster')
    
    # Final check for any remaining object columns to be safe (besides index)
    object_cols = all_data.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found unexpected object columns: {object_cols}. Dropping them.")
        all_data = all_data.drop(columns=object_cols)
        
    all_data.fillna(0, inplace=True)

    # Separate back into train and test sets
    train_len = len(train_ids)
    X = all_data.iloc[:train_len].copy()
    X_test = all_data.iloc[train_len:].copy()
    
    # Restore original indices
    X.index = train_ids
    X_test.index = test_ids
    
    # Align columns - crucial for model prediction
    X_test = X_test[X.columns]
    
    print(f"\nComprehensive FE complete. Total features: {X.shape[1]}")
    gc.collect()
    
    return X, X_test, y_train
# =============================================================================
# BLOCK 2.5: EXECUTE FEATURE ENGINEERING
# =============================================================================
print("\n--- Starting Block 2.5: Executing Feature Engineering Pipeline ---")

# This is the crucial step that was missing.
# We call the function to create our training and testing dataframes.
X, X_test, y_train = create_comprehensive_features(df_train, df_test)

# Let's verify the output
print(f"Feature engineering complete. X shape: {X.shape}, X_test shape: {X_test.shape}")
gc.collect()


--- Starting Block 2.5: Executing Feature Engineering Pipeline ---
--- Starting Comprehensive Feature Engineering ---
Step 1: Creating brute-force numerical interaction features...
Step 2: Creating date features...
Step 3: Creating TF-IDF features for text columns...
Step 4: Creating group-by aggregation features...
Step 5: Creating ratio features...
Step 6: Creating geospatial clustering features...
Step 7: Finalizing feature set...

Comprehensive FE complete. Total features: 233
Feature engineering complete. X shape: (200000, 233), X_test shape: (200000, 233)


0

In [3]:
# =============================================================================
# BLOCK 3: TUNE CATBOOST MEAN MODEL
# =============================================================================
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

# --- 1. Prepare Data for Tuning ---
# We create a single, smaller train/validation split from the full dataset
# This makes the tuning process much faster than using K-Folds for every trial.
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X, y_true, test_size=0.2, random_state=RANDOM_STATE
)

# --- 2. Define the Optuna Objective Function for CatBoost ---
def objective_catboost(trial):
    """
    This function takes an Optuna 'trial' object and does the following:
    1. Defines a search space for CatBoost's hyperparameters.
    2. Trains a CatBoost model with a set of hyperparameters suggested by the trial.
    3. Evaluates the model on the validation set.
    4. Returns the validation score (RMSE), which Optuna tries to minimize.
    """
    # Define the hyperparameter search space
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        
        # Fixed parameters
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'random_seed': RANDOM_STATE,
        'verbose': 0,  # Suppress verbose output during training
        'early_stopping_rounds': 100
    }

    # Initialize and train the CatBoost model with the suggested parameters
    model = cb.CatBoostRegressor(**params)
    model.fit(
        X_train_opt, y_train_opt,
        eval_set=[(X_val_opt, y_val_opt)],
        use_best_model=True
    )

    # Make predictions on the validation set
    preds = model.predict(X_val_opt)

    # Calculate and return the Root Mean Squared Error
    rmse = np.sqrt(mean_squared_error(y_val_opt, preds))
    return rmse

# --- 3. Create and Run the Optuna Study ---
# Create a study object and specify the direction is 'minimize' (for RMSE)
study_catboost = optuna.create_study(direction='minimize')

# Start the optimization process. 
# n_trials can be increased for a more thorough search, but this is a good start.
print("--- Starting CatBoost Hyperparameter Tuning... ---")
study_catboost.optimize(objective_catboost, n_trials=N_OPTUNA_TRIALS)

# --- 4. Print the Best Results ---
print("\n--- CatBoost Tuning Complete ---")
print(f"Best trial validation RMSE: ${study_catboost.best_value:,.2f}")
print("Best hyperparameters found for CatBoost:")
for key, value in study_catboost.best_params.items():
    print(f"  '{key}': {value},")

# Store the best parameters in a dictionary for later use in the K-Fold training loop
best_params_catboost = study_catboost.best_params

[I 2025-07-24 11:05:58,495] A new study created in memory with name: no-name-c3a86e07-b618-489e-adca-74dd0b8be857


--- Starting CatBoost Hyperparameter Tuning... ---


[I 2025-07-24 11:06:43,007] Trial 0 finished with value: 98132.47034990958 and parameters: {'iterations': 1724, 'learning_rate': 0.08553734379319525, 'depth': 9, 'l2_leaf_reg': 0.0034611339608567834, 'subsample': 0.824226327997697, 'random_strength': 0.03361501975246283, 'bagging_temperature': 0.9162153901615222}. Best is trial 0 with value: 98132.47034990958.
[I 2025-07-24 11:07:14,527] Trial 1 finished with value: 103050.68366818536 and parameters: {'iterations': 2121, 'learning_rate': 0.016116870504803364, 'depth': 8, 'l2_leaf_reg': 5.016282029850647, 'subsample': 0.775705794558661, 'random_strength': 0.13366380736653607, 'bagging_temperature': 0.14718337062265796}. Best is trial 0 with value: 98132.47034990958.
[I 2025-07-24 11:07:46,305] Trial 2 finished with value: 99325.02461933375 and parameters: {'iterations': 1238, 'learning_rate': 0.04133952385334066, 'depth': 9, 'l2_leaf_reg': 0.17663790221349804, 'subsample': 0.8876050508601209, 'random_strength': 0.003949268356284431, 'ba


--- CatBoost Tuning Complete ---
Best trial validation RMSE: $96,387.87
Best hyperparameters found for CatBoost:
  'iterations': 2959,
  'learning_rate': 0.0604931337615656,
  'depth': 9,
  'l2_leaf_reg': 0.1195942543993236,
  'subsample': 0.7215144912978994,
  'random_strength': 0.0011789533818766123,
  'bagging_temperature': 0.8080844703507192,


In [4]:
# =============================================================================
# BLOCK 4: K-FOLD TRAINING OF CATBOOST MEAN MODEL
# =============================================================================
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

print("\n--- STAGE 1, PART 2: K-Fold Training of CatBoost Mean Model ---")
print("# Using the optimal hyperparameters found by Optuna.")

# --- 1. Setup K-Fold and Prediction Arrays ---
# Use StratifiedKFold on 'grade' to ensure consistent splits across models
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']

# Initialize arrays to store the out-of-fold (OOF) and test set predictions
oof_catboost_preds = np.zeros(len(X))
test_catboost_preds = np.zeros(len(X_test))

# --- 2. Combine Tuned Params with Fixed Params ---
# The best_params_catboost dictionary should be in memory from the previous cell
# We add the fixed parameters needed for CatBoost training here
final_params_catboost = {
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': RANDOM_STATE,
    'verbose': 0,  # Suppress in-loop verbosity for cleaner output
    'early_stopping_rounds': 100,
    **best_params_catboost # Unpack the tuned hyperparameters
}

# --- 3. K-Fold Training Loop ---
for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    print(f"  Training CatBoost Mean Model - Fold {fold+1}/{N_SPLITS}...")
    
    # Split the data for the current fold
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_true.iloc[train_idx], y_true.iloc[val_idx]

    # Initialize and train the CatBoost model for this fold
    model = cb.CatBoostRegressor(**final_params_catboost)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        use_best_model=True
    )

    # Generate and store predictions
    # OOF predictions are made on the validation set for this fold
    oof_catboost_preds[val_idx] = model.predict(X_val)
    # Test predictions are averaged across all 5 fold models
    test_catboost_preds += model.predict(X_test) / N_SPLITS
    
    gc.collect()

# --- 4. Final Metrics and Comparison ---
print("\n--- CatBoost K-Fold Training Complete & Performance Metrics ---")

# Calculate the final RMSE score across all OOF predictions
final_catboost_oof_rmse = np.sqrt(mean_squared_error(y_true, oof_catboost_preds))
print(f"CatBoost Final OOF RMSE: ${final_catboost_oof_rmse:,.2f}")

# For direct comparison, here is the score from your best XGBoost model
# This value is taken from the logs of the winner_v1_301 notebook
reference_xgb_rmse = 98990.27
print(f"Reference XGBoost RMSE (from winner_v1_301): ${reference_xgb_rmse:,.2f}")

# Provide a clear conclusion
if final_catboost_oof_rmse < reference_xgb_rmse:
    print("\nCONCLUSION: SUCCESS! The tuned CatBoost model is MORE accurate than the previous XGBoost model.")
else:
    print("\nCONCLUSION: The tuned CatBoost model is LESS accurate than the previous XGBoost model.")


--- STAGE 1, PART 2: K-Fold Training of CatBoost Mean Model ---
# Using the optimal hyperparameters found by Optuna.
  Training CatBoost Mean Model - Fold 1/5...
  Training CatBoost Mean Model - Fold 2/5...
  Training CatBoost Mean Model - Fold 3/5...
  Training CatBoost Mean Model - Fold 4/5...
  Training CatBoost Mean Model - Fold 5/5...

--- CatBoost K-Fold Training Complete & Performance Metrics ---
CatBoost Final OOF RMSE: $96,371.94
Reference XGBoost RMSE (from winner_v1_301): $98,990.27

CONCLUSION: SUCCESS! The tuned CatBoost model is MORE accurate than the previous XGBoost model.


In [5]:
# Define the save path and save the final prediction arrays
SAVE_PATH = './mean_models_v1/'
os.makedirs(SAVE_PATH, exist_ok=True)
np.save(os.path.join(SAVE_PATH, 'oof_cb_preds.npy'), oof_catboost_preds)
np.save(os.path.join(SAVE_PATH, 'test_cb_preds.npy'), test_catboost_preds)
print("\n'oof_cb_preds.npy' and 'test_cb_preds.npy' saved successfully.")


'oof_cb_preds.npy' and 'test_cb_preds.npy' saved successfully.
