In [1]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
import warnings
warnings.filterwarnings('ignore')

# --- Library Imports ---
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.cluster import KMeans
import lightgbm as lgb
import xgboost as xgb
import optuna

print("Libraries imported successfully.")

# --- Global Constants ---
# NOTE(review): N_SPLITS is not used in the cells visible here; presumably the
# cross-validation fold count for later blocks -- confirm.
N_SPLITS = 5
# Seed passed to KMeans in the feature-engineering block.
RANDOM_STATE = 42
# Directory prefix that is string-concatenated with the CSV file names below.
DATA_PATH = './'

# --- Load Raw Data ---
# Both files are indexed by their "id" column and have "sale_date" parsed to
# datetimes, which the feature-engineering block relies on (`.dt` accessors).
try:
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv', index_col="id", parse_dates=["sale_date"])
    df_test = pd.read_csv(DATA_PATH + 'test.csv', index_col="id", parse_dates=["sale_date"])
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, which discards all state and hides the real traceback.
    raise
else:
    print("Raw data loaded successfully.")

# --- Prepare Target Variable ---
# The model target is log1p(sale_price); keep it as a standalone Series ...
y_log = np.log1p(df_train['sale_price'])
# ... and also carry it on df_train (as 'sale_price_log') for feature
# engineering, replacing the raw 'sale_price' column. Done without in-place
# mutation so the statement produces a fresh frame.
df_train = df_train.assign(sale_price_log=y_log).drop(columns='sale_price')

print("Setup complete.")

Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [4]:
# =============================================================================
# BLOCK 2 (V4): GOD-TIER FEATURE ENGINEERING
# =============================================================================
# Progress banner: marks in the cell output where the feature-engineering block begins.
print("--- Starting Block 2 (V4): God-Tier Feature Engineering ---")

def create_god_tier_features(train_df, test_df):
    """
    Build the V4 feature matrices for the training and test sets.

    The two frames are stacked so every transformation is applied identically
    to both, engineered features are added, helper/raw columns are dropped,
    and the result is split back into train and test parts.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training rows. Must contain 'sale_price_log' plus every raw column
        referenced below (sale_date, year_built, year_reno, bath_*, sqft,
        sqft_fbsmt, sqft_lot, grade, latitude, longitude, city, submarket,
        sale_warning, imp_val, land_val). 'zipcode' is used only if present.
    test_df : pd.DataFrame
        Test rows with the same raw columns (no target column required).

    Returns
    -------
    (train_processed, test_processed) : tuple of pd.DataFrame
        Fully numeric feature frames with identical columns in identical
        order. Each keeps the row order and the original index of its input
        frame. The target column is not included.

    Notes
    -----
    * The input frames are NOT modified (the 'is_train' marker is added to
      copies), so the calling cell can be re-run safely.
    * NOTE(review): the '*sale_price_log_agg_mean_by_*' features and
      'market_trend' are computed from the target of the very rows they are
      attached to (in-sample target encoding). This leaks the label into the
      training features; consider out-of-fold encoding. Left as-is here to
      keep the produced feature values unchanged.
    """
    # 1. Combine for consistent processing. `assign` works on copies, so the
    #    caller's frames do not silently gain an 'is_train' column.
    all_data = pd.concat(
        [train_df.assign(is_train=1), test_df.assign(is_train=0)],
        axis=0,
    )
    # Park the original index (the ids) and work on a unique RangeIndex:
    # train and test ids may overlap, and the index-based join of the warning
    # dummies below needs unique labels. The ids are restored before the split.
    row_ids = all_data.index
    all_data = all_data.reset_index(drop=True)

    # 2. Foundational Feature Creation
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']
    all_data['total_bathrooms'] = all_data['bath_full'] + 0.5 * all_data['bath_half'] + 0.75 * all_data['bath_3qtr']
    all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    # Never-renovated homes fall back to their age at sale.
    all_data['time_since_reno'] = np.where(
        all_data['was_renovated'] == 1,
        all_data['sale_year'] - all_data['year_reno'],
        all_data['age_at_sale'],
    )

    # 3. ADVANCED FEATURE CREATION

    # --- A) Location Clusters ---
    # Clusters are fitted on train + test coordinates together.
    kmeans = KMeans(n_clusters=40, random_state=RANDOM_STATE, n_init='auto')
    all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])

    # --- B) Peer-Comparison & Target-Encoded Features ---
    # Group statistics are computed on training rows only, then attached to
    # every row (train and test) that shares the group key.
    train_rows = all_data[all_data['is_train'] == 1]
    group_cols_to_agg = ['location_cluster', 'city', 'submarket']
    if 'zipcode' in all_data.columns:
        group_cols_to_agg.append('zipcode')
    aggs = {
        'grade': ['mean', 'std'],
        'age_at_sale': ['mean', 'std'],
        'total_sqft': ['mean', 'std'],
        'sale_price_log': ['mean'],
    }
    for group_col in group_cols_to_agg:
        group_aggs = train_rows.groupby(group_col).agg(aggs)
        # Flatten the (column, statistic) MultiIndex into single names such as
        # 'grade_agg_mean_by_city'.
        group_aggs.columns = [f'{col}_agg_{stat}_by_{group_col}' for col, stat in group_aggs.columns]
        # join(on=...) is a left join against group_aggs' index that keeps
        # all_data's own index and row order (merge would replace the index).
        all_data = all_data.join(group_aggs, on=group_col)
        all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_agg_mean_by_{group_col}']
        all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_agg_mean_by_{group_col}']
        # 1e-6 guards against division by a zero standard deviation.
        all_data[f'age_zscore_{group_col}'] = (
            (all_data['age_at_sale'] - all_data[f'age_at_sale_agg_mean_by_{group_col}'])
            / (all_data[f'age_at_sale_agg_std_by_{group_col}'] + 1e-6)
        )

    # --- C) Time-based Trend Feature ---
    # Mean log price per sale year; test rows carry NaN targets, which the
    # mean skips, so the value effectively comes from the training rows.
    all_data['market_trend'] = all_data.groupby('sale_year')['sale_price_log'].transform('mean')

    # --- D) 'sale_warning' Handling ---
    # 'sale_warning' holds space-separated codes; one-hot them and keep the
    # 15 most frequent codes as 'warning_<code>' indicator columns.
    sale_warnings_dummies = all_data['sale_warning'].fillna('').str.get_dummies(sep=' ')
    top_warnings = sale_warnings_dummies.sum().sort_values(ascending=False).head(15).index
    sale_warnings_dummies = sale_warnings_dummies[top_warnings]
    sale_warnings_dummies.columns = [f'warning_{col}' for col in top_warnings]
    all_data = all_data.join(sale_warnings_dummies)

    # --- E) Deeper Interaction & Advanced Ratio Features ---
    all_data['imp_val_to_total_val'] = all_data['imp_val'] / (all_data['land_val'] + all_data['imp_val'] + 1e-6)
    all_data['land_val_per_sqft_lot'] = all_data['land_val'] / (all_data['sqft_lot'] + 1e-6)
    all_data['grade_x_sqft_vs_mean_cluster'] = all_data['grade'] * all_data['sqft_vs_mean_location_cluster']
    all_data['age_x_grade'] = all_data['age_at_sale'] * all_data['grade']
    all_data['age_at_sale_sq'] = all_data['age_at_sale']**2
    # Cyclical encoding of the month so December and January are neighbours.
    all_data['sin_sale_month'] = np.sin(2 * np.pi * all_data['sale_month']/12)
    all_data['cos_sale_month'] = np.cos(2 * np.pi * all_data['sale_month']/12)

    # --- F) God-Tier Interaction Features ---
    print("Creating god-tier interaction features...")
    cluster_price = all_data['sale_price_log_agg_mean_by_location_cluster']
    # Does being 'better than your neighbors' matter more in expensive neighborhoods?
    all_data['grade_vs_mean_x_location_price'] = all_data['grade_vs_mean_location_cluster'] * cluster_price
    # Interaction of age and renovation status with location value
    all_data['age_x_location_price'] = all_data['age_at_sale'] * cluster_price
    all_data['reno_x_location_price'] = all_data['was_renovated'] * cluster_price
    # How much of the lot is used?
    all_data['lot_utilization'] = all_data['total_sqft'] / (all_data['sqft_lot'] + 1e-6)

    # 4. Final Cleanup and Encoding
    print("Finalizing dataset...")
    # Integer-encode every remaining text column (missing values become -1).
    for col in all_data.select_dtypes(include='object').columns:
        all_data[col] = pd.Categorical(all_data[col]).codes

    cols_to_drop = [
        'sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_half',
        'bath_3qtr', 'sqft', 'sqft_fbsmt', 'latitude', 'longitude', 'sale_price_log'
    ]
    all_data = all_data.drop(columns=cols_to_drop).fillna(0)

    # Row order has been preserved throughout, so the parked ids line up
    # one-to-one with the rows again.
    all_data.index = row_ids

    # 5. Separate back into train and test
    train_processed = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
    test_processed = all_data[all_data['is_train'] == 0].drop(columns=['is_train'])

    # Make the column order of the test frame explicitly match the train frame.
    train_cols = train_processed.columns
    test_processed = test_processed[train_cols]

    return train_processed, test_processed

# Run the new feature engineering pipeline
X, X_test = create_god_tier_features(df_train, df_test)

# Sanity check: the feature matrix must still have one row per target value.
assert len(X) == len(y_log), "feature rows no longer match the target length"

print("God-tier feature engineering (V4) complete.")
print(f"Final training features shape: {X.shape}")

# Clean up memory: the raw frames are no longer needed once X / X_test exist.
# NOTE: after this, re-running this cell requires re-running Block 1 first.
del df_train, df_test
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output


--- Starting Block 2 (V4): God-Tier Feature Engineering ---


KeyError: 'grade_agg_mean_by_location_cluster'