In [4]:
import warnings
warnings.filterwarnings('ignore')
# --- Library Imports ---
# Core Libraries
import pandas as pd
import numpy as np
import gc # Garbage Collector for memory management

# Scikit-learn for modeling and preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler # For scaling data for the Neural Network

# Gradient Boosting Models
import lightgbm as lgb
import xgboost as xgb

# PyTorch for the Neural Network
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

print("All libraries imported successfully.")

# --- Global Constants ---
# This makes it easy to change settings for the entire notebook in one place.
N_SPLITS = 5
RANDOM_STATE = 42
COMPETITION_ALPHA = 0.1
DATA_PATH = './' # Assumes data is in the same directory. Change if needed.

# For PyTorch Model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 50
BATCH_SIZE = 512
LEARNING_RATE = 1e-3

print(f"Global constants defined. Using device: {DEVICE}")

# --- Load Raw Data ---
try:
    df_train_raw = pd.read_csv(DATA_PATH + 'dataset.csv', index_col="id", parse_dates=["sale_date"])
    df_test_raw = pd.read_csv(DATA_PATH + 'test.csv', index_col="id", parse_dates=["sale_date"])
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit() # Stop execution if data is not found

# --- Initial Data Prep ---
# Store the true target values and create the log-transformed version for training
y_true = df_train_raw['sale_price'].copy()
y_log = np.log1p(y_true)

# Drop the target from the training dataframe
df_train_raw.drop('sale_price', axis=1, inplace=True)

print("Setup complete. Raw data is ready for feature engineering.")

All libraries imported successfully.
Global constants defined. Using device: cuda
Raw data loaded successfully.
Setup complete. Raw data is ready for feature engineering.


In [5]:
# =============================================================================
# BLOCK 2 (V2): HYPER-AGGRESSIVE FEATURE ENGINEERING
# =============================================================================
print("--- Starting Block 2 (V2): Hyper-Aggressive Feature Engineering ---")

def create_advanced_features(train_df, test_df):
    """
    Creates a set of highly contextual and powerful features.
    """
    # 1. Combine for consistent processing
    train_df['is_train'] = 1
    test_df['is_train'] = 0
    # Add the target variable to the training data temporarily for group-by operations
    train_df['sale_price_log'] = y_log
    all_data = pd.concat([train_df, test_df], axis=0)

    # 2. Foundational Feature Creation (similar to before, but a bit cleaner)
    all_data['sale_year'] = all_data['sale_date'].dt.year
    all_data['sale_month'] = all_data['sale_date'].dt.month
    all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']
    all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)
    all_data['total_bathrooms'] = all_data['bath_full'] + 0.5 * all_data['bath_half'] + 0.75 * all_data['bath_3qtr']
    all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
    
    # 3. ADVANCED FEATURE CREATION
    
    # --- A) Location, Location, Location ---
    # Create precise location clusters
    kmeans = KMeans(n_clusters=30, random_state=RANDOM_STATE, n_init='auto') # More clusters for more precision
    all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])
    
    # --- B) Peer-Comparison Features (The Core of this Strategy) ---
    # Create features that compare a house to its local environment (by location_cluster, city, and submarket)
    for group_col in ['location_cluster', 'city', 'submarket']:
        # Group by the categorical column
        grouped = all_data.groupby(group_col)
        
        # Define aggregations
        aggs = {
            'grade': ['mean', 'std'],
            'age_at_sale': ['mean', 'std'],
            'total_sqft': ['mean', 'std'],
            'sale_price_log': ['mean'] # This is a form of Target Encoding
        }
        
        # Calculate aggregations
        group_aggs = grouped.agg(aggs)
        
        # Flatten the multi-index column names
        group_aggs.columns = [f'{col[0]}_agg_{col[1]}_by_{group_col}' for col in group_aggs.columns]
        
        # Merge these new features back to the main dataframe
        all_data = all_data.merge(group_aggs, on=group_col, how='left')

        # --- Create Relative (Difference & Ratio) Features ---
        all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_agg_mean_by_{group_col}']
        all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_agg_mean_by_{group_col}']
        # Z-score for age to see how many standard deviations away it is from the mean
        all_data[f'age_zscore_{group_col}'] = (all_data['age_at_sale'] - all_data[f'age_at_sale_agg_mean_by_{group_col}']) / (all_data[f'age_at_sale_agg_std_by_{group_col}'] + 1e-6)

    print("Peer-comparison features created.")
    
    # --- C) Advanced 'sale_warning' Handling ---
    sale_warnings_dummies = all_data['sale_warning'].fillna('').str.get_dummies(sep=' ')
    top_warnings = sale_warnings_dummies.sum().sort_values(ascending=False).head(15).index
    sale_warnings_dummies = sale_warnings_dummies[top_warnings]
    sale_warnings_dummies.columns = [f'warning_{col}' for col in top_warnings]
    all_data = all_data.join(sale_warnings_dummies)
    
    print("Advanced sale_warning features created.")

    # --- D) Time-based Trend Feature ---
    # This captures the general upward trend of prices over time.
    # We use the Target Encoded sale_price_log by year.
    all_data['market_trend'] = all_data.groupby('sale_year')['sale_price_log_agg_mean_by_city'].transform('mean')

    # 4. Final Cleanup and Encoding
    # Use simple categorical encoding for remaining object columns
    for col in all_data.select_dtypes(include='object').columns:
        all_data[col] = pd.Categorical(all_data[col]).codes
        
    # Drop original columns and helper columns
    cols_to_drop = [
        'sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_half',
        'bath_3qtr', 'sqft', 'sqft_fbsmt', 'latitude', 'longitude', 'sale_price_log' # Drop the helper target column
    ]
    all_data = all_data.drop(columns=cols_to_drop)

    # Fill any NaNs that resulted from aggregations (e.g., std dev of a single-item group is NaN)
    all_data.fillna(0, inplace=True) # Filling with 0 is safe for these features

    # 5. Separate back into train and test
    train_processed = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
    test_processed = all_data[all_data['is_train'] == 0].drop(columns=['is_train'])

    # Align columns to ensure consistency
    train_cols = train_processed.columns
    test_processed = test_processed[train_cols]
    
    return train_processed, test_processed

# Run the new feature engineering pipeline
X, X_test = create_advanced_features(df_train_raw, df_test_raw)

print("Hyper-aggressive feature engineering complete.")
print(f"Final training features shape: {X.shape}")
print(f"Final test features shape:     {X_test.shape}")

# --- Clean up memory ---
del df_train_raw, df_test_raw
gc.collect()

# Display a sample of the final processed data
display(X.head())

--- Starting Block 2 (V2): Hyper-Aggressive Feature Engineering ---
Peer-comparison features created.
Hyper-aggressive feature engineering complete.
Final training features shape: (200000, 88)
Final test features shape:     (200000, 88)


Unnamed: 0,sale_nbr,sale_warning,join_status,join_year,area,city,zoning,subdivision,present_use,land_val,...,warning_56,warning_29,warning_54,warning_3,warning_35,warning_4,warning_16,warning_34,warning_44,market_trend
0,2.0,0,3,2025,53,13,458,104,2,167000,...,0,0,0,0,0,0,0,0,0,13.102912
1,0.0,85,3,2025,74,19,443,10302,2,1184000,...,0,0,0,0,0,0,0,0,0,13.081228
2,1.0,0,3,2025,30,30,313,2847,2,230000,...,0,0,0,0,0,0,0,0,0,13.062894
3,1.0,0,3,2025,96,6,448,6838,2,190000,...,0,0,0,0,0,0,0,0,0,13.081228
4,2.0,0,1,2025,36,18,382,4133,2,616000,...,0,0,0,0,0,0,0,0,0,13.062685


In [6]:
# =============================================================================
# BLOCK 3: SMART FEATURE SELECTION
# =============================================================================
print("--- Starting Block 3: Smart Feature Selection ---")

# We will train a single, fast LightGBM model on the full training data
# to get a ranked list of feature importances.

# Define the model. We use simple parameters as we only care about feature ranking.
fs_model = lgb.LGBMRegressor(
    random_state=RANDOM_STATE,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
)

print("Training a temporary LightGBM model to find feature importances...")
# We use 'y_log' which is the log-transformed target we created in Block 1
fs_model.fit(X, y_log)

# Create a DataFrame of feature importances
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': fs_model.feature_importances_
}).sort_values('importance', ascending=False)

# --- Define a threshold for dropping features ---
# We will drop features that have zero importance. This is a safe and effective first step.
ZERO_IMPORTANCE_THRESHOLD = 0
useless_features = importances[importances['importance'] <= ZERO_IMPORTANCE_THRESHOLD]['feature'].tolist()

print(f"\nFound {len(useless_features)} features with zero importance.")
if len(useless_features) > 0:
    print("Useless features to be dropped:", useless_features)

# --- Drop the useless features from our datasets ---
X_selected = X.drop(columns=useless_features)
X_test_selected = X_test.drop(columns=useless_features)

# It's also good practice to align columns after dropping
X_test_selected = X_test_selected[X_selected.columns]

print(f"\nFeature selection complete.")
print(f"Original number of features: {X.shape[1]}")
print(f"Number of features after selection: {X_selected.shape[1]}")

# --- Clean up memory ---
del X, X_test, fs_model
gc.collect()

# Display the top 30 most important features for our review
print("\nTop 30 most important features:")
display(importances.head(30))

--- Starting Block 3: Smart Feature Selection ---
Training a temporary LightGBM model to find feature importances...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5669
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 88
[LightGBM] [Info] Start training from score 13.078327

Found 3 features with zero importance.

Feature selection complete.
Original number of features: 88
Number of features after selection: 85

Top 30 most important features:


Unnamed: 0,feature,importance
35,sale_year,1474
9,land_val,1207
10,imp_val,992
37,age_at_sale,730
2,join_status,609
4,area,599
11,sqft_lot,599
87,market_trend,486
40,total_sqft,402
6,zoning,396


In [7]:
# =============================================================================
# BLOCK 4: K-FOLD TRAINING - LIGHTGBM
# =============================================================================
print("--- Starting Block 4: K-Fold Training for LightGBM ---")

# Define K-Folds - we use StratifiedKFold on 'grade' to ensure balanced folds
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Placeholders for our predictions
oof_preds_lgbm = np.zeros((len(X_selected), 2)) # One col for lower, one for upper
test_preds_lgbm = np.zeros((len(X_test_selected), 2))

# Use the 'grade' column from the original data for stratification
# Note: X_selected does not have 'grade' if it was dropped, so we use the original X
# Let's ensure 'grade' is present in X_selected for stratification
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']

# LightGBM Parameters - A good starting point
params_lgbm = {
    'objective': 'quantile',
    'metric': 'quantile',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'learning_rate': 0.03,
    'n_estimators': 2000,
    'num_leaves': 40,
    'max_depth': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.7
}

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, grade_for_stratify)):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")
    
    # Split data for this fold
    X_train_fold, y_train_fold = X_selected.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X_selected.iloc[val_idx], y_log.iloc[val_idx]

    # --- Train Lower Quantile Model (alpha=0.05) ---
    print("Training LGBM lower quantile model...")
    params_lgbm['alpha'] = COMPETITION_ALPHA / 2
    model_lower = lgb.LGBMRegressor(**params_lgbm)
    model_lower.fit(X_train_fold, y_train_fold,
                    eval_set=[(X_val_fold, y_val_fold)],
                    callbacks=[lgb.early_stopping(100, verbose=False)])

    # --- Train Upper Quantile Model (alpha=0.95) ---
    print("Training LGBM upper quantile model...")
    params_lgbm['alpha'] = 1 - (COMPETITION_ALPHA / 2)
    model_upper = lgb.LGBMRegressor(**params_lgbm)
    model_upper.fit(X_train_fold, y_train_fold,
                    eval_set=[(X_val_fold, y_val_fold)],
                    callbacks=[lgb.early_stopping(100, verbose=False)])

    # --- Generate Predictions ---
    # Out-of-Fold (OOF) predictions for local validation
    oof_preds_lgbm[val_idx, 0] = model_lower.predict(X_val_fold)
    oof_preds_lgbm[val_idx, 1] = model_upper.predict(X_val_fold)
    
    # Test set predictions (averaged across folds)
    test_preds_lgbm[:, 0] += model_lower.predict(X_test_selected) / N_SPLITS
    test_preds_lgbm[:, 1] += model_upper.predict(X_test_selected) / N_SPLITS

print("\nLightGBM K-Fold training complete.")
print("OOF and Test predictions for LightGBM are now stored.")

--- Starting Block 4: K-Fold Training for LightGBM ---
===== FOLD 1/5 =====
Training LGBM lower quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5632
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 12.128117
Training LGBM upper quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5632
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 14.177406


===== FOLD 2/5 =====
Training LGBM lower quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5628
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 12.128117
Training LGBM upper quantile model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5628
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 14.169683




===== FOLD 3/5 =====
Training LGBM lower quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5628
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 12.128117
Training LGBM upper quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5628
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 14.176676




===== FOLD 4/5 =====
Training LGBM lower quantile model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5644
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 12.128117




Training LGBM upper quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5644
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 14.173185




===== FOLD 5/5 =====
Training LGBM lower quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5624
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 12.128117
Training LGBM upper quantile model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5624
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 85
[LightGBM] [Info] Start training from score 14.178583

LightGBM K-Fold training complete.
OOF and Test predictions for LightGBM are now stored.


In [9]:
# =============================================================================
# BLOCK 5: K-FOLD TRAINING - XGBOOST (FUNCTIONAL API FOR MAXIMUM COMPATIBILITY)
# =============================================================================
print("--- Starting Block 5: K-Fold Training for XGBoost (Functional API) ---")

# We use the same K-Fold split definition for consistency
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Placeholders for our predictions
oof_preds_xgb = np.zeros((len(X_selected), 2))
test_preds_xgb = np.zeros((len(X_test_selected), 2))

# XGBoost Parameters for the functional API
params_xgb_func = {
    'objective': 'reg:quantileerror',
    'eval_metric': 'rmse',
    'eta': 0.03,
    'max_depth': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'tree_method': 'hist',
}
NUM_BOOST_ROUND = 2000 # The equivalent of n_estimators

# Use the same 'grade' column for stratification as before
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']

# Convert the test data to DMatrix once, outside the loop
dtest = xgb.DMatrix(X_test_selected)

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, grade_for_stratify)):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")
    
    # Split data and convert to XGBoost's DMatrix format
    X_train_fold, y_train_fold = X_selected.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X_selected.iloc[val_idx], y_log.iloc[val_idx]
    
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)

    # --- Train Lower Quantile Model ---
    print("Training XGBoost lower quantile model...")
    params_lower = params_xgb_func.copy()
    params_lower['quantile_alpha'] = COMPETITION_ALPHA / 2
    
    model_lower = xgb.train(
        params=params_lower,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False 
    )

    # --- Train Upper Quantile Model ---
    print("Training XGBoost upper quantile model...")
    params_upper = params_xgb_func.copy()
    params_upper['quantile_alpha'] = 1 - (COMPETITION_ALPHA / 2)

    model_upper = xgb.train(
        params=params_upper,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # --- Generate Predictions ---
    # The model object from xgb.train has a .best_iteration attribute
    oof_preds_xgb[val_idx, 0] = model_lower.predict(dval, iteration_range=(0, model_lower.best_iteration))
    oof_preds_xgb[val_idx, 1] = model_upper.predict(dval, iteration_range=(0, model_upper.best_iteration))
    
    # Test predictions
    test_preds_xgb[:, 0] += model_lower.predict(dtest, iteration_range=(0, model_lower.best_iteration)) / N_SPLITS
    test_preds_xgb[:, 1] += model_upper.predict(dtest, iteration_range=(0, model_upper.best_iteration)) / N_SPLITS

print("\nXGBoost K-Fold training complete.")
print("OOF and Test predictions for XGBoost are now stored.")

--- Starting Block 5: K-Fold Training for XGBoost (Functional API) ---
===== FOLD 1/5 =====
Training XGBoost lower quantile model...
Training XGBoost upper quantile model...
===== FOLD 2/5 =====
Training XGBoost lower quantile model...
Training XGBoost upper quantile model...
===== FOLD 3/5 =====
Training XGBoost lower quantile model...
Training XGBoost upper quantile model...
===== FOLD 4/5 =====
Training XGBoost lower quantile model...
Training XGBoost upper quantile model...
===== FOLD 5/5 =====
Training XGBoost lower quantile model...
Training XGBoost upper quantile model...

XGBoost K-Fold training complete.
OOF and Test predictions for XGBoost are now stored.


In [10]:
# =============================================================================
# BLOCK 6: K-FOLD TRAINING - PYTORCH NEURAL NETWORK
# =============================================================================
print("--- Starting Block 6: K-Fold Training for PyTorch NN ---")

# --- Define the Neural Network Architecture ---
# A simple but effective Multi-Layer Perceptron (MLP)
class QuantileMLP(nn.Module):
    def __init__(self, input_size, d_hidden=128):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, d_hidden),
            nn.ReLU(),
            nn.BatchNorm1d(d_hidden),
            nn.Dropout(0.3),
            nn.Linear(d_hidden, d_hidden // 2),
            nn.ReLU(),
            nn.BatchNorm1d(d_hidden // 2),
            nn.Dropout(0.3),
            nn.Linear(d_hidden // 2, 1) # Output is a single value (the quantile)
        )
    def forward(self, x):
        return self.network(x)

# --- Define the Pinball Loss function for quantile regression ---
def pinball_loss(y_true, y_pred, alpha):
    error = y_true - y_pred
    return torch.mean(torch.max(alpha * error, (alpha - 1) * error))


# --- Data Scaling is CRITICAL for Neural Networks ---
# We fit the scaler ONLY on the full training data to avoid data leakage
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
X_test_scaled = scaler.transform(X_test_selected)


# --- Setup for K-Fold training ---
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_preds_nn = np.zeros((len(X_selected), 2))
test_preds_nn = np.zeros((len(X_test_selected), 2))
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']


# --- K-Fold Loop ---
for fold, (train_idx, val_idx) in enumerate(skf.split(X_scaled, grade_for_stratify)):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")

    # Split scaled data
    X_train_fold, y_train_fold = X_scaled[train_idx], y_log.iloc[train_idx].values
    X_val_fold, y_val_fold = X_scaled[val_idx], y_log.iloc[val_idx].values

    # Create PyTorch Datasets and DataLoaders
    train_dataset = TensorDataset(torch.FloatTensor(X_train_fold), torch.FloatTensor(y_train_fold).unsqueeze(1))
    val_dataset = TensorDataset(torch.FloatTensor(X_val_fold), torch.FloatTensor(y_val_fold).unsqueeze(1))
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    # --- Train a model for each quantile ---
    for i, alpha in enumerate([COMPETITION_ALPHA / 2, 1 - (COMPETITION_ALPHA / 2)]):
        print(f"Training NN for alpha = {alpha:.2f}...")
        
        # Initialize model, optimizer, and move to device
        model = QuantileMLP(input_size=X_scaled.shape[1]).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        
        best_val_loss = float('inf')
        patience_counter = 0
        
        # Training loop for epochs
        for epoch in range(EPOCHS):
            model.train()
            for x_batch, y_batch in train_loader:
                x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
                
                optimizer.zero_grad()
                preds = model(x_batch)
                loss = pinball_loss(y_batch, preds, alpha)
                loss.backward()
                optimizer.step()
                
            # Validation loop for early stopping
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for x_batch, y_batch in val_loader:
                    x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
                    preds = model(x_batch)
                    val_loss += pinball_loss(y_batch, preds, alpha).item()
            
            val_loss /= len(val_loader)

            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), f'best_model_fold{fold}_alpha{i}.pth')
                patience_counter = 0
            else:
                patience_counter += 1
            
            if patience_counter >= 5: # Stop if val loss doesn't improve for 5 epochs
                # print(f"Early stopping at epoch {epoch+1}")
                break

        # --- Generate Predictions with the best model ---
        best_model = QuantileMLP(input_size=X_scaled.shape[1]).to(DEVICE)
        best_model.load_state_dict(torch.load(f'best_model_fold{fold}_alpha{i}.pth'))
        best_model.eval()
        
        # Predict on validation and test sets
        with torch.no_grad():
            # OOF
            val_preds_tensor = best_model(torch.FloatTensor(X_val_fold).to(DEVICE))
            oof_preds_nn[val_idx, i] = val_preds_tensor.cpu().numpy().flatten()
            
            # Test
            test_preds_tensor = best_model(torch.FloatTensor(X_test_scaled).to(DEVICE))
            test_preds_nn[:, i] += test_preds_tensor.cpu().numpy().flatten() / N_SPLITS
            
print("\nPyTorch NN K-Fold training complete.")
print("OOF and Test predictions for the Neural Network are now stored.")

--- Starting Block 6: K-Fold Training for PyTorch NN ---
===== FOLD 1/5 =====
Training NN for alpha = 0.05...
Training NN for alpha = 0.95...
===== FOLD 2/5 =====
Training NN for alpha = 0.05...
Training NN for alpha = 0.95...
===== FOLD 3/5 =====
Training NN for alpha = 0.05...
Training NN for alpha = 0.95...
===== FOLD 4/5 =====
Training NN for alpha = 0.05...
Training NN for alpha = 0.95...
===== FOLD 5/5 =====
Training NN for alpha = 0.05...
Training NN for alpha = 0.95...

PyTorch NN K-Fold training complete.
OOF and Test predictions for the Neural Network are now stored.


In [11]:
# =============================================================================
# BLOCK 7: ENSEMBLING, CALIBRATION, AND SUBMISSION
# =============================================================================
print("--- Starting Block 7: Final Ensembling and Submission ---")

# --- Define the Winkler Score function ---
def winkler_score(y_true, lower, upper, alpha, return_coverage=False):
    width = upper - lower
    penalty_lower = (2 / alpha) * (lower - y_true) * (y_true < lower)
    penalty_upper = (2 / alpha) * (y_true - upper) * (y_true > upper)
    score = width + penalty_lower + penalty_upper
    
    if return_coverage:
        inside = (y_true >= lower) & (y_true <= upper)
        coverage = np.mean(inside)
        return np.mean(score), coverage
    return np.mean(score)

# --- Step 1: Create a Simple Averaging Ensemble ---
# We average the log-scale predictions from all three models.
oof_preds_ensemble_log = (oof_preds_lgbm + oof_preds_xgb + oof_preds_nn) / 3
test_preds_ensemble_log = (test_preds_lgbm + test_preds_xgb + test_preds_nn) / 3

print("Created simple average ensemble of LGBM, XGBoost, and NN.")

# --- Step 2: Evaluate the Ensemble and Calibrate ---
# Inverse transform OOF predictions to the original dollar scale
oof_lower_ensemble = np.expm1(oof_preds_ensemble_log[:, 0])
oof_upper_ensemble = np.expm1(oof_preds_ensemble_log[:, 1])

# Ensure lower bound is never higher than the upper bound
oof_upper_ensemble = np.maximum(oof_lower_ensemble, oof_upper_ensemble)

# Calculate the initial Winkler score and coverage of our ensemble
winkler_oof, coverage_oof = winkler_score(
    y_true.values, # Use the original y_true we stored
    oof_lower_ensemble, 
    oof_upper_ensemble,
    alpha=COMPETITION_ALPHA,
    return_coverage=True
)

print("\n--- Ensemble OOF Performance (Before Calibration) ---")
print(f"Ensemble OOF Winkler Score: {winkler_oof:,.2f}")
print(f"Ensemble OOF Coverage:      {coverage_oof:.2%}")

# --- Interval Calibration ---
print("\nSearching for best calibration factor to target 90% coverage...")
center = (oof_lower_ensemble + oof_upper_ensemble) / 2
width = oof_upper_ensemble - oof_lower_ensemble
best_factor = 1.0
# Start with a large difference
best_coverage_diff = 1.0 

for factor in np.arange(0.90, 1.20, 0.001): # Use a finer step for more precision
    new_lower = center - (width / 2) * factor
    new_upper = center + (width / 2) * factor
    _, coverage = winkler_score(y_true.values, new_lower, new_upper, COMPETITION_ALPHA, return_coverage=True)
    
    if abs(coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(coverage - 0.90)
        best_factor = factor

print(f"Found best calibration factor: {best_factor:.3f}")

# --- Step 3: Create Final Submission File ---
print("\nCreating final submission file...")

# Inverse transform the test set predictions
test_lower_ensemble = np.expm1(test_preds_ensemble_log[:, 0])
test_upper_ensemble = np.expm1(test_preds_ensemble_log[:, 1])

# Apply the learned calibration factor
print(f"Applying calibration factor ({best_factor:.3f}) to test predictions...")
test_center = (test_lower_ensemble + test_upper_ensemble) / 2
test_width = test_upper_ensemble - test_lower_ensemble
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (test_width / 2) * best_factor

# Final checks: ensure lower <= upper and clip to plausible price ranges
min_price = 1000 # A reasonable minimum house price
max_price = y_true.max() * 1.5 # A reasonable maximum
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)
calibrated_lower = np.clip(calibrated_lower, min_price, None)
calibrated_upper = np.clip(calibrated_upper, min_price, None)


# Create submission dataframe
submission_df = pd.DataFrame({
    'id': X_test_selected.index,
    'pi_lower': calibrated_lower,
    'pi_upper': calibrated_upper
})

submission_df.to_csv('submission_ensemble_calibrated.csv', index=False)
print("\n'submission_ensemble_calibrated.csv' file created successfully!")
display(submission_df.head())

--- Starting Block 7: Final Ensembling and Submission ---
Created simple average ensemble of LGBM, XGBoost, and NN.

--- Ensemble OOF Performance (Before Calibration) ---
Ensemble OOF Winkler Score: 569,123.77
Ensemble OOF Coverage:      98.97%

Searching for best calibration factor to target 90% coverage...
Found best calibration factor: 0.900

Creating final submission file...
Applying calibration factor (0.900) to test predictions...

'submission_ensemble_calibrated.csv' file created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,685505.778758,1469127.0
1,200001,466489.827622,1019705.0
2,200002,388270.290643,941107.3
3,200003,278676.572982,591186.3
4,200004,347944.135938,846787.5
