In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
import warnings

# Configure settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
print("\n--- Step 1: Loading Data ---")
df_train = pd.read_csv("dataset.csv", index_col="id", parse_dates=["sale_date"])
df_test = pd.read_csv("test.csv", index_col="id", parse_dates=["sale_date"])

# Combine for easier processing
df_train['is_train'] = 1
df_test['is_train'] = 0
all_data = pd.concat([df_train, df_test], axis=0)
print("Data loaded and combined.")


--- Step 1: Loading Data ---
Data loaded and combined.


In [3]:
# =============================================================================
# STEP 2 (UPGRADED): HYPER-AGGRESSIVE FEATURE ENGINEERING
# =============================================================================
print("\n--- Step 2 (Upgraded): Hyper-Aggressive Feature Engineering ---")

# We need the target variable temporarily for some features, so let's add it
df_train['sale_price_log'] = np.log1p(df_train['sale_price'])
all_data = pd.concat([df_train, df_test], axis=0)

# --- A) Foundational Features ---
all_data['sale_year'] = all_data['sale_date'].dt.year
all_data['sale_month'] = all_data['sale_date'].dt.month
all_data['age_at_sale'] = all_data['sale_year'] - all_data['year_built']
all_data['total_bathrooms'] = all_data['bath_full'] + 0.5 * all_data['bath_half'] + 0.75 * all_data['bath_3qtr']
all_data['total_sqft'] = all_data['sqft'] + all_data['sqft_fbsmt']
all_data['was_renovated'] = (all_data['year_reno'] > 0).astype(int)

# --- B) Location Clusters ---
kmeans = KMeans(n_clusters=30, random_state=42, n_init='auto')
all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])

# --- C) Peer-Comparison & Target-Encoded Features ---
print("Creating rich peer-comparison and target-encoded features...")
for group_col in ['location_cluster', 'city', 'submarket', 'zipcode']: # Assuming zipcode exists
    if group_col not in all_data.columns: continue # Skip if column doesn't exist
        
    # Define aggregations to calculate
    aggs = {
        'grade': ['mean', 'std'],
        'age_at_sale': ['mean', 'std'],
        'total_sqft': ['mean', 'std'],
        'sale_price_log': ['mean'] # This is our Target Encoding
    }
    
    group_aggs = all_data.groupby(group_col).agg(aggs)
    group_aggs.columns = [f'{col[0]}_agg_{col[1]}_by_{group_col}' for col in group_aggs.columns]
    all_data = all_data.merge(group_aggs, on=group_col, how='left')

    # Create relative difference/ratio features
    all_data[f'grade_vs_mean_{group_col}'] = all_data['grade'] - all_data[f'grade_agg_mean_by_{group_col}']
    all_data[f'sqft_vs_mean_{group_col}'] = all_data['total_sqft'] - all_data[f'total_sqft_agg_mean_by_{group_col}']
    all_data[f'age_zscore_{group_col}'] = (all_data['age_at_sale'] - all_data[f'age_at_sale_agg_mean_by_{group_col}']) / (all_data[f'age_at_sale_agg_std_by_{group_col}'] + 1e-6)

# --- D) Cleanup and Final Processing ---
# Drop original columns that have been replaced or are no longer needed
cols_to_drop = [
    'sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_half', 'bath_3qtr',
    'sqft', 'sqft_fbsmt', 'latitude', 'longitude', 'sale_price', 'sale_price_log'
]
all_data = all_data.drop(columns=cols_to_drop)

# Impute missing values and encode remaining categoricals
all_data['sale_nbr'].fillna(all_data['sale_nbr'].median(), inplace=True)
for col in all_data.select_dtypes(include='object').columns:
    all_data[col] = pd.Categorical(all_data[col]).codes

# Fill any NaNs that resulted from aggregations (e.g., std dev of a single-item group)
all_data.fillna(0, inplace=True)

print("Aggressive feature engineering complete.")


--- Step 2 (Upgraded): Hyper-Aggressive Feature Engineering ---
Creating rich peer-comparison and target-encoded features...
Aggressive feature engineering complete.


In [4]:
# =============================================================================
# STEP 2.5 (NEW): SMART FEATURE SELECTION
# =============================================================================
print("\n--- Step 2.5 (New): Smart Feature Selection ---")

# Separate the data back into a temporary training set to run feature selection
temp_X = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
temp_y_log = y_log # y_log was defined in the first cell

# We need a quick model to rank features. LightGBM is perfect for this.
import lightgbm as lgb

fs_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1)
print("Training a temporary model to rank feature importances...")
fs_model.fit(temp_X, temp_y_log)

# Create a DataFrame of feature importances
importances = pd.DataFrame({
    'feature': temp_X.columns,
    'importance': fs_model.feature_importances_
}).sort_values('importance', ascending=False)

# --- Define which features to drop ---
# We will drop features with zero importance.
useless_features = importances[importances['importance'] == 0]['feature'].tolist()
print(f"\nFound {len(useless_features)} features with zero importance to drop.")
if useless_features:
    print(useless_features)

# --- Drop the useless features from the main 'all_data' dataframe ---
all_data.drop(columns=useless_features, inplace=True)

print(f"\nFeature selection complete. Proceeding with {all_data.shape[1]} features.")
print("\nTop 20 most important features:")
display(importances.head(20))

# Clean up memory
del temp_X, temp_y_log, fs_model, importances
gc.collect()


--- Step 2.5 (New): Smart Feature Selection ---


NameError: name 'y_log' is not defined

In [5]:
print("\n--- Step 3: Finalizing Data for Modeling ---")

# Separate back into train and test sets
train_processed = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
test_processed = all_data[all_data['is_train'] == 0].drop(columns=['is_train', 'sale_price'])

X = train_processed.drop(columns=['sale_price'])
y_log = np.log1p(train_processed['sale_price'])
X_test = test_processed

# Align columns - crucial for safety
X_test = X_test[X.columns]

print(f"Final training features shape: {X.shape}")
print(f"Final test features shape:     {X_test.shape}")

# Competition variables
COMPETITION_ALPHA = 0.1
LOWER_QUANTILE = COMPETITION_ALPHA / 2
UPPER_QUANTILE = 1 - (COMPETITION_ALPHA / 2)


--- Step 3: Finalizing Data for Modeling ---
Final training features shape: (200000, 49)
Final test features shape:     (200000, 49)


In [11]:
# =============================================================================
#  4. K-FOLD MODEL TRAINING WITH XGBOOST (FUNCTIONAL API for old versions)
# =============================================================================
print("\n--- Step 4: K-Fold Cross-Validation with XGBoost ---")

# Define K-Folds
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Placeholders
oof_preds_lower = np.zeros(len(X))
oof_preds_upper = np.zeros(len(X))
test_preds_lower = np.zeros(len(X_test))
test_preds_upper = np.zeros(len(X_test))

# XGBoost Parameters
# For the functional API, we pass parameters as a dictionary
params_xgb_func = {
    'objective': 'reg:quantileerror',
    'eval_metric': 'rmse',
    'eta': 0.03,
    'max_depth': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
}
NUM_BOOST_ROUND = 2000 # The equivalent of n_estimators

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, X['grade'])):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")
    
    # Split data and convert to XGBoost's DMatrix format
    X_train_fold, y_train_fold = X.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y_log.iloc[val_idx]
    
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    dtest = xgb.DMatrix(X_test) # DMatrix for the full test set

    # --- Train Lower Quantile Model ---
    print("Training lower quantile model...")
    params_lower = params_xgb_func.copy()
    params_lower['quantile_alpha'] = LOWER_QUANTILE
    
    # xgb.train is the functional API
    model_lower = xgb.train(
        params=params_lower,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False # Suppress per-round output
    )

    # --- Train Upper Quantile Model ---
    print("Training upper quantile model...")
    params_upper = params_xgb_func.copy()
    params_upper['quantile_alpha'] = UPPER_QUANTILE

    model_upper = xgb.train(
        params=params_upper,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # --- Generate Predictions ---
    # The model object from xgb.train has a best_iteration attribute
    oof_preds_lower[val_idx] = model_lower.predict(dval, iteration_range=(0, model_lower.best_iteration))
    oof_preds_upper[val_idx] = model_upper.predict(dval, iteration_range=(0, model_upper.best_iteration))
    
    # Test predictions
    test_preds_lower += model_lower.predict(dtest, iteration_range=(0, model_lower.best_iteration)) / N_SPLITS
    test_preds_upper += model_upper.predict(dtest, iteration_range=(0, model_upper.best_iteration)) / N_SPLITS

print("\nK-Fold training complete.")


--- Step 4: K-Fold Cross-Validation with XGBoost ---
===== FOLD 1/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 2/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 3/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 4/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 5/5 =====
Training lower quantile model...
Training upper quantile model...

K-Fold training complete.


In [12]:
# =============================================================================
#  5. VALIDATION AND CALIBRATION
# =============================================================================
print("\n--- Step 5: Evaluating OOF Predictions and Calibrating ---")

# Define Winkler Score function
def winkler_score(y_true, lower, upper, alpha, return_coverage=False):
    width = upper - lower
    penalty_lower = (2 / alpha) * (lower - y_true) * (y_true < lower)
    penalty_upper = (2 / alpha) * (y_true - upper) * (y_true > upper)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)

# Inverse transform OOF predictions and true values
oof_lower_final = np.expm1(oof_preds_lower)
oof_upper_final = np.expm1(oof_preds_upper)
y_true_final = np.expm1(y_log)

# Ensure lower <= upper
oof_upper_final = np.maximum(oof_lower_final, oof_upper_final)

# Calculate initial OOF score
winkler_oof, coverage_oof = winkler_score(y_true_final, oof_lower_final, oof_upper_final, COMPETITION_ALPHA, return_coverage=True)
print(f"Initial OOF Winkler Score: {winkler_oof:,.2f}")
print(f"Initial OOF Coverage:      {coverage_oof:.2%}")

# --- Interval Calibration ---
# We aim for 90% coverage. Let's find the factor needed.
center = (oof_lower_final + oof_upper_final) / 2
width = oof_upper_final - oof_lower_final
best_factor = 1.0
best_coverage_diff = abs(coverage_oof - 0.90)

print("Searching for best calibration factor...")
for factor in np.arange(0.9, 1.2, 0.005):
    new_lower = center - (width / 2) * factor
    new_upper = center + (width / 2) * factor
    _, coverage = winkler_score(y_true_final, new_lower, new_upper, COMPETITION_ALPHA, return_coverage=True)
    if abs(coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(coverage - 0.90)
        best_factor = factor

print(f"Found best calibration factor: {best_factor:.3f}")


--- Step 5: Evaluating OOF Predictions and Calibrating ---
Initial OOF Winkler Score: 338,438.31
Initial OOF Coverage:      86.12%
Searching for best calibration factor...
Found best calibration factor: 1.120


In [13]:
# =============================================================================
#  6. CREATE FINAL SUBMISSION
# =============================================================================
print("\n--- Step 6: Creating Final Submission File ---")

# Inverse transform test predictions
test_lower_final = np.expm1(test_preds_lower)
test_upper_final = np.expm1(test_preds_upper)

# Apply the calibration factor found on our OOF predictions
print("Applying calibration to test predictions...")
test_center = (test_lower_final + test_upper_final) / 2
test_width = test_upper_final - test_lower_final
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (test_width / 2) * best_factor

# Ensure lower <= upper again after calibration
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)

# Create submission file
submission_df = pd.DataFrame({
    'id': X_test.index,
    'pi_lower': calibrated_lower,
    'pi_upper': calibrated_upper
})

submission_df.to_csv('submission_xgb_kfold.csv', index=False)
print("\n'submission_xgb_kfold.csv' file created successfully!")
display(submission_df.head())


--- Step 6: Creating Final Submission File ---
Applying calibration to test predictions...

'submission_xgb_kfold.csv' file created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,821980.037049,1141128.0
1,200001,554608.99243,814166.5
2,200002,424258.159977,659157.3
3,200003,293653.460705,436013.1
4,200004,378902.369918,657014.0
