In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
import warnings

# Configure settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
print("\n--- Step 1: Loading Data ---")
df_train = pd.read_csv("dataset.csv", index_col="id", parse_dates=["sale_date"])
df_test = pd.read_csv("test.csv", index_col="id", parse_dates=["sale_date"])

# Combine for easier processing
df_train['is_train'] = 1
df_test['is_train'] = 0
all_data = pd.concat([df_train, df_test], axis=0)
print("Data loaded and combined.")


--- Step 1: Loading Data ---
Data loaded and combined.


In [4]:
print("\n--- Step 2: Feature Engineering ---")

def feature_engineer(df):
    df_processed = df.copy()
    
    # Date-Based Features
    df_processed['sale_year'] = df_processed['sale_date'].dt.year
    df_processed['sale_month'] = df_processed['sale_date'].dt.month
    df_processed['age_at_sale'] = df_processed['sale_year'] - df_processed['year_built']
    df_processed['time_since_reno'] = np.where(df_processed['year_reno'] > 0, df_processed['sale_year'] - df_processed['year_reno'], df_processed['age_at_sale'])
    
    # Aggregation & Combination
    df_processed['total_bathrooms'] = df_processed['bath_full'] + 0.75 * df_processed['bath_3qtr'] + 0.5 * df_processed['bath_half']
    df_processed['total_sqft'] = df_processed['sqft'] + df_processed['sqft_fbsmt'] + df_processed['gara_sqft']
    
    # Ratio Features
    df_processed['imp_to_land_ratio'] = df_processed['imp_val'] / (df_processed['land_val'] + 1)
    df_processed['sqft_per_room'] = df_processed['sqft'] / (df_processed['beds'] + df_processed['total_bathrooms'] + 1)
    
    # Polynomial and Interaction Features
    df_processed['grade_sq'] = df_processed['grade']**2
    df_processed['grade_x_sqft'] = df_processed['grade'] * df_processed['total_sqft']
    
    return df_processed

all_data = feature_engineer(all_data)
print("Base feature engineering complete.")

# Location Clusters (fitted on all data, as it's unsupervised)
kmeans = KMeans(n_clusters=20, random_state=42, n_init='auto')
all_data['location_cluster'] = kmeans.fit_predict(all_data[['latitude', 'longitude']])
print("Location clusters created.")

# Cleanup
cols_to_drop = ['sale_date', 'year_built', 'year_reno', 'bath_full', 'bath_3qtr', 'bath_half', 'subdivision']
all_data = all_data.drop(columns=cols_to_drop)

# Imputation & Encoding
all_data['sale_nbr'].fillna(all_data['sale_nbr'].median(), inplace=True)
all_data['submarket'].fillna('missing', inplace=True)

categorical_cols = all_data.select_dtypes(include='object').columns.tolist()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
all_data[categorical_cols] = encoder.fit_transform(all_data[categorical_cols])
print("Imputation and encoding complete.")



--- Step 2: Feature Engineering ---
Base feature engineering complete.
Location clusters created.
Imputation and encoding complete.


In [5]:
print("\n--- Step 3: Finalizing Data for Modeling ---")

# Separate back into train and test sets
train_processed = all_data[all_data['is_train'] == 1].drop(columns=['is_train'])
test_processed = all_data[all_data['is_train'] == 0].drop(columns=['is_train', 'sale_price'])

X = train_processed.drop(columns=['sale_price'])
y_log = np.log1p(train_processed['sale_price'])
X_test = test_processed

# Align columns - crucial for safety
X_test = X_test[X.columns]

print(f"Final training features shape: {X.shape}")
print(f"Final test features shape:     {X_test.shape}")

# Competition variables
COMPETITION_ALPHA = 0.1
LOWER_QUANTILE = COMPETITION_ALPHA / 2
UPPER_QUANTILE = 1 - (COMPETITION_ALPHA / 2)


--- Step 3: Finalizing Data for Modeling ---
Final training features shape: (200000, 49)
Final test features shape:     (200000, 49)


In [11]:
# =============================================================================
#  4. K-FOLD MODEL TRAINING WITH XGBOOST (FUNCTIONAL API for old versions)
# =============================================================================
print("\n--- Step 4: K-Fold Cross-Validation with XGBoost ---")

# Define K-Folds
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Placeholders
oof_preds_lower = np.zeros(len(X))
oof_preds_upper = np.zeros(len(X))
test_preds_lower = np.zeros(len(X_test))
test_preds_upper = np.zeros(len(X_test))

# XGBoost Parameters
# For the functional API, we pass parameters as a dictionary
params_xgb_func = {
    'objective': 'reg:quantileerror',
    'eval_metric': 'rmse',
    'eta': 0.03,
    'max_depth': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
}
NUM_BOOST_ROUND = 2000 # The equivalent of n_estimators

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, X['grade'])):
    print(f"===== FOLD {fold+1}/{N_SPLITS} =====")
    
    # Split data and convert to XGBoost's DMatrix format
    X_train_fold, y_train_fold = X.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y_log.iloc[val_idx]
    
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    dtest = xgb.DMatrix(X_test) # DMatrix for the full test set

    # --- Train Lower Quantile Model ---
    print("Training lower quantile model...")
    params_lower = params_xgb_func.copy()
    params_lower['quantile_alpha'] = LOWER_QUANTILE
    
    # xgb.train is the functional API
    model_lower = xgb.train(
        params=params_lower,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False # Suppress per-round output
    )

    # --- Train Upper Quantile Model ---
    print("Training upper quantile model...")
    params_upper = params_xgb_func.copy()
    params_upper['quantile_alpha'] = UPPER_QUANTILE

    model_upper = xgb.train(
        params=params_upper,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # --- Generate Predictions ---
    # The model object from xgb.train has a best_iteration attribute
    oof_preds_lower[val_idx] = model_lower.predict(dval, iteration_range=(0, model_lower.best_iteration))
    oof_preds_upper[val_idx] = model_upper.predict(dval, iteration_range=(0, model_upper.best_iteration))
    
    # Test predictions
    test_preds_lower += model_lower.predict(dtest, iteration_range=(0, model_lower.best_iteration)) / N_SPLITS
    test_preds_upper += model_upper.predict(dtest, iteration_range=(0, model_upper.best_iteration)) / N_SPLITS

print("\nK-Fold training complete.")


--- Step 4: K-Fold Cross-Validation with XGBoost ---
===== FOLD 1/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 2/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 3/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 4/5 =====
Training lower quantile model...
Training upper quantile model...
===== FOLD 5/5 =====
Training lower quantile model...
Training upper quantile model...

K-Fold training complete.


In [12]:
# =============================================================================
#  5. VALIDATION AND CALIBRATION
# =============================================================================
print("\n--- Step 5: Evaluating OOF Predictions and Calibrating ---")

# Define Winkler Score function
def winkler_score(y_true, lower, upper, alpha, return_coverage=False):
    width = upper - lower
    penalty_lower = (2 / alpha) * (lower - y_true) * (y_true < lower)
    penalty_upper = (2 / alpha) * (y_true - upper) * (y_true > upper)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)

# Inverse transform OOF predictions and true values
oof_lower_final = np.expm1(oof_preds_lower)
oof_upper_final = np.expm1(oof_preds_upper)
y_true_final = np.expm1(y_log)

# Ensure lower <= upper
oof_upper_final = np.maximum(oof_lower_final, oof_upper_final)

# Calculate initial OOF score
winkler_oof, coverage_oof = winkler_score(y_true_final, oof_lower_final, oof_upper_final, COMPETITION_ALPHA, return_coverage=True)
print(f"Initial OOF Winkler Score: {winkler_oof:,.2f}")
print(f"Initial OOF Coverage:      {coverage_oof:.2%}")

# --- Interval Calibration ---
# We aim for 90% coverage. Let's find the factor needed.
center = (oof_lower_final + oof_upper_final) / 2
width = oof_upper_final - oof_lower_final
best_factor = 1.0
best_coverage_diff = abs(coverage_oof - 0.90)

print("Searching for best calibration factor...")
for factor in np.arange(0.9, 1.2, 0.005):
    new_lower = center - (width / 2) * factor
    new_upper = center + (width / 2) * factor
    _, coverage = winkler_score(y_true_final, new_lower, new_upper, COMPETITION_ALPHA, return_coverage=True)
    if abs(coverage - 0.90) < best_coverage_diff:
        best_coverage_diff = abs(coverage - 0.90)
        best_factor = factor

print(f"Found best calibration factor: {best_factor:.3f}")


--- Step 5: Evaluating OOF Predictions and Calibrating ---
Initial OOF Winkler Score: 338,438.31
Initial OOF Coverage:      86.12%
Searching for best calibration factor...
Found best calibration factor: 1.120


In [13]:
# =============================================================================
#  6. CREATE FINAL SUBMISSION
# =============================================================================
print("\n--- Step 6: Creating Final Submission File ---")

# Inverse transform test predictions
test_lower_final = np.expm1(test_preds_lower)
test_upper_final = np.expm1(test_preds_upper)

# Apply the calibration factor found on our OOF predictions
print("Applying calibration to test predictions...")
test_center = (test_lower_final + test_upper_final) / 2
test_width = test_upper_final - test_lower_final
calibrated_lower = test_center - (test_width / 2) * best_factor
calibrated_upper = test_center + (test_width / 2) * best_factor

# Ensure lower <= upper again after calibration
calibrated_upper = np.maximum(calibrated_lower, calibrated_upper)

# Create submission file
submission_df = pd.DataFrame({
    'id': X_test.index,
    'pi_lower': calibrated_lower,
    'pi_upper': calibrated_upper
})

submission_df.to_csv('submission_xgb_kfold.csv', index=False)
print("\n'submission_xgb_kfold.csv' file created successfully!")
display(submission_df.head())


--- Step 6: Creating Final Submission File ---
Applying calibration to test predictions...

'submission_xgb_kfold.csv' file created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,821980.037049,1141128.0
1,200001,554608.99243,814166.5
2,200002,424258.159977,659157.3
3,200003,293653.460705,436013.1
4,200004,378902.369918,657014.0
