In [6]:
# =============================================================================
# BLOCK 1: SETUP, IMPORTS, AND DATA LOADING
# =============================================================================
# --- Standard Libraries ---
import warnings
warnings.filterwarnings('ignore')
import time
import pandas as pd
import numpy as np
import gc

# --- Machine Learning Libraries ---
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from lightgbm import LGBMRegressor # Our main model for CQR
import optuna

print("Libraries imported successfully.")

# --- Helper Function for Winkler Score ---
def winkler_score(y_true, lower, upper, alpha=0.1, return_coverage=False):
    width = upper - lower
    penalty_lower = np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0)
    penalty_upper = np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0)
    score = width + penalty_lower + penalty_upper
    if return_coverage:
        coverage = np.mean((y_true >= lower) & (y_true <= upper))
        return np.mean(score), coverage
    return np.mean(score)

# --- Global Constants ---
N_SPLITS = 5
RANDOM_STATE = 42
DATA_PATH = './'
COMPETITION_ALPHA = 0.1 # This is the alpha for the competition metric (90% interval)

# --- Load Raw Data ---
try:
    drop_cols=['id', 'golf', 'view_rainier', 'view_skyline', 'view_lakesamm', 'view_otherwater', 'view_other']
    df_train = pd.read_csv(DATA_PATH + 'dataset.csv').drop(columns=drop_cols)
    df_test = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=drop_cols)
    print("Raw data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'dataset.csv' or 'test.csv'.")
    exit()

# --- Prepare Target Variable ---
y_true = df_train['sale_price'].copy()
print("Setup complete.")

Libraries imported successfully.
Raw data loaded successfully.
Setup complete.


In [7]:
# =============================================================================
# BLOCK 2: SYNTHESIZED FEATURE ENGINEERING
# =============================================================================
print("--- Starting Block 2: Synthesized Feature Engineering ---")

def create_synthesized_features(df_train, df_test):
    # Combine for consistent processing and reset the index
    df_train['is_train'] = 1
    df_test['is_train'] = 0
    train_ids = df_train.index
    test_ids = df_test.index
    all_data = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

    # A) Brute-Force Numerical Interactions
    NUMS = ['area', 'land_val', 'imp_val', 'sqft_lot', 'sqft', 'sqft_1', 'grade', 'year_built']
    for i in range(len(NUMS)):
        for j in range(i + 1, len(NUMS)):
            all_data[f'{NUMS[i]}_x_{NUMS[j]}'] = all_data[NUMS[i]] * all_data[NUMS[j]]

    # B) Date Features
    all_data['sale_date'] = pd.to_datetime(all_data['sale_date'])
    all_data['year'] = all_data['sale_date'].dt.year
    all_data['month'] = all_data['sale_date'].dt.month
    all_data['year_diff'] = all_data['year'] - all_data['year_built']
    
    # C) TF-IDF Text Features
    text_cols = ['subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data[text_cols] = all_data[text_cols].fillna('missing').astype(str)
    for col in text_cols:
        tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=128, binary=True)
        tfidf_matrix = tfidf.fit_transform(all_data[col])
        svd = TruncatedSVD(n_components=8, random_state=RANDOM_STATE)
        tfidf_svd = svd.fit_transform(tfidf_matrix)
        tfidf_df = pd.DataFrame(tfidf_svd, columns=[f'{col}_tfidf_svd_{i}' for i in range(8)])
        all_data = pd.concat([all_data, tfidf_df], axis=1)

    # D) Log transform
    for c in ['land_val_x_imp_val', 'land_val_x_sqft', 'imp_val_x_sqft']:
        if c in all_data.columns:
            all_data[c] = np.log1p(all_data[c].fillna(0))

    # E) Final Cleanup
    cols_to_drop = ['sale_date', 'subdivision', 'zoning', 'city', 'sale_warning', 'join_status', 'submarket']
    all_data = all_data.drop(columns=cols_to_drop)
    all_data.fillna(0, inplace=True)

    # Separate final datasets
    X = all_data[all_data['is_train'] == 1].drop(columns=['is_train', 'sale_price'])
    X_test = all_data[all_data['is_train'] == 0].drop(columns=['is_train', 'sale_price'])
    
    return X, X_test, test_ids # Return test_ids for the final submission

# --- Run Feature Engineering ---
X, X_test, test_ids = create_synthesized_features(df_train, df_test)
print(f"\nSynthesized FE complete. Total features: {X.shape[1]}")
gc.collect()

--- Starting Block 2: Synthesized Feature Engineering ---

Synthesized FE complete. Total features: 111


744

In [None]:
# =============================================================================
# BLOCK 3: CQR WITH K-FOLD AWARE OPTUNA TUNING
# =============================================================================
print("\n--- Starting CQR Pipeline with Hyperparameter Tuning ---")

# --- Initialize K-Fold and Data ---
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
grade_for_stratify = pd.read_csv(DATA_PATH + 'dataset.csv')['grade']
LOWER_ALPHA = 0.05
UPPER_ALPHA = 0.95

# ==============================================================================
# PART 1: K-FOLD AWARE TUNING FOR QUANTILE MODELS
# ==============================================================================
print("\n# Tuning LightGBM Quantile Models with K-Fold Aware Optuna...")

def objective_quantile_kfold(trial):
    # We will tune a single set of parameters that should work well for both quantiles
    params = {
        'objective': 'quantile', 'metric': 'quantile', 'random_state': RANDOM_STATE,
        'n_jobs': -1, 'verbose': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 1500, 3000),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'max_depth': trial.suggest_int('max_depth', 7, 12),
        'subsample': trial.suggest_float('subsample', 0.7, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.95),
    }
    
    # We will evaluate the parameters based on the average Winkler score
    oof_winkler_scores = []
    
    for train_idx, val_idx in skf.split(X, grade_for_stratify):
        X_train_fold, X_calib_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_calib_fold = y_true.iloc[train_idx], y_true.iloc[val_idx]

        # Train lower model
        model_lower = LGBMRegressor(**params, alpha=LOWER_ALPHA)
        model_lower.fit(X_train_fold, y_train_fold)

        # Train upper model
        model_upper = LGBMRegressor(**params, alpha=UPPER_ALPHA)
        model_upper.fit(X_train_fold, y_train_fold)

        # Get initial predictions
        calib_pred_lower = model_lower.predict(X_calib_fold)
        calib_pred_upper = model_upper.predict(X_calib_fold)
        
        # Calculate scores and correction
        nonconformity_scores = np.maximum(calib_pred_lower - y_calib_fold, y_calib_fold - calib_pred_upper)
        q_correction = np.quantile(nonconformity_scores, (1 - COMPETITION_ALPHA) * (1 + 1/len(y_calib_fold)))
        
        # Apply correction and calculate Winkler score for this fold
        fold_lower = calib_pred_lower - q_correction
        fold_upper = calib_pred_upper + q_correction
        fold_score, _ = winkler_score(y_calib_fold, fold_lower, fold_upper, return_coverage=True)
        oof_winkler_scores.append(fold_score)
        
    return np.mean(oof_winkler_scores)

# --- Run the Optuna Study ---
N_OPTUNA_TRIALS_CQR = 30 # Start with 30, this is very computationally expensive
study_quantile = optuna.create_study(direction='minimize')
study_quantile.optimize(objective_quantile_kfold, n_trials=N_OPTUNA_TRIALS_CQR)

best_params_quantile = study_quantile.best_params
print(f"\n# Quantile Model Tuning Complete. Best Avg Winkler Score: ${study_quantile.best_value:,.2f}")


# ==============================================================================
# PART 2: FINAL CQR TRAINING WITH TUNED PARAMETERS
# ==============================================================================
print("\n# Final CQR Training with Tuned Hyperparameters...")

# --- Initialize arrays to store OOF and Test predictions ---
oof_lower = np.zeros(len(X))
oof_upper = np.zeros(len(X))
final_test_lower = np.zeros(len(X_test))
final_test_upper = np.zeros(len(X_test))

# Combine the best tuned params with the fixed ones
final_params_quantile = {
    'objective': 'quantile', 'metric': 'quantile', 'random_state': RANDOM_STATE,
    'n_jobs': -1, 'verbose': -1,
    **best_params_quantile
}

# --- Main K-Fold Loop ---
for fold, (train_idx, val_idx) in enumerate(skf.split(X, grade_for_stratify)):
    print(f"\n# FOLD {fold+1}/{N_SPLITS}")
    X_train_fold, X_calib_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_calib_fold = y_true.iloc[train_idx], y_true.iloc[val_idx]

    # Train Lower Model with Tuned Params
    model_lower = LGBMRegressor(**final_params_quantile, alpha=LOWER_ALPHA)
    model_lower.fit(X_train_fold, y_train_fold)

    # Train Upper Model with Tuned Params
    model_upper = LGBMRegressor(**final_params_quantile, alpha=UPPER_ALPHA)
    model_upper.fit(X_train_fold, y_train_fold)
    
    calib_pred_lower = model_lower.predict(X_calib_fold)
    calib_pred_upper = model_upper.predict(X_calib_fold)
    
    nonconformity_scores = np.maximum(calib_pred_lower - y_calib_fold, y_calib_fold - calib_pred_upper)
    q_correction = np.quantile(nonconformity_scores, (1 - COMPETITION_ALPHA) * (1 + 1/len(y_calib_fold)))
    print(f"  Correction value 'q' for this fold: {q_correction:.2f}")

    oof_lower[val_idx] = calib_pred_lower - q_correction
    oof_upper[val_idx] = calib_pred_upper + q_correction
    final_test_lower += (model_lower.predict(X_test) - q_correction) / N_SPLITS
    final_test_upper += (model_upper.predict(X_test) + q_correction) / N_SPLITS

# --- Calculate and Display Final OOF Score ---
print("\n" + "="*50)
print("# CQR K-Fold Training Complete.")
final_winkler_score, final_coverage = winkler_score(y_true, oof_lower, oof_upper, return_coverage=True)
print(f"# Final OOF Winkler Score: {final_winkler_score:,.2f}")
print(f"# Final OOF Coverage: {final_coverage:.2%}")
print("="*50)


--- Starting CQR Pipeline with Hyperparameter Tuning ---


[I 2025-07-09 14:05:05,543] A new study created in memory with name: no-name-b912af93-0034-4abe-9923-2e863196989d



# Tuning LightGBM Quantile Models with K-Fold Aware Optuna...


[I 2025-07-09 14:06:44,634] Trial 0 finished with value: 341350.43243549863 and parameters: {'learning_rate': 0.029163306289483194, 'n_estimators': 1840, 'num_leaves': 24, 'max_depth': 11, 'subsample': 0.8841312008294709, 'colsample_bytree': 0.7180920315446796}. Best is trial 0 with value: 341350.43243549863.
[I 2025-07-09 14:08:40,568] Trial 1 finished with value: 338143.326838844 and parameters: {'learning_rate': 0.040216738030709735, 'n_estimators': 2198, 'num_leaves': 28, 'max_depth': 12, 'subsample': 0.8269627113129824, 'colsample_bytree': 0.7910258013563782}. Best is trial 1 with value: 338143.326838844.
[I 2025-07-09 14:10:17,615] Trial 2 finished with value: 338286.5339959337 and parameters: {'learning_rate': 0.043492387896136894, 'n_estimators': 1902, 'num_leaves': 28, 'max_depth': 8, 'subsample': 0.8069694636860422, 'colsample_bytree': 0.8314257543816337}. Best is trial 1 with value: 338143.326838844.
[I 2025-07-09 14:11:39,071] Trial 3 finished with value: 343820.80441999936


# Quantile Model Tuning Complete. Best Avg Winkler Score: $335,250.94

# Final CQR Training with Tuned Hyperparameters...

# FOLD 1/5


In [None]:
# =============================================================================
# BLOCK 4: FINAL SUBMISSION
# =============================================================================
print("\n--- STAGE 3: Creating Final Submission File ---")

# Ensure the upper bound is always greater than or equal to the lower bound
final_test_upper = np.maximum(final_test_lower, final_test_upper)

# Create submission dataframe
submission_df = pd.DataFrame({
    'id': test_ids,
    'pi_lower': final_test_lower,
    'pi_upper': final_test_upper
})

submission_df.to_csv('submission_cqr_v1.csv', index=False)
print("\n'submission_cqr_v1.csv' created successfully!")
display(submission_df.head())