In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder

# Load only the training data for now
df = pd.read_csv("dataset.csv", index_col="id", parse_dates=["sale_date"])
print("Original training data loaded successfully.")
print(f"Shape of original data: {df.shape}")

Original training data loaded successfully.
Shape of original data: (200000, 46)


In [2]:
print("\n--- Step 1: Splitting data and transforming the target ---")

# Separate features (X) and target (y)
X = df.drop('sale_price', axis=1)
y = df['sale_price']

# Apply log transform to the target variable
y_log = np.log1p(y)
print("Target variable 'sale_price' has been log-transformed.")

# Split the data into training (80%) and validation (20%) sets
# We stratify by 'grade' to ensure both sets have a similar distribution of house quality
# This creates a more reliable validation set.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42, stratify=X['grade']
)

print(f"Data split into training and validation sets:")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape:   {X_val.shape}")


--- Step 1: Splitting data and transforming the target ---
Target variable 'sale_price' has been log-transformed.
Data split into training and validation sets:
X_train shape: (160000, 45)
X_val shape:   (40000, 45)


In [3]:
print("\n--- Step 2: Defining the feature engineering function ---")

def feature_engineer(df):
    """
    This function takes a dataframe (either train or val) and applies
    all the feature engineering steps.
    """
    # Make a copy to avoid modifying the original dataframe
    df_processed = df.copy()

    # --- Tier 1: Bedrock Features ---
    
    # A) Date-Based Features
    df_processed['sale_year'] = df_processed['sale_date'].dt.year
    df_processed['sale_month'] = df_processed['sale_date'].dt.month
    df_processed['sale_dayofyear'] = df_processed['sale_date'].dt.dayofyear
    df_processed['age_at_sale'] = df_processed['sale_year'] - df_processed['year_built']
    
    # Handle 'year_reno' where 0 means never renovated
    df_processed['time_since_reno'] = np.where(
        df_processed['year_reno'] > 0,
        df_processed['sale_year'] - df_processed['year_reno'],
        df_processed['age_at_sale'] 
    )
    
    # --- Tier 2: Refinement and Interaction ---
    
    # B) Feature Aggregation & Combination
    df_processed['total_bathrooms'] = (
        df_processed['bath_full'] + 
        0.75 * df_processed['bath_3qtr'] + 
        0.5 * df_processed['bath_half']
    )
    df_processed['total_sqft'] = (
        df_processed['sqft'] + 
        df_processed['sqft_fbsmt'] + 
        df_processed['gara_sqft']
    )

    # C) Insightful Ratio Features
    # Add 1 to denominators to prevent division by zero errors
    df_processed['imp_to_land_ratio'] = df_processed['imp_val'] / (df_processed['land_val'] + 1)
    df_processed['sqft_per_room'] = df_processed['sqft'] / (df_processed['beds'] + df_processed['total_bathrooms'] + 1)
    df_processed['lot_to_house_ratio'] = df_processed['sqft_lot'] / (df_processed['total_sqft'] + 1)

    # D) Polynomial and Interaction Features for top predictors
    df_processed['grade_sq'] = df_processed['grade']**2
    df_processed['total_sqft_sq'] = df_processed['total_sqft']**2
    df_processed['grade_x_sqft'] = df_processed['grade'] * df_processed['total_sqft']
    
    return df_processed

# Apply the function to our train and validation sets
X_train_fe = feature_engineer(X_train)
X_val_fe = feature_engineer(X_val)

print("Feature engineering function created and applied to train/val sets.")
print(f"Shape of X_train after FE: {X_train_fe.shape}")



--- Step 2: Defining the feature engineering function ---
Feature engineering function created and applied to train/val sets.
Shape of X_train after FE: (160000, 58)


In [4]:
print("\n--- Step 3: Engineering location clusters ---")

# Initialize the KMeans model
kmeans = KMeans(n_clusters=20, random_state=42, n_init='auto')

# Fit ONLY on the training data's lat/lon to learn the cluster centers
kmeans.fit(X_train_fe[['latitude', 'longitude']])
print("KMeans model fitted on training data.")

# Now, create the 'location_cluster' feature in both sets using the fitted model
X_train_fe['location_cluster'] = kmeans.predict(X_train_fe[['latitude', 'longitude']])
X_val_fe['location_cluster'] = kmeans.predict(X_val_fe[['latitude', 'longitude']])
print("Location cluster feature added to both train and val sets.")



--- Step 3: Engineering location clusters ---
KMeans model fitted on training data.
Location cluster feature added to both train and val sets.


In [5]:
print("\n--- Step 4: Handling missing values and final cleanup ---")

# A) Impute 'sale_nbr'
# Calculate the median from the training set ONLY
sale_nbr_median = X_train_fe['sale_nbr'].median()
print(f"Median for 'sale_nbr' calculated from training data: {sale_nbr_median}")

# Fill missing values in both sets with this median
X_train_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)
X_val_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)

# B) Impute 'submarket'
# We will treat 'missing' as its own category
X_train_fe['submarket'].fillna('missing', inplace=True)
X_val_fe['submarket'].fillna('missing', inplace=True)
print("Missing values for 'sale_nbr' and 'submarket' handled.")

# C) Drop columns that are no longer needed or are too noisy
cols_to_drop = [
    'sale_date',       # Used to create time-based features
    'year_built',      # Replaced by 'age_at_sale'
    'year_reno',       # Replaced by 'time_since_reno'
    'bath_full', 'bath_3qtr', 'bath_half', # Combined into 'total_bathrooms'
    'subdivision',     # Too noisy and high-cardinality
]

X_train_cleaned = X_train_fe.drop(columns=cols_to_drop)
X_val_cleaned = X_val_fe.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} redundant/noisy columns.")
print(f"Shape of X_train after cleanup: {X_train_cleaned.shape}")


--- Step 4: Handling missing values and final cleanup ---
Median for 'sale_nbr' calculated from training data: 2.0
Missing values for 'sale_nbr' and 'submarket' handled.
Dropped 7 redundant/noisy columns.
Shape of X_train after cleanup: (160000, 52)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [6]:
print("\n--- Step 5: Encoding categorical features ---")

# Identify categorical columns (those with 'object' dtype)
categorical_cols = X_train_cleaned.select_dtypes(include='object').columns.tolist()
print(f"Categorical columns to be encoded: {categorical_cols}")

# Use OrdinalEncoder as a robust baseline for tree models
# We fit on the training data and transform both sets
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_cleaned[categorical_cols] = encoder.fit_transform(X_train_cleaned[categorical_cols])
X_val_cleaned[categorical_cols] = encoder.transform(X_val_cleaned[categorical_cols])
print("Categorical features have been OrdinalEncoded.")


--- Step 5: Encoding categorical features ---
Categorical features have been OrdinalEncoded.


In [7]:
print("\n--- Step 6: Final datasets are ready for modeling! ---")

# Ensure column order is the same in both dataframes
X_val_final = X_val_cleaned[X_train_cleaned.columns]
X_train_final = X_train_cleaned

print("Final prepared dataframes:")
print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_val_final shape:   {X_val_final.shape}")
print(f"y_train_log shape:   {y_train_log.shape}")
print(f"y_val_log shape:     {y_val_log.shape}")

print("\nFirst 5 rows of the final training features:")
display(X_train_final.head())


--- Step 6: Final datasets are ready for modeling! ---
Final prepared dataframes:
X_train_final shape: (160000, 52)
X_val_final shape:   (40000, 52)
y_train_log shape:   (160000,)
y_val_log shape:     (40000,)

First 5 rows of the final training features:


Unnamed: 0_level_0,sale_nbr,sale_warning,join_status,join_year,latitude,longitude,area,city,zoning,present_use,...,time_since_reno,total_bathrooms,total_sqft,imp_to_land_ratio,sqft_per_room,lot_to_house_ratio,grade_sq,total_sqft_sq,grade_x_sqft,location_cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
180173,2.0,0.0,3.0,2025,47.722,-122.2872,8,33.0,203.0,2,...,10,2.5,3020,0.83991,346.153846,3.045349,49,9120400,21140,3
25977,2.0,0.0,2.0,2025,47.5972,-122.2995,15,33.0,97.0,29,...,0,2.5,1910,1.430416,280.0,0.940869,64,3648100,15280,9
120930,1.0,0.0,3.0,2025,47.6928,-122.3832,39,33.0,203.0,2,...,51,1.5,1420,0.490384,258.181818,4.228712,64,2016400,11360,12
5329,2.0,0.0,3.0,2025,47.2895,-122.2106,28,1.0,325.0,2,...,60,1.0,1330,2.114271,208.0,6.793388,49,1768900,9310,2
87712,3.0,0.0,2.0,2025,47.5864,-121.9667,69,31.0,281.0,2,...,13,3.5,5240,2.510237,440.0,2.736882,121,27457600,57640,0


In [9]:
import lightgbm as lgb
import pandas as pd
import numpy as np

# --- Competition and Model Parameters ---
# This is the alpha for the 90% prediction interval (100% - 90% = 10% -> 0.1)
# We defined this in the competition overview.
COMPETITION_ALPHA = 0.1
LOWER_QUANTILE = COMPETITION_ALPHA / 2  # -> 0.05
UPPER_QUANTILE = 1 - (COMPETITION_ALPHA / 2) # -> 0.95

# --- Define the Winkler Score Function (from starter notebook) ---
def winkler_score(y_true, lower, upper, alpha, return_coverage=False):
    width = upper - lower
    penalty_lower = (2 / alpha) * (lower - y_true) * (y_true < lower)
    penalty_upper = (2 / alpha) * (y_true - upper) * (y_true > upper)
    score = width + penalty_lower + penalty_upper
    
    if return_coverage:
        inside = (y_true >= lower) & (y_true <= upper)
        coverage = np.mean(inside)
        return np.mean(score), coverage
    return np.mean(score)


print("\n--- Step 7: Training baseline LightGBM models ---")

# --- Common LightGBM parameters for our baseline model ---
# These are reasonable defaults to start with.
params = {
    'objective': 'quantile',
    'metric': 'quantile',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}

# --- Train the Lower Quantile Model ---
print(f"Training model for lower quantile: {LOWER_QUANTILE}")
params['alpha'] = LOWER_QUANTILE
model_lower = lgb.LGBMRegressor(**params)
model_lower.fit(
    X_train_final, 
    y_train_log,
    eval_set=[(X_val_final, y_val_log)],
    eval_metric='quantile',
    callbacks=[lgb.early_stopping(100, verbose=False)] # Stop if validation score doesn't improve for 100 rounds
)

# --- Train the Upper Quantile Model ---
print(f"Training model for upper quantile: {UPPER_QUANTILE}")
params['alpha'] = UPPER_QUANTILE
model_upper = lgb.LGBMRegressor(**params)
model_upper.fit(
    X_train_final, 
    y_train_log,
    eval_set=[(X_val_final, y_val_log)],
    eval_metric='quantile',
    callbacks=[lgb.early_stopping(100, verbose=False)] # Use early stopping here as well
)

print("Baseline models trained successfully.")


--- Step 7: Training baseline LightGBM models ---
Training model for lower quantile: 0.05
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
Training model for upper quantile: 0.95
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094
Baseline models trained successfully.


In [10]:
print("\n--- Step 8: Evaluating the baseline model ---")

# Predict on the validation set
preds_lower_log = model_lower.predict(X_val_final)
preds_upper_log = model_upper.predict(X_val_final)

# IMPORTANT: Inverse transform the predictions back to the original dollar scale
# We trained on log(price), so we must convert back using expm1
preds_lower = np.expm1(preds_lower_log)
preds_upper = np.expm1(preds_upper_log)

# Also, get the true values for the validation set back to the dollar scale
y_val_true = np.expm1(y_val_log)

# Ensure the lower bound is never higher than the upper bound
preds_upper = np.maximum(preds_lower, preds_upper)

# Calculate the Winkler score and coverage on the validation set
winkler, coverage = winkler_score(
    y_val_true, 
    preds_lower, 
    preds_upper,
    alpha=COMPETITION_ALPHA,
    return_coverage=True
)

print("\n--- Baseline Model Performance ---")
print(f"Winkler Score on Validation Set: {winkler:,.2f}")
print(f"Coverage on Validation Set:      {coverage:.2%}")
print("-----------------------------------")
print(f"Our goal is 90% coverage. If coverage is too low, our intervals are too narrow.")
print(f"If coverage is too high, our intervals are too wide and can be narrowed to improve the score.")


--- Step 8: Evaluating the baseline model ---

--- Baseline Model Performance ---
Winkler Score on Validation Set: 338,360.60
Coverage on Validation Set:      86.89%
-----------------------------------
Our goal is 90% coverage. If coverage is too low, our intervals are too narrow.
If coverage is too high, our intervals are too wide and can be narrowed to improve the score.


In [11]:
import optuna

print("\n--- Step 9: Setting up Optuna for hyperparameter tuning ---")

def objective(trial):
    """
    This is the objective function that Optuna will try to minimize.
    It trains a pair of models with a given set of hyperparameters and
    returns the Winkler score.
    """
    
    # Define the search space for hyperparameters
    params = {
        'objective': 'quantile',
        'metric': 'quantile',
        'random_state': 42,
        'n_jobs': -1,
        'n_estimators': 1000, # We use early stopping, so this can be a large number
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    # --- Train Lower Model ---
    params['alpha'] = LOWER_QUANTILE
    model_lower_opt = lgb.LGBMRegressor(**params)
    model_lower_opt.fit(X_train_final, y_train_log,
                        eval_set=[(X_val_final, y_val_log)],
                        callbacks=[lgb.early_stopping(100, verbose=False)])

    # --- Train Upper Model ---
    params['alpha'] = UPPER_QUANTILE
    model_upper_opt = lgb.LGBMRegressor(**params)
    model_upper_opt.fit(X_train_final, y_train_log,
                        eval_set=[(X_val_final, y_val_log)],
                        callbacks=[lgb.early_stopping(100, verbose=False)])

    # --- Evaluate ---
    preds_lower_log = model_lower_opt.predict(X_val_final)
    preds_upper_log = model_upper_opt.predict(X_val_final)
    
    preds_lower = np.expm1(preds_lower_log)
    preds_upper = np.expm1(preds_upper_log)
    y_val_true = np.expm1(y_val_log)
    preds_upper = np.maximum(preds_lower, preds_upper)
    
    # We want to minimize the Winkler score
    score = winkler_score(y_val_true, preds_lower, preds_upper, alpha=COMPETITION_ALPHA)
    
    return score

# Create an Optuna study object and specify we want to minimize the score
study = optuna.create_study(direction='minimize')

# Start the optimization process. Let's run 25 trials as a strong starting point.
# You can increase this number for better results if you have time.
print("Starting Optuna optimization. This may take a while...")
study.optimize(objective, n_trials=25)

# --- Get the best results ---
best_params = study.best_params
best_score = study.best_value

print("\n--- Optuna Tuning Results ---")
print(f"Best Winkler Score Found: {best_score:,.2f}")
print("Best Hyperparameters Found:")
print(best_params)

[I 2025-07-02 14:50:59,368] A new study created in memory with name: no-name-c6a0c456-8cda-43c2-864a-0c5a631a381d



--- Step 9: Setting up Optuna for hyperparameter tuning ---
Starting Optuna optimization. This may take a while...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004539 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094
















[I 2025-07-02 14:51:12,055] Trial 0 finished with value: 341151.8231182601 and parameters: {'learning_rate': 0.05117425527743049, 'num_leaves': 110, 'max_depth': 4, 'subsample': 0.7475483121125408, 'colsample_bytree': 0.6020526895057539, 'reg_alpha': 0.04398228371431861, 'reg_lambda': 1.8561424070805954e-07}. Best is trial 0 with value: 341151.8231182601.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094












[I 2025-07-02 14:52:08,845] Trial 1 finished with value: 354061.71300318214 and parameters: {'learning_rate': 0.016420519727270214, 'num_leaves': 254, 'max_depth': 9, 'subsample': 0.7945844624576369, 'colsample_bytree': 0.889097450090883, 'reg_alpha': 0.000518053669383454, 'reg_lambda': 0.00344788768102031}. Best is trial 0 with value: 341151.8231182601.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117














[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


















[I 2025-07-02 14:52:18,705] Trial 2 finished with value: 342707.47172921256 and parameters: {'learning_rate': 0.059187158147142316, 'num_leaves': 98, 'max_depth': 4, 'subsample': 0.721174914637648, 'colsample_bytree': 0.8780325884378138, 'reg_alpha': 1.907612864568919e-06, 'reg_lambda': 0.00042428085666433015}. Best is trial 0 with value: 341151.8231182601.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
















[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


















[I 2025-07-02 14:52:33,175] Trial 3 finished with value: 341427.3536181021 and parameters: {'learning_rate': 0.05217115811122722, 'num_leaves': 300, 'max_depth': 6, 'subsample': 0.7657168444698611, 'colsample_bytree': 0.997332962717374, 'reg_alpha': 7.092851528788271e-05, 'reg_lambda': 2.1539432288730722e-08}. Best is trial 0 with value: 341151.8231182601.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
















[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013744 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094
















[I 2025-07-02 14:52:50,314] Trial 4 finished with value: 358669.3505409601 and parameters: {'learning_rate': 0.011883655113871698, 'num_leaves': 235, 'max_depth': 6, 'subsample': 0.8387141472389934, 'colsample_bytree': 0.6026309076817724, 'reg_alpha': 0.08677795469907874, 'reg_lambda': 2.2134614244652855e-07}. Best is trial 0 with value: 341151.8231182601.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:53:16,515] Trial 5 finished with value: 339448.4341224103 and parameters: {'learning_rate': 0.02363377421497667, 'num_leaves': 73, 'max_depth': 8, 'subsample': 0.6166821715210286, 'colsample_bytree': 0.6374974419049068, 'reg_alpha': 0.35447608844362116, 'reg_lambda': 1.0106744669468788}. Best is trial 5 with value: 339448.4341224103.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
















[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094
















[I 2025-07-02 14:53:29,492] Trial 6 finished with value: 354351.26503552153 and parameters: {'learning_rate': 0.029884519678379802, 'num_leaves': 197, 'max_depth': 4, 'subsample': 0.868129799313675, 'colsample_bytree': 0.7313252429700465, 'reg_alpha': 2.6999020331182765e-05, 'reg_lambda': 1.6685246185692912e-07}. Best is trial 5 with value: 339448.4341224103.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:53:48,839] Trial 7 finished with value: 337432.1306450799 and parameters: {'learning_rate': 0.05136229456153555, 'num_leaves': 35, 'max_depth': 12, 'subsample': 0.6199429385127754, 'colsample_bytree': 0.7938037035413013, 'reg_alpha': 4.796634067288386e-06, 'reg_lambda': 2.1357556586038686e-07}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117














[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094
















[I 2025-07-02 14:53:58,911] Trial 8 finished with value: 346395.6392974201 and parameters: {'learning_rate': 0.043247619863880725, 'num_leaves': 41, 'max_depth': 4, 'subsample': 0.9195672488996396, 'colsample_bytree': 0.91098197280538, 'reg_alpha': 2.5363244402282332e-05, 'reg_lambda': 0.3914018937838617}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117










[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


















[I 2025-07-02 14:54:14,696] Trial 9 finished with value: 342737.310856202 and parameters: {'learning_rate': 0.07330300210185585, 'num_leaves': 171, 'max_depth': 7, 'subsample': 0.6709593230809118, 'colsample_bytree': 0.8767052759353473, 'reg_alpha': 0.02783711964620683, 'reg_lambda': 0.03497071272710269}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:54:26,607] Trial 10 finished with value: 338083.9877090255 and parameters: {'learning_rate': 0.08989261678468548, 'num_leaves': 29, 'max_depth': 12, 'subsample': 0.9863988947900854, 'colsample_bytree': 0.7185782068898168, 'reg_alpha': 1.9342767534174875e-08, 'reg_lambda': 1.0645342763293711e-05}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:54:36,648] Trial 11 finished with value: 338022.2671040787 and parameters: {'learning_rate': 0.08858420068080758, 'num_leaves': 24, 'max_depth': 12, 'subsample': 0.9832785409662368, 'colsample_bytree': 0.7542457611247017, 'reg_alpha': 1.24641555376299e-08, 'reg_lambda': 4.6402389163988595e-06}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004366 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:54:44,798] Trial 12 finished with value: 338304.9784802868 and parameters: {'learning_rate': 0.09863115468682584, 'num_leaves': 20, 'max_depth': 12, 'subsample': 0.9965026331196464, 'colsample_bytree': 0.7677481406178455, 'reg_alpha': 3.728703856142415e-08, 'reg_lambda': 2.3699028856917975e-05}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:55:10,368] Trial 13 finished with value: 346088.71182640974 and parameters: {'learning_rate': 0.036222817165135146, 'num_leaves': 126, 'max_depth': 10, 'subsample': 0.6065933911412881, 'colsample_bytree': 0.8043835929728521, 'reg_alpha': 4.839596549453639e-07, 'reg_lambda': 7.2863752209998145e-06}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:55:20,054] Trial 14 finished with value: 342249.37435966637 and parameters: {'learning_rate': 0.0663829289516678, 'num_leaves': 79, 'max_depth': 11, 'subsample': 0.8878610398132031, 'colsample_bytree': 0.8059414568292559, 'reg_alpha': 1.997550311947532e-06, 'reg_lambda': 1.6519693924844827e-06}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:55:31,896] Trial 15 finished with value: 340428.9389119234 and parameters: {'learning_rate': 0.07560311887261985, 'num_leaves': 56, 'max_depth': 10, 'subsample': 0.6749825962787586, 'colsample_bytree': 0.677405235951517, 'reg_alpha': 1.7523222833202018e-07, 'reg_lambda': 0.00010035279601549515}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:55:54,428] Trial 16 finished with value: 347759.1099374681 and parameters: {'learning_rate': 0.02663225423208605, 'num_leaves': 137, 'max_depth': 11, 'subsample': 0.939615540286661, 'colsample_bytree': 0.8284722228954705, 'reg_alpha': 0.006228935093079695, 'reg_lambda': 1.358175244662994e-08}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:56:14,386] Trial 17 finished with value: 338774.39840195444 and parameters: {'learning_rate': 0.04004060984905781, 'num_leaves': 63, 'max_depth': 12, 'subsample': 0.836513001975913, 'colsample_bytree': 0.7459264443075538, 'reg_alpha': 1.25758957368733e-08, 'reg_lambda': 2.2163892925910344e-06}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094




[I 2025-07-02 14:56:23,911] Trial 18 finished with value: 352959.19979272806 and parameters: {'learning_rate': 0.0813592862696764, 'num_leaves': 163, 'max_depth': 9, 'subsample': 0.6904723127072132, 'colsample_bytree': 0.7007299022594938, 'reg_alpha': 0.0009047297823645303, 'reg_lambda': 0.0005515472275681096}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:56:48,551] Trial 19 finished with value: 343914.5727142476 and parameters: {'learning_rate': 0.02049724671615418, 'num_leaves': 93, 'max_depth': 11, 'subsample': 0.9449942158378488, 'colsample_bytree': 0.770053249851339, 'reg_alpha': 6.7432766111685895e-06, 'reg_lambda': 7.419843109952223e-07}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:57:03,651] Trial 20 finished with value: 337564.8793420275 and parameters: {'learning_rate': 0.050669515525802795, 'num_leaves': 47, 'max_depth': 10, 'subsample': 0.6391430186701976, 'colsample_bytree': 0.8328670587288096, 'reg_alpha': 2.807205122671311e-07, 'reg_lambda': 5.96512437655239e-05}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:57:24,766] Trial 21 finished with value: 338156.48444615514 and parameters: {'learning_rate': 0.04832302824437795, 'num_leaves': 45, 'max_depth': 10, 'subsample': 0.645796167695567, 'colsample_bytree': 0.8410496001831844, 'reg_alpha': 1.576053409870184e-07, 'reg_lambda': 8.712566953324546e-05}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:57:37,853] Trial 22 finished with value: 339186.03192229336 and parameters: {'learning_rate': 0.06071572021211748, 'num_leaves': 22, 'max_depth': 12, 'subsample': 0.6319664568915128, 'colsample_bytree': 0.9543091392990278, 'reg_alpha': 6.010378389620351e-07, 'reg_lambda': 9.510876119814432}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:57:52,959] Trial 23 finished with value: 340500.14753571915 and parameters: {'learning_rate': 0.09948141594241255, 'num_leaves': 56, 'max_depth': 11, 'subsample': 0.7220899241597214, 'colsample_bytree': 0.8337328978168087, 'reg_alpha': 5.500264487779032e-08, 'reg_lambda': 0.004635150593997513}. Best is trial 7 with value: 337432.1306450799.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013852 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Start training from score 14.176094


[I 2025-07-02 14:58:34,751] Trial 24 finished with value: 339393.193441584 and parameters: {'learning_rate': 0.03800269926367416, 'num_leaves': 84, 'max_depth': 9, 'subsample': 0.6504901547077621, 'colsample_bytree': 0.7715312337038833, 'reg_alpha': 3.3607696424220252, 'reg_lambda': 3.705160363421949e-05}. Best is trial 7 with value: 337432.1306450799.



--- Optuna Tuning Results ---
Best Winkler Score Found: 337,432.13
Best Hyperparameters Found:
{'learning_rate': 0.05136229456153555, 'num_leaves': 35, 'max_depth': 12, 'subsample': 0.6199429385127754, 'colsample_bytree': 0.7938037035413013, 'reg_alpha': 4.796634067288386e-06, 'reg_lambda': 2.1357556586038686e-07}


In [12]:
print("\n--- Step 10: Training final model on all data and creating submission ---")

# --- First, prepare the official test set using the same pipeline ---

# Load the test data
df_test = pd.read_csv("test.csv", index_col="id", parse_dates=["sale_date"])
X_test = df_test.copy()

# 1. Apply Feature Engineering
X_test_fe = feature_engineer(X_test)

# 2. Add Location Clusters (using the kmeans model already fitted on the training data)
X_test_fe['location_cluster'] = kmeans.predict(X_test_fe[['latitude', 'longitude']])

# 3. Handle Missing Values (using the median from the training set)
X_test_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)
X_test_fe['submarket'].fillna('missing', inplace=True)

# 4. Drop same columns
X_test_cleaned = X_test_fe.drop(columns=cols_to_drop)

# 5. Encode Categoricals (using the encoder already fitted on the training data)
X_test_cleaned[categorical_cols] = encoder.transform(X_test_cleaned[categorical_cols])

# 6. Ensure column order matches
X_test_final = X_test_cleaned[X_train_final.columns]
print("Official test data processed successfully.")


# --- Now, train the final models using the best parameters from Optuna on ALL training data ---

# Combine our train and val sets back into one full training set
X_full_train = pd.concat([X_train_final, X_val_final], axis=0)
y_full_log = pd.concat([y_train_log, y_val_log], axis=0)

# Get the best params and add required fixed params
final_params = study.best_params
final_params['objective'] = 'quantile'
final_params['metric'] = 'quantile'
final_params['random_state'] = 42
final_params['n_jobs'] = -1

# Train Final Lower Model
print("Training final lower model on all data...")
final_params['alpha'] = LOWER_QUANTILE
final_model_lower = lgb.LGBMRegressor(**final_params, n_estimators=1200) # Use a bit more estimators for final model
final_model_lower.fit(X_full_train, y_full_log)

# Train Final Upper Model
print("Training final upper model on all data...")
final_params['alpha'] = UPPER_QUANTILE
final_model_upper = lgb.LGBMRegressor(**final_params, n_estimators=1200)
final_model_upper.fit(X_full_train, y_full_log)


# --- Generate predictions for the test set ---
test_preds_lower_log = final_model_lower.predict(X_test_final)
test_preds_upper_log = final_model_upper.predict(X_test_final)

# Inverse transform to get dollar values
test_preds_lower = np.expm1(test_preds_lower_log)
test_preds_upper = np.expm1(test_preds_upper_log)

# Ensure lower <= upper
test_preds_upper = np.maximum(test_preds_lower, test_preds_upper)

# --- Create submission file ---
submission_df = pd.DataFrame({
    'id': X_test_final.index,
    'pi_lower': test_preds_lower,
    'pi_upper': test_preds_upper
})

submission_df.to_csv('submission.csv', index=False)
print("\n'submission.csv' file created successfully!")
display(submission_df.head())


--- Step 10: Training final model on all data and creating submission ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_fe['sale_nbr'].fillna(sale_nbr_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_fe['submarket'].fillna('missing', inplace=True)


Official test data processed successfully.
Training final lower model on all data...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4977
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 52
[LightGBM] [Info] Start training from score 12.128117
Training final upper model on all data...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4977
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 52
[LightGBM] [Info] Start training from score 14.175982

'submission.csv' file created successfully!


Unnamed: 0,id,pi_lower,pi_upper
0,200000,797786.81637,1088439.0
1,200001,576454.793222,794323.9
2,200002,441414.157489,643014.3
3,200003,301582.766598,438685.8
4,200004,398875.432742,687219.1
