In [1]:
# High-scoring House Prices pipeline (LightGBM + CatBoost blend, 5-fold CV)
# Output: /kaggle/working/submission.csv

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

DATA_DIR = Path("/kaggle/input/house-prices-advanced-regression-techniques")
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")
test_ids = test["Id"].copy()

# 1) Domain-aware NA handling
none_cols = [
    'Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
    'FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond',
    'PoolQC','Fence','MiscFeature','MasVnrType'
]
for df in (train, test):
    for c in none_cols:
        if c in df.columns:
            df[c] = df[c].fillna("None")

for df in (train, test):
    for c in ['MasVnrArea','BsmtFullBath','BsmtHalfBath','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
              'TotalBsmtSF','GarageYrBlt','GarageArea','GarageCars','LotFrontage','PoolArea']:
        if c in df.columns:
            df[c] = df[c].fillna(0 if c != 'GarageYrBlt' else 0)
    for c in ['Electrical','KitchenQual','Exterior1st','Exterior2nd','SaleType','Functional']:
        if c in df.columns and df[c].isna().any():
            df[c] = df[c].fillna(df[c].mode()[0])

# 2) Ordinal encodings
ord_maps = {
    'ExterQual': {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'ExterCond': {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'BsmtQual':  {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'BsmtCond':  {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'HeatingQC': {'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'KitchenQual':{'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'FireplaceQu':{'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'GarageQual': {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'GarageCond': {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
    'PoolQC':     {'None':0,'Fa':1,'TA':2,'Gd':3,'Ex':4},
    'BsmtExposure':{'None':0,'No':1,'Mn':2,'Av':3,'Gd':4},
    'BsmtFinType1':{'None':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6},
    'BsmtFinType2':{'None':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6},
    'Functional': {'Sal':1,'Sev':2,'Maj2':3,'Maj1':4,'Mod':5,'Min2':6,'Min1':7,'Typ':8},
    'PavedDrive': {'N':0,'P':1,'Y':2},
}
for col, mp in ord_maps.items():
    if col in train.columns:
        train[col] = train[col].map(mp).astype("Int64")
        test[col]  = test[col].map(mp).astype("Int64")

# 3) Feature engineering
def add_features(df):
    df = df.copy()
    df["TotalSF"]   = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    df["TotalPorchSF"] = df[["OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","WoodDeckSF"]].sum(axis=1)
    df["TotalBath"] = df["FullBath"] + 0.5*df["HalfBath"] + df["BsmtFullBath"] + 0.5*df["BsmtHalfBath"]
    df["HouseAge"]  = df["YrSold"] - df["YearBuilt"]
    df["RemodAge"]  = df["YrSold"] - df["YearRemodAdd"]
    df["IsRemod"]   = (df["YearBuilt"] != df["YearRemodAdd"]).astype(int)
    df["GarageAge"] = np.where(df["GarageYrBlt"]>0, df["YrSold"] - df["GarageYrBlt"], -1)
    if "OverallQual" in df.columns:
        df["Qual_SF"] = df["OverallQual"] * np.log1p(df["TotalSF"])
    return df

train = add_features(train)
test  = add_features(test)

# Outlier removal
out_idx = train[(train["GrLivArea"]>4000) & (train["SalePrice"]<300000)].index
train = train.drop(index=out_idx).reset_index(drop=True)

# 4) Matrices
y_log = np.log1p(train["SalePrice"].values)
train = train.drop(columns=["SalePrice"])
X_full = train.drop(columns=["Id"])
X_test = test.drop(columns=["Id"])

# Make non-ordinal object columns categorical
def make_categories(df):
    df = df.copy()
    ord_cols = set(ord_maps.keys())
    for c in df.columns:
        if df[c].dtype == 'object' and c not in ord_cols:
            df[c] = df[c].astype("category")
    return df

X_full = make_categories(X_full)
X_test = make_categories(X_test)

# ---- FIX: ensure no NaN in categorical columns without double-adding "None"
for df in (X_full, X_test):
    for c in df.columns:
        if str(df[c].dtype) == "category":
            if "None" not in df[c].cat.categories:
                df[c] = df[c].cat.add_categories(["None"])
            df[c] = df[c].fillna("None")
        elif df[c].dtype == object:
            df[c] = df[c].fillna("None")

cat_features = [i for i,c in enumerate(X_full.columns) if str(X_full[c].dtype) == "category"]

# 5) CV + models
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X_full))
oof_cat = np.zeros(len(X_full))
pred_lgb = np.zeros(len(X_test))
pred_cat = np.zeros(len(X_test))

# LightGBM
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

lgb_params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.03,
    n_estimators=5000,
    num_leaves=16,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.3,
    min_child_samples=15,
    random_state=42
)

# CatBoost
try:
    from catboost import CatBoostRegressor, Pool
    use_catboost = True
except Exception:
    use_catboost = False

cat_params = dict(
    loss_function="RMSE",
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=5.0,
    iterations=10000,
    random_seed=42,
    task_type="CPU",
    verbose=False,
    od_type="Iter",
    od_wait=200,
    allow_writing_files=False
)

for fold, (tr_idx, va_idx) in enumerate(kfold.split(X_full), 1):
    X_tr, X_va = X_full.iloc[tr_idx], X_full.iloc[va_idx]
    y_tr, y_va = y_log[tr_idx], y_log[va_idx]

    # LightGBM
    dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_features, free_raw_data=False)
    dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_features, reference=dtr, free_raw_data=False)
    m_lgb = lgb.train(
        lgb_params,
        dtr,
        valid_sets=[dtr, dva],
        valid_names=["train","valid"],
        num_boost_round=5000,
        callbacks=[
            early_stopping(stopping_rounds=300, verbose=False),
            log_evaluation(period=200)
        ]
    )
    oof_lgb[va_idx] = m_lgb.predict(X_va, num_iteration=m_lgb.best_iteration)
    pred_lgb += m_lgb.predict(X_test, num_iteration=m_lgb.best_iteration) / kfold.n_splits

    # CatBoost
    if use_catboost:
        tr_pool = Pool(X_tr, y_tr, cat_features=cat_features)
        va_pool = Pool(X_va, y_va, cat_features=cat_features)
        m_cat = CatBoostRegressor(**cat_params)
        m_cat.fit(tr_pool, eval_set=va_pool, use_best_model=True, verbose=False)
        oof_cat[va_idx] = m_cat.predict(va_pool)
        pred_cat += m_cat.predict(Pool(X_test, cat_features=cat_features)) / kfold.n_splits

# 6) Blend
def rmsle_from_log(y_true_log, y_pred_log):
    return np.sqrt(np.mean((y_true_log - y_pred_log)**2))

if use_catboost:
    best_w, best_score = None, 9e9
    for w in np.linspace(0.0, 1.0, 51):
        oof_blend = w*oof_lgb + (1-w)*oof_cat
        score = rmsle_from_log(y_log, oof_blend)
        if score < best_score:
            best_score, best_w = score, w
    print(f"[Blend] Best OOF RMSLE (log space): {best_score:.5f} with LightGBM weight={best_w:.2f}")
    test_log = best_w*pred_lgb + (1-best_w)*pred_cat
else:
    print("[Blend] CatBoost unavailable, using LightGBM only.")
    test_log = pred_lgb

# 7) Save submission
test_preds = np.expm1(test_log)
test_preds = np.maximum(test_preds, 0)
submission = pd.DataFrame({"Id": test_ids, "SalePrice": test_preds})
out_path = "/kaggle/working/submission.csv"
submission.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
submission.head()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4165
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 83
[LightGBM] [Info] Start training from score 12.023362
[200]	train's rmse: 0.0685316	valid's rmse: 0.126226
[400]	train's rmse: 0.0458941	valid's rmse: 0.124729
[600]	train's rmse: 0.0339682	valid's rmse: 0.12452
[800]	train's rmse: 0.0262899	valid's rmse: 0.124661
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4158
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 83
[LightGBM] [Info] Start training from score 12.026498
[200]	train's rmse: 0.0696394	valid's rms

Unnamed: 0,Id,SalePrice
0,1461,117682.341437
1,1462,163881.681832
2,1463,182998.466802
3,1464,194979.855507
4,1465,184696.8891
