In [5]:
!pip install catboost

import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
train = pd.read_csv('C:\\House Price Prediction\\train.csv')
test = pd.read_csv('C:\\House Price Prediction\\test.csv')
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
train["SalePrice_Log"] = np.log1p(train["SalePrice"])
y = train["SalePrice_Log"]
train.drop(["SalePrice", "SalePrice_Log"], axis=1, inplace=True)
all_data = pd.concat([train, test], axis=0, ignore_index=True)

for col in ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
            "BsmtQual", "BsmtCond", "BsmtFinType1", "BsmtFinType2", "MasVnrType", "MSZoning", "Functional", "Utilities",
            "KitchenQual", "Exterior1st", "Exterior2nd", "SaleType"]:
    all_data[col] = all_data[col].fillna("None")

for col in ["GarageYrBlt", "GarageArea", "GarageCars", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
            "BsmtFullBath", "BsmtHalfBath", "MasVnrArea", "LotFrontage"]:
    all_data[col] = all_data[col].fillna(all_data[col].median())

numeric_feats = all_data.select_dtypes(include=["int64", "float64"]).columns
skewed_feats = all_data[numeric_feats].apply(lambda x: stats.skew(x.dropna())).sort_values(ascending=False)
skewed = skewed_feats[abs(skewed_feats) > 0.75].index
all_data[skewed] = np.log1p(all_data[skewed])

categorical_cols = all_data.select_dtypes(include=["object"]).columns
numeric_cols = all_data.select_dtypes(exclude=["object"]).columns

preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), numeric_cols),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ]), categorical_cols)
])

X_full = preprocess.fit_transform(all_data)
X = X_full[:len(y)]
X_test = X_full[len(y):]
ridge = RidgeCV(alphas=np.logspace(-3, 3, 50))
lasso = LassoCV(random_state=42, max_iter=5000)
elastic = ElasticNetCV(l1_ratio=[.1, .5, .9], random_state=42)

gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                max_depth=4, max_features="sqrt",
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=42)

xgb = XGBRegressor(objective="reg:squarederror", learning_rate=0.05,
                   n_estimators=3000, max_depth=3,
                   subsample=.7, colsample_bytree=.7, random_state=42)

lgbm = LGBMRegressor(objective='regression', num_leaves=5,
                     learning_rate=0.05, n_estimators=720,
                     max_bin=55, bagging_fraction=0.8,
                     bagging_freq=5, feature_fraction=0.2319,
                     feature_fraction_seed=9, bagging_seed=9,
                     min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

cat = CatBoostRegressor(depth=4, learning_rate=0.05,
                        iterations=5000, verbose=False, random_state=42)
def rmsle_cv(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    return np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_log_error", cv=kf, n_jobs=-1))

models = {
    "Ridge": ridge,
    "Lasso": lasso,
    "ElasticNet": elastic,
    "GBR": gbr,
    "XGBoost": xgb,
    "LightGBM": lgbm,
    "CatBoost": cat
}

for name, model in models.items():
    score = rmsle_cv(model, X, y).mean()
    print(f"{name:10s} RMSLE: {score:.5f}")

base_models = [
    ("ridge", ridge),
    ("lasso", lasso),
    ("xgb", xgb),
    ("lgbm", lgbm),
    ("cat", cat)
]
stacked = StackingRegressor(
    estimators=base_models,
    final_estimator=LassoCV(alphas=np.logspace(-4, 0, 30), random_state=42),
    cv=5,
    n_jobs=-1
)
print("\nStacked Model RMSLE:", rmsle_cv(stacked, X, y).mean().round(5))
final_model = stacked.fit(X, y)
final_preds = np.expm1(final_model.predict(X_test))
submission = pd.DataFrame({"Id": test_ID, "SalePrice": final_preds})
submission.to_csv("submission.csv", index=False)
print(" submission.csv is ready.")


Ridge      RMSLE: 0.01007
Lasso      RMSLE: 0.00991
ElasticNet RMSLE: 0.00990
GBR        RMSLE: 0.00971
XGBoost    RMSLE: 0.00961
LightGBM   RMSLE: 0.00986
CatBoost   RMSLE: 0.00962

Stacked Model RMSLE: 0.00935
 submission.csv is ready.
