In [None]:
# Import libraries

import pandas as pd
import numpy as np

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Read the training CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="csv files/House_Price_train.csv")

In [None]:
# Quick structural summary of the loaded DataFrame
df.info()

In [None]:
# Target transformation: store log(1 + SalePrice) alongside the raw price

df["log_price"] = np.log1p(df.loc[:, "SalePrice"])


In [None]:
# Build the feature matrix X and the target vector y

# Remove the identifier and both price columns from the features; the
# membership check skips any of them that is not present in df.
drop_cols = [name for name in ("Id", "SalePrice", "log_price") if name in df.columns]
X = df.drop(columns=drop_cols)

# The target is the log-transformed price
y = df["log_price"]


In [None]:
# Split the feature names by dtype: object columns are treated as
# categorical, every other column as numeric

num_cols = X.select_dtypes(exclude=["object"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

In [None]:
# Handle missing values

# Categorical: missing -> the literal string "None", then cast to str
# (done before encoding)
X[cat_cols] = (
    X[cat_cols]
    .fillna(value="None")
    .astype(str)
)

# Numeric: missing -> per-column median
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split further down — confirm this is acceptable.
X[num_cols] = X[num_cols].fillna(value=X[num_cols].median())



In [None]:
# Ordinal-encode the categorical columns

# Categories not seen during fit are encoded as -1 at transform time
encoder = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Fit and transform as two explicit steps on the same columns
encoder.fit(X[cat_cols])
X[cat_cols] = encoder.transform(X[cat_cols])


In [None]:
# Train/test split: hold out 20% of the rows, fixed seed for repeatability

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
# Baseline CatBoost regressor, fitted on the training split

cat_model = CatBoostRegressor(
    loss_function="RMSE",
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3,
    verbose=200,
    random_seed=42,
)

# Targets are on the log1p scale (y was built from log_price)
cat_model.fit(X_train, y_train)


In [None]:
# Predict on the held-out split with the baseline model

# The model was fitted on log1p(SalePrice), so predictions are on that scale
log_preds = cat_model.predict(X_test)
# Undo the log1p transform to get predictions back on the price scale
preds = np.expm1(log_preds)


In [None]:
# Evaluate the baseline model on the price scale

# Move the held-out targets back from the log1p scale
true_prices = np.expm1(y_test)

r2 = r2_score(y_true=true_prices, y_pred=preds)
rmse = np.sqrt(mean_squared_error(y_true=true_prices, y_pred=preds))

print("CatBoost RMSE:", rmse)
print("CatBoost R²:", r2)

# Tuning: feature engineering and a re-tuned CatBoost model

In [None]:
# Engineered features are added to a new frame; X itself is left unchanged.
X_fe = X.assign(
    # Combined square footage: basement + first floor + second floor
    TotalSF=X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"],
    # Years between construction / last remodel and the sale
    Age=X["YrSold"] - X["YearBuilt"],
    RemodAge=X["YrSold"] - X["YearRemodAdd"],
    # Interaction between overall quality and above-ground living area
    OverallQual_SF=X["OverallQual"] * X["GrLivArea"],
)


In [None]:
# Safety-net preprocessing on the engineered frame.
#
# X_fe is derived from X after X was imputed and ordinal-encoded, so at this
# point it is expected to contain no object columns and no missing values.
# The steps below only do work if that expectation does not hold.
cat_cols = X_fe.select_dtypes(include="object").columns
num_cols = X_fe.select_dtypes(exclude="object").columns

# Numeric -> median (a no-op when nothing is missing)
X_fe[num_cols] = X_fe[num_cols].fillna(X_fe[num_cols].median())

# Only run the categorical steps when raw string columns are actually present.
# Refitting on an empty column selection would replace the encoder fitted in
# the baseline section with one that knows no categories.
if len(cat_cols) > 0:
    X_fe[cat_cols] = X_fe[cat_cols].fillna("None").astype(str)

    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )

    X_fe[cat_cols] = encoder.fit_transform(X_fe[cat_cols])


In [None]:
# Re-split using the engineered features. train_test_split is already imported
# in the first cell. The same test_size and random_state as the baseline split
# are used, with the same target y.
X_train_fe, X_test_fe, y_train, y_test = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
# Train the feature-engineered model with early stopping.
#
# Early stopping needs a monitoring set. Using the test split for that lets
# the test data decide when training stops, which makes the test metrics
# reported afterwards optimistic. A validation split is therefore carved out
# of the training data, and the test split stays untouched until evaluation.
X_fit_fe, X_val_fe, y_fit, y_val = train_test_split(
    X_train_fe,
    y_train,
    test_size=0.2,
    random_state=42
)

cat_model_fe = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.025,
    depth=8,
    l2_leaf_reg=4,
    bagging_temperature=0.5,
    random_strength=1.0,
    loss_function="RMSE",
    random_seed=42,
    verbose=200
)

# Stop once the validation metric has not improved for 300 rounds
cat_model_fe.fit(
    X_fit_fe,
    y_fit,
    eval_set=(X_val_fe, y_val),
    early_stopping_rounds=300
)


In [None]:
# Evaluate the feature-engineered model on the held-out split.
# numpy and the sklearn metrics are already imported in the first cell.

# Predictions are on the log1p scale; convert them back to prices
log_preds_fe = cat_model_fe.predict(X_test_fe)
preds_fe = np.expm1(log_preds_fe)

# Held-out targets, also moved back from the log1p scale
true_prices = np.expm1(y_test)

rmse_fe = np.sqrt(mean_squared_error(true_prices, preds_fe))
r2_fe = r2_score(true_prices, preds_fe)

print("Improved CatBoost RMSE:", rmse_fe)
print("Improved CatBoost R²:", r2_fe)
