In [18]:
# Importing Library

import pandas as pd
import numpy as np

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder

In [19]:
# Load Dataset
df = pd.read_csv("csv files/House_Price_train.csv")

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [21]:
# Target Transformation

df['log_price'] = np.log1p(df['SalePrice'])


In [22]:
# Prepare Feature and Target

y = df["log_price"]

drop_cols = [c for c in ["Id", "SalePrice", "log_price"] if c in df.columns]
X = df.drop(drop_cols, axis=1)


In [23]:
# Identify Columns type

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

In [24]:
# Handling Missing Values
# Separate features and target

# Numeric → median
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# Categorical → string (before encoding)
X[cat_cols] = X[cat_cols].fillna("None").astype(str)



In [25]:
# Encode Categorical Features

encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

X[cat_cols] = encoder.fit_transform(X[cat_cols])


In [26]:
#Train–Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [None]:
# Training the model

cat_model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3,
    loss_function="RMSE",
    random_seed=42,
    verbose=200
)

cat_model.fit(X_train, y_train)


In [28]:
# Testing the model

log_preds = cat_model.predict(X_test)
preds = np.expm1(log_preds)


In [29]:
# Evaluation of the model

true_prices = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(true_prices, preds))
r2 = r2_score(true_prices, preds)

print("CatBoost RMSE:", rmse)
print("CatBoost R²:", r2)

CatBoost RMSE: 29038.505840596914
CatBoost R²: 0.8900653246449589


# Tuning

In [30]:
X_fe = X.copy()

X_fe["TotalSF"] = (
    X_fe["TotalBsmtSF"] +
    X_fe["1stFlrSF"] +
    X_fe["2ndFlrSF"]
)

X_fe["Age"] = X_fe["YrSold"] - X_fe["YearBuilt"]
X_fe["RemodAge"] = X_fe["YrSold"] - X_fe["YearRemodAdd"]

X_fe["OverallQual_SF"] = X_fe["OverallQual"] * X_fe["GrLivArea"]


In [31]:
from sklearn.preprocessing import OrdinalEncoder

cat_cols = X_fe.select_dtypes(include="object").columns
num_cols = X_fe.select_dtypes(exclude="object").columns

# Handle missing values again (safe)
X_fe[num_cols] = X_fe[num_cols].fillna(X_fe[num_cols].median())
X_fe[cat_cols] = X_fe[cat_cols].fillna("None").astype(str)

encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

X_fe[cat_cols] = encoder.fit_transform(X_fe[cat_cols])


In [32]:
from sklearn.model_selection import train_test_split

X_train_fe, X_test_fe, y_train, y_test = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
from catboost import CatBoostRegressor

cat_model_fe = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.025,
    depth=8,
    l2_leaf_reg=4,
    bagging_temperature=0.5,
    random_strength=1.0,
    loss_function="RMSE",
    random_seed=42,
    verbose=200
)

cat_model_fe.fit(
    X_train_fe,
    y_train,
    eval_set=(X_test_fe, y_test),
    early_stopping_rounds=300
)


In [34]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

log_preds_fe = cat_model_fe.predict(X_test_fe)
preds_fe = np.expm1(log_preds_fe)

true_prices = np.expm1(y_test)

rmse_fe = np.sqrt(mean_squared_error(true_prices, preds_fe))
r2_fe = r2_score(true_prices, preds_fe)

print("Improved CatBoost RMSE:", rmse_fe)
print("Improved CatBoost R²:", r2_fe)


Improved CatBoost RMSE: 28897.426439551895
Improved CatBoost R²: 0.8911309333823565
