In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('../data/gold-houseprices.csv')
len(df.columns)

69

In [3]:
df.head()

Unnamed: 0,property_id,title,price,price_per_sqft,bhk_count,super_area,floor,bathroom_count,property_type,latitude,...,locality_Phase III,locality_Pragathi Nagar,locality_Prakash Nagar,locality_Prakruthi Nagar,locality_Royal Enclave,locality_Saint Thomas Town,locality_Shivananada Layout,locality_Veer Sandra,locality_Vinayaka Nagar,locality_Yeshwanthpur Industrial Area
0,1,"8 BHK Flat for Sale in Rajajinagar, Bangalore",448300000.0,34936.0,8.0,8983.0,2.0,9.0,apartment,13.009859,...,False,False,False,False,False,False,False,False,False,False
1,2,"2 BHK Flat for Sale in Rajajinagar, Bangalore",50000000.0,27778.0,2.0,1500.0,16.0,3.0,apartment,12.99488,...,False,False,True,False,False,False,False,False,False,False
2,3,"2 BHK Flat for Sale in Sarjapur Road, Bangalore",18400000.0,12813.0,2.0,1436.0,1.0,2.0,apartment,12.899052,...,False,False,False,False,False,False,False,False,False,False
3,4,"2 BHK Flat for Sale in Kanakapura Road, Banga...",17100000.0,11732.0,2.0,1460.0,1.0,2.0,apartment,12.898554,...,False,False,False,False,False,False,False,False,False,False
4,5,"2 BHK Flat for Sale in Kanakapura Road, Banga...",21000000.0,16484.0,2.0,1274.0,21.0,2.0,apartment,12.746104,...,False,False,False,False,False,False,False,False,False,False


In [4]:
!pip install catboost



In [5]:
df.columns[df.isnull().sum() > 0]

Index([], dtype='object')

In [6]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

target_col = "price"
X = df.drop(columns=[target_col]).copy()
y = df[target_col]

bool_cols = X.select_dtypes(include="bool").columns
if len(bool_cols):
    X[bool_cols] = X[bool_cols].astype(np.int8)

cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_feature_indices = [X.columns.get_loc(col) for col in cat_cols]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = CatBoostRegressor(
    loss_function="RMSE",
    iterations=600,
    depth=15,
    learning_rate=0.03,
    random_seed=42,
    verbose=0
)

model.fit(X_train, y_train, cat_features=cat_feature_indices)

y_pred = model.predict(X_valid)
print(f"Validation MAE: {mean_absolute_error(y_valid, y_pred):.2f}")
print(f"Validation R²: {r2_score(y_valid, y_pred):.3f}")

Validation MAE: 10191014.06
Validation R²: -3.693


In [7]:
from sklearn.ensemble import RandomForestRegressor

random_forest_xtrain = X_train.select_dtypes(include=["int8", "int16", "int32", "int64", "float16", "float32", "float64"])
random_forest_xvalid = X_valid.select_dtypes(include=["int8", "int16", "int32", "int64", "float16", "float32", "float64"])
rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(random_forest_xtrain, y_train)

rf_pred = rf_model.predict(random_forest_xvalid)
print(f"RandomForest MAE: {mean_absolute_error(y_valid, rf_pred):.2f}")
print(f"RandomForest R²: {r2_score(y_valid, rf_pred):.3f}")

RandomForest MAE: 4347570.56
RandomForest R²: 0.747


In [8]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_log_error
import numpy as np

mape = mean_absolute_percentage_error(y_valid, rf_pred)
rmsle = np.sqrt(mean_squared_log_error(y_valid, rf_pred))
print("MAPE:", mape)
print("RMSLE:", rmsle)

MAPE: 0.13016695592451685
RMSLE: 0.20880825116849944


In [10]:
import joblib
joblib.dump(rf_model, '../models/random_forest_model.pkl')

['../models/random_forest_model.pkl']

In [11]:
from sklearn.metrics import mean_squared_error, median_absolute_error, explained_variance_score

rf_rmse = mean_squared_error(y_valid, rf_pred, squared=False)
cb_rmse = mean_squared_error(y_valid, y_pred, squared=False)

rf_med_ae = median_absolute_error(y_valid, rf_pred)
cb_med_ae = median_absolute_error(y_valid, y_pred)

rf_explained_var = explained_variance_score(y_valid, rf_pred)
cb_explained_var = explained_variance_score(y_valid, y_pred)

evaluation_df = pd.DataFrame(
    {
        "Model": ["RandomForest", "CatBoost"],
        "RMSE": [rf_rmse, cb_rmse],
        "MedianAE": [rf_med_ae, cb_med_ae],
        "ExplainedVariance": [rf_explained_var, cb_explained_var],
    }
).set_index("Model")

residuals = pd.Series(y_valid.values - rf_pred, index=y_valid.index, name="rf_residual")
residual_summary = residuals.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])

comparison_df = pd.concat(
    [
        y_valid.rename("actual"),
        pd.Series(rf_pred, index=y_valid.index, name="rf_pred"),
        pd.Series(y_pred, index=y_valid.index, name="cat_pred"),
    ],
    axis=1,
)

feature_importance = (
    pd.DataFrame(
        {
            "feature": random_forest_xtrain.columns,
            "importance": rf_model.feature_importances_,
        }
    )
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

display(evaluation_df)
display(residual_summary)
display(comparison_df.head(10))
display(feature_importance.head(15))

Unnamed: 0_level_0,RMSE,MedianAE,ExplainedVariance
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RandomForest,17644550.0,1287800.0,0.747363
CatBoost,76049630.0,3662588.0,-3.692654


count    7.690000e+02
mean    -4.164109e+04
std      1.765599e+07
min     -1.366320e+08
5%      -7.621280e+06
25%     -1.663000e+06
50%     -3.710000e+05
75%      8.388000e+05
95%      8.421320e+06
max      3.945832e+08
Name: rf_residual, dtype: float64

Unnamed: 0,actual,rf_pred,cat_pred
1432,37300000.0,37198200.0,26628100.0
3660,11800000.0,12099480.0,16473320.0
1611,38000000.0,33083800.0,42159460.0
2364,5380000.0,6249820.0,12146680.0
2928,11400000.0,11290180.0,12690460.0
3781,51300000.0,52830800.0,36309070.0
761,14300000.0,16950200.0,31486380.0
2402,26500000.0,26702400.0,28246380.0
1606,26700000.0,25533600.0,33329470.0
965,8500000.0,8892180.0,13134560.0


Unnamed: 0,feature,importance
0,price_per_sqft,0.771232
1,super_area,0.150931
2,bhk_count,0.024212
3,dist_to_centre_km,0.01379
4,floor,0.008633
5,bathroom_count,0.007924
6,locality_frequency,0.003309
7,property_id,0.003084
8,accessibility_score,0.003054
9,longitude,0.00301
