In [4]:
import pandas as pd
import numpy as np

import re

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, PolynomialFeatures

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import joblib

## Loading Dataset

In [5]:
df = pd.read_csv("bengaluru_house_prices.csv")

## Data Inspection

In [6]:
df.head(3)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0


In [7]:
df.shape

(13320, 9)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [9]:
for col in df.columns:
    missing_rows = df[col].isnull().sum()
    missing_percent = (missing_rows/df.shape[0])*100
    print(f"{col} Missing : {missing_rows}")

area_type Missing : 0
availability Missing : 0
location Missing : 1
size Missing : 16
society Missing : 5502
total_sqft Missing : 0
bath Missing : 73
balcony Missing : 609
price Missing : 0


In [10]:
df.drop(columns=["society"], inplace=True)

## Feature and Target Selection

In [11]:
X = df.drop('price', axis=1)
y = df['price']

In [12]:
X.shape

(13320, 7)

In [13]:
X.shape

(13320, 7)

## Dataset Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

In [15]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(10656, 7) (10656,)
(2664, 7) (2664,)


In [16]:
y_train.isnull().sum()

0

## Handle Missing Values

In [17]:
print("X_train:")
for col in X_train.columns:
    missing_rows = X_train[col].isnull().sum()
    missing_percent = (missing_rows/X_train.shape[0])*100
    print(f" {col} Missing : {missing_rows}")

print("X_test:")
for col in X_test.columns:
    missing_rows = X_test[col].isnull().sum()
    missing_percent = (missing_rows/X_test.shape[0])*100
    print(f" {col} Missing : {missing_rows}")

X_train:
 area_type Missing : 0
 availability Missing : 0
 location Missing : 1
 size Missing : 14
 total_sqft Missing : 0
 bath Missing : 65
 balcony Missing : 499
X_test:
 area_type Missing : 0
 availability Missing : 0
 location Missing : 0
 size Missing : 2
 total_sqft Missing : 0
 bath Missing : 8
 balcony Missing : 110


### Functions Handling String

In [None]:
# Converting ready to move as 1 since others are planned

def simplify_availability(val):
    val = pd.Series(np.ravel(val))
    converted = val.apply(lambda val : 1 if isinstance(val, str) and ("Ready" in val.lower() or "Immediate" in val.lower()) else 0)
    return converted.values.reshape(-1, 1)

# Converting Bedroom to BHK in size

def convert_bhk(val):
    # Flatten to 1D array
    val = pd.Series(np.ravel(val))
    converted = val.apply(lambda val : val.replace("Bedroom", "BHK").replace("bedroom", "BHK") if isinstance(val, str) and "bedroom" in val.lower() else val)
    return converted.values.reshape(-1, 1)

In [40]:
def conv_sqft(val):
    # ✅ Handle both DataFrame and Series inputs
    if isinstance(val, pd.DataFrame):
        val = val.iloc[:, 0]  # take first column
    elif not isinstance(val, pd.Series):
        val = pd.Series(val)
    def conv_single(val):
        try:
            if isinstance(val, (int, float)):
                return float(val)
            elif isinstance(val, str):
                val = val.strip()

                # Range Case
                if " - " in val:
                    range = val.split(" - ")
                    if len(range) ==  2:
                        return ((float(range[0]) + float(range[1]))/2)
    
                # Unit Case
                # Normalize: add space between number and letters (handles "15000Guntha")
                v = re.sub(r"(?<=\d)([A-Za-z])", r" \1", val)
                val = v.lower()

                # Unit Case (case-insensitive)
                if re.search(r'[a-zA-Z]', v):
                    num = re.findall(r'^[\d\.]+', v)
                    if not num:
                        return np.nan
                    n = float(num[0])
                    
                    if "sq. meter" in val:
                        return round(n * 10.7639, 2)
                    elif "perch" in val:
                        return round(n * 272.25, 2)
                    elif "sq. yards" in val:
                        return round(n * 9, 2)
                    elif "acres" in val:
                        return round(n * 43560, 2)
                    elif "cents" in val:
                        return round(n * 435.6, 2)
                    elif "grounds" in val:
                        return round(n * 2400, 2)
                    elif "guntha" in val:
                        return round(n * 1089, 2)
                return float(val)
        except:
            return np.nan

    converted = val.apply(conv_single)
    return converted.values.reshape(-1,1)

### Pre-processing Pipeline

In [41]:
area_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

avail_transformer = Pipeline(steps=[
    ("converter", FunctionTransformer(simplify_availability)),
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

loc_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

size_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("converter", FunctionTransformer(convert_bhk)),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

sqft_transformer = Pipeline(steps=[
    ("converter", FunctionTransformer(conv_sqft)),
    ("scaler", StandardScaler())
])

num_col = ["bath", "balcony"]
num_col_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

In [42]:
preprocessor = ColumnTransformer(transformers=[
    ('area', area_transformer, ['area_type']),
    ('avail', avail_transformer, ['availability']),
    ('loc', loc_transformer, ['location']),
    ('size', size_transformer, ['size']),
    ('sqft', sqft_transformer, ['total_sqft']),
    ('num', num_col_transform, num_col)
])

In [31]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10656 entries, 3411 to 7270
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     10656 non-null  object 
 1   availability  10656 non-null  object 
 2   location      10655 non-null  object 
 3   size          10642 non-null  object 
 4   total_sqft    10656 non-null  object 
 5   bath          10591 non-null  float64
 6   balcony       10157 non-null  float64
dtypes: float64(2), object(5)
memory usage: 666.0+ KB


## Linear Regression Model Training

In [43]:
linear_regression_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [44]:
X_train_transformed = preprocessor.fit_transform(X_train)
print("Any NaN after preprocessing?", np.isnan(X_train_transformed).any())

Any NaN after preprocessing? False


In [45]:
linear_regression_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('area', ...), ('avail', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function sim...0020E206225C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function con...0020E206220C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function con...0020E2C153920>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Saving Trained Model

In [35]:
joblib.dump(linear_regression_pipeline, 'house_price_flaskapi/models/linear_regression.pkl')
print("✅ Model saved as linear_regression.pkl")

joblib.dump(preprocessor, "house_price_flaskapi/models/preprocessor.pkl")
joblib.dump(linear_regression_pipeline.named_steps['regressor'], "house_price_flaskapi/models/lr_model_only.pkl")

✅ Model saved as linear_regression.pkl


['house_price_flaskapi/models/lr_model_only.pkl']

## LR Model Evaluation

In [46]:
y_pred = linear_regression_pipeline.predict(X_test)

In [47]:
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr =  np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred)
mae_lr = mean_absolute_error(y_test, y_pred)

In [48]:
print(f"LR MSE  : {mse_lr:.2f}")
print(f"LR RMSE : {rmse_lr:.2f}")
print(f"LR R2   : {r2_lr:.2f}")
print(f"LR MAE  : {mae_lr:.2f}")

LR MSE  : 12543.08
LR RMSE : 112.00
LR R2   : 0.41
LR MAE  : 47.08


In [26]:
error_percent = (112 / y.mean()) * 100
print(error_percent, "% error approximately")

99.4975140110876 % error approximately


In [28]:
print("Intercept:", linear_regression_pipeline[1].intercept_)
print("Coefficients:", linear_regression_pipeline[1].coef_)

Intercept: 20.21845234823597
Coefficients: [-11.82336768 -11.17984891  31.94182261 ...   3.41763339  51.12893716
   3.59714602]


## Polynomial Regression Model

In [34]:
area_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

avail_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("converter", FunctionTransformer(simplify_availability))
])

loc_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

size_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("converter", FunctionTransformer(convert_bhk)),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

sqft_transformer = Pipeline(steps=[
    ("converter", FunctionTransformer(conv_sqft)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler())
])

num_col = ["bath", "balcony"]
num_col_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler())
])

poly_preprocessor = ColumnTransformer(transformers=[
    ('area', area_transformer, ['area_type']),
    ('avail', avail_transformer, ['availability']),
    ('loc', loc_transformer, ['location']),
    ('size', size_transformer, ['size']),
    ('sqft', sqft_transformer, ['total_sqft']),
    ('num', num_col_transform, num_col)
])

In [35]:
polynomial_regression_pipeline = Pipeline(steps=[
    ("preprocessor", poly_preprocessor),
    ("regressor", LinearRegression())
])

In [36]:
polynomial_regression_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('area', ...), ('avail', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function sim...002941D4E63E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function con...002941D4E6840>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function con...002941D4E6480>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,2
,interaction_only,True
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,2
,interaction_only,True
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [37]:
joblib.dump(polynomial_regression_pipeline, 'polynomial_regression.pkl')
print("✅ Model saved as polynomial_regression.pkl")

joblib.dump(poly_preprocessor, "house_price_api/models/poly_preprocessor.pkl")
joblib.dump(polynomial_regression_pipeline.named_steps['regressor'], "house_price_api/models/poly_model_only.pkl")

✅ Model saved as polynomial_regression.pkl


['house_price_api/models/poly_model_only.pkl']

In [31]:
y_pred_poly = polynomial_regression_pipeline.predict(X_test)

In [32]:
mse_pr = mean_squared_error(y_test, y_pred_poly)
rmse_pr =  np.sqrt(mse_pr)
r2_pr = r2_score(y_test, y_pred_poly)
mae_pr = mean_absolute_error(y_test, y_pred_poly)

In [33]:
print(f"Polynomial-LR MSE  : {mse_pr:.2f}")
print(f"Polynomial-LR RMSE : {rmse_pr:.2f}")
print(f"Polynomial-LR R2   : {r2_pr:.2f}")
print(f"Polynomial-LR MAE  : {mae_pr:.2f}")

Polynomial-LR MSE  : 12323.30
Polynomial-LR RMSE : 111.01
Polynomial-LR R2   : 0.42
Polynomial-LR MAE  : 46.82


In [36]:
error_percent = (rmse_pr / y.mean()) * 100
print(error_percent, "% error approximately")

98.61833596337661 % error approximately


In [37]:
print("Intercept:", polynomial_regression_pipeline[1].intercept_)
print("Coefficients:", polynomial_regression_pipeline[1].coef_)

Intercept: 28.92387137260104
Coefficients: [-11.59540178 -10.8051933   30.83271002 ...  37.92650609 -10.48524024
  21.57163693]


## XGBoost regressor

In [60]:
xgboost_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=-1))
])

In [61]:
# Hyperparameter Tuning

param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.05, 0.1]
}

In [62]:
cv = KFold(n_splits=5, shuffle=True)

grid_search = GridSearchCV(
    estimator=xgboost_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

In [64]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.05, 0.1], 'model__max_depth': [3, 5, ...], 'model__n_estimators': [100, 200]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('area', ...), ('avail', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function sim...0026D302868E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function con...0026D30287BA0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function con...0026D30287A60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [65]:
print("Best params:", grid_search.best_params_)
print("Best R²:", grid_search.best_score_)

Best params: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 200}
Best R²: 0.597373924799699


In [69]:
joblib.dump(grid_search, 'XGBoost.pkl')
print("✅ Model saved as XGBoost.pkl")

✅ Model saved as XGBoost.pkl


In [70]:
y_pred_xgb = grid_search.predict(X_test)

In [71]:
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb =  np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

In [72]:
print(f"XGB MSE  : {mse_xgb:.2f}")
print(f"XGB RMSE : {rmse_xgb:.2f}")
print(f"XGB R2   : {r2_xgb:.2f}")
print(f"XGB MAE  : {mae_xgb:.2f}")

XGB MSE  : 7049.41
XGB RMSE : 83.96
XGB R2   : 0.67
XGB MAE  : 34.45


## Random Forest

In [74]:
random_forest_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [75]:
rf_param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
}

In [76]:
rf_grid_search = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=rf_param_grid,
    cv=cv,
    scoring='r2',
    n_jobs=-1,
    verbose=2)

In [77]:
rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_split': [2, 5], 'model__n_estimators': [100, 200, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('area', ...), ('avail', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function sim...0026D302868E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function con...0026D30287BA0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function con...0026D30287A60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [78]:
print("Best params:", rf_grid_search.best_params_)
print("Best R²:", rf_grid_search.best_score_)

Best params: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 300}
Best R²: 0.5791274591463644


In [79]:
joblib.dump(rf_grid_search, 'RandomForest.pkl')
print("✅ Model saved as RandomForest.pkl")

✅ Model saved as RandomForest.pkl


In [80]:
y_pred_rf = rf_grid_search.predict(X_test)

In [81]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf =  np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

In [82]:
print(f"RF MSE  : {mse_rf:.2f}")
print(f"RF RMSE : {rmse_rf:.2f}")
print(f"RF R2   : {r2_rf:.2f}")
print(f"RF MAE  : {mae_rf:.2f}")

RF MSE  : 7233.91
RF RMSE : 85.05
RF R2   : 0.66
RF MAE  : 29.91
