In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv("preprocessed_house_rent.csv")

In [3]:
dataset

Unnamed: 0,BHK,Rent,Size,Area Type,Furnishing Status,Tenant Preferred,Bathroom,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Floor Number,Total Floors
0,2,10000,1100,2,2,1,2,0,0,0,1,0,0,2
1,2,20000,800,2,1,1,1,0,0,0,1,0,1,3
2,2,17000,1000,2,1,1,1,0,0,0,1,0,1,3
3,2,10000,800,2,2,1,1,0,0,0,1,0,1,2
4,2,7500,850,1,2,0,1,0,0,0,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,1,1,1,2,0,0,1,0,0,3,5
4742,3,29000,2000,2,1,1,3,0,0,1,0,0,1,4
4743,3,35000,1750,1,1,1,3,0,0,1,0,0,3,5
4744,3,45000,1500,1,1,2,2,0,0,1,0,0,23,34


In [4]:
print(dataset.isnull().sum())

BHK                  0
Rent                 0
Size                 0
Area Type            0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
City_Chennai         0
City_Delhi           0
City_Hyderabad       0
City_Kolkata         0
City_Mumbai          0
Floor Number         0
Total Floors         0
dtype: int64


In [5]:
independent=dataset[[ 'BHK','Size', 'Area Type', 'Furnishing Status',
       'Tenant Preferred', 'Bathroom', 'City_Chennai', 'City_Delhi',
       'City_Hyderabad', 'City_Kolkata', 'City_Mumbai', 'Floor Number',
       'Total Floors']]

In [6]:
independent

Unnamed: 0,BHK,Size,Area Type,Furnishing Status,Tenant Preferred,Bathroom,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Floor Number,Total Floors
0,2,1100,2,2,1,2,0,0,0,1,0,0,2
1,2,800,2,1,1,1,0,0,0,1,0,1,3
2,2,1000,2,1,1,1,0,0,0,1,0,1,3
3,2,800,2,2,1,1,0,0,0,1,0,1,2
4,2,850,1,2,0,1,0,0,0,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,1000,1,1,1,2,0,0,1,0,0,3,5
4742,3,2000,2,1,1,3,0,0,1,0,0,1,4
4743,3,1750,1,1,1,3,0,0,1,0,0,3,5
4744,3,1500,1,1,2,2,0,0,1,0,0,23,34


In [7]:
dependent=dataset[['Rent']]

In [8]:
dependent

Unnamed: 0,Rent
0,10000
1,20000
2,17000
3,10000
4,7500
...,...
4741,15000
4742,29000
4743,35000
4744,45000


In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
independent=sc.fit_transform(independent)

In [10]:
#Initialize the XGBoost regressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)


In [11]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100]
}


In [12]:
#Set up the grid search
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

In [13]:
# Fit the grid search to the data
grid_search.fit(independent,dependent)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [14]:
# Retrieve the best parameters and evaluate the model
print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found:  {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 1.0}


In [16]:
#Make predictions
y_pred = best_model.predict(independent)


In [18]:
#Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
rmse = mean_squared_error(dependent, y_pred, squared=False)
r2 = r2_score(dependent, y_pred)
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

Root Mean Squared Error: 42717.97
R-squared: 0.70
