In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('./cleaned_df.zip', compression='zip', index_col=0)
df.head()

Unnamed: 0,location,bath,balcony,price,House_size,new_total_sqft
2,Uttarahalli,2.0,3.0,62.0,3.0,1440.0
3,Lingadheeranahalli,3.0,1.0,95.0,3.0,1521.0
4,Kothanur,2.0,1.0,51.0,2.0,1200.0
8,Marathahalli,3.0,1.0,63.25,3.0,1310.0
10,Whitefield,2.0,2.0,70.0,3.0,1800.0


### Feature Engineering

In [3]:
bins = [40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]  # bins
labels = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10', 'L11', 'L12', 'L13', 'L14', 'L15', 'L16'] 

# Create a new column for the price range
df['Price_Range'] = pd.cut(df['price'], bins=bins, labels=labels, right=False) # Use the price location and price range to create a new column

df = df.drop('location', axis=1)  # drop the location column

df = pd.get_dummies(df).astype(int) # cast the boolean to int(0 & 1)

df.rename(columns={col:col.split('_')[-1] for col in df.columns if 'Price_Range' in col}, inplace=True)  

In [4]:
df

Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,L1,L2,L3,L4,L5,...,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
2,2,3,62,3,1440,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,95,3,1521,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1,51,2,1200,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3,1,63,3,1310,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10,2,2,70,3,1800,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13310,2,2,52,2,1050,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13312,2,2,47,2,1262,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13313,2,1,57,3,1345,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13314,3,3,112,3,1715,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Splitting the data 

In [5]:
#choosing the features and the target
X = df.drop(columns=['price'])
y = df['price']

# splitting the dataset into test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training the Model

In [14]:
# defining the hyperparameters
param_grid = {
    'learning_rate': [0.05, 0.1, 1],
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [3, 4, 5],     # Maximum depth of the trees
    'max_features' : [3,4,5]
    }
# Instantiate Gradient Boosting Regressor
model = GradientBoostingRegressor()
 
# initializing the gridsearch
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# performing the grid search 
grid_search.fit(X_train, y_train)

# Getting the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Making predictions using the best estimator
y_pred = best_estimator.predict(X_test)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 5, 'n_estimators': 300}


### Evaluating the Model's Performance

In [15]:
# Mean Squared error
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# R-Squared
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)

Mean Squared Error: 46.45950361170799
RMSE: 6.8161208624633405
R-squared: 0.9434245762001077


### Conclusion

Based on the provided model performance metrics:

Mean Squared Error (MSE):
The MSE value of approximately 46.46 indicates the average squared difference between the actual and predicted values. It suggests that, on average, the squared error of the model's predictions is around 46.46.

Root Mean Squared Error (RMSE):
The RMSE value of approximately 6.82 suggests that, on average, the model's predictions are around 6.82 units away from the actual values. Since RMSE is in the same units as the target variable, this value provides a more interpretable measure of the average magnitude of error.

R-squared (R²):
The R² value of approximately 0.94 indicates that the model explains approximately 94.34% of the variance in the target variable. This high R² value suggests that the model is performing well in explaining the variability in the data.