##**Regressors**

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [2]:
from google.colab import userdata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modellingfrom lightgbm import LGBMClassifier
from sklearn import datasets
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,  GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [3]:
data_to_model = pd.read_csv('data_to_model2.csv')
target = data_to_model['price']
features = data_to_model.drop('price', axis=1)

### 1 - Preparing to feed the data to our model

## Steps:
- Correct any previous missed errors
- Dividing the data into training and testing datasets
- Encoding categorical data through frequency encoding, we had many options but due to concerns about increasing unnecessarily the numbers of columns and to keep things as simple as possible we chose this method
- Finaly test various Machine Learning models to find out which one is more appropriate for this problem and doing a little parameter optimization to see if we can "squeeze" more performance out of our chosen model.

In [4]:
features["hasLift"] = features["hasLift"].replace('False', 0).astype(int)
features["hasLift"].value_counts()

hasLift
0    9716
1    7407
Name: count, dtype: int64

Before encoding our categorical and numerical data we split the dataset into training and testing data and apply the transformations separately to avoid any possibility of data leakage

In [5]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

train_city_encoding = x_train['city'].value_counts() / len(x_train)
x_train['city'] = x_train['city'].map(train_city_encoding)
x_train

Unnamed: 0,propertyType,size,rooms,bathrooms,floor,hasLift,parkingSpace,city,house_status
1959,flat,95,2,1,0,0,0,0.500292,-1
16208,flat,75,2,2,4,0,1,0.500292,0
10072,chalet,102,3,2,0,0,0,0.202730,0
1740,flat,71,2,1,2,0,0,0.500292,0
1957,flat,122,2,2,2,1,1,0.500292,0
...,...,...,...,...,...,...,...,...,...
11284,flat,70,3,1,3,0,0,0.047452,0
11964,flat,125,2,2,2,0,0,0.047452,0
5390,flat,120,4,2,0,0,0,0.500292,0
860,chalet,340,5,6,0,0,0,0.084173,0


In [6]:
test_city_encoding = x_test['city'].value_counts() / len(x_test)
x_test['city'] = x_test['city'].map(test_city_encoding)
x_test

Unnamed: 0,propertyType,size,rooms,bathrooms,floor,hasLift,parkingSpace,city,house_status
6482,flat,140,3,2,0,1,1,0.070949,0
13821,flat,60,1,1,4,0,0,0.495766,0
2129,flat,58,1,1,2,0,0,0.495766,0
11655,flat,78,2,1,7,1,0,0.042044,0
13590,flat,76,3,1,2,0,0,0.495766,-1
...,...,...,...,...,...,...,...,...,...
9988,chalet,486,4,4,0,0,1,0.216350,0
16246,flat,72,2,1,3,0,0,0.495766,0
9252,flat,165,3,2,2,0,0,0.216350,0
1954,flat,67,3,2,2,0,0,0.495766,-1


In [7]:
train_prop_encoding = x_train['propertyType'].value_counts() / len(x_train)
x_train['propertyType'] = x_train['propertyType'].map(train_prop_encoding)
x_train

Unnamed: 0,propertyType,size,rooms,bathrooms,floor,hasLift,parkingSpace,city,house_status
1959,0.764929,95,2,1,0,0,0,0.500292,-1
16208,0.764929,75,2,2,4,0,1,0.500292,0
10072,0.180099,102,3,2,0,0,0,0.202730,0
1740,0.764929,71,2,1,2,0,0,0.500292,0
1957,0.764929,122,2,2,2,1,1,0.500292,0
...,...,...,...,...,...,...,...,...,...
11284,0.764929,70,3,1,3,0,0,0.047452,0
11964,0.764929,125,2,2,2,0,0,0.047452,0
5390,0.764929,120,4,2,0,0,0,0.500292,0
860,0.180099,340,5,6,0,0,0,0.084173,0


In [8]:
test_prop_encoding = x_test['propertyType'].value_counts() / len(x_test)
x_test['propertyType'] = x_test['propertyType'].map(test_prop_encoding)
x_test

Unnamed: 0,propertyType,size,rooms,bathrooms,floor,hasLift,parkingSpace,city,house_status
6482,0.755036,140,3,2,0,1,1,0.070949,0
13821,0.755036,60,1,1,4,0,0,0.495766,0
2129,0.755036,58,1,1,2,0,0,0.495766,0
11655,0.755036,78,2,1,7,1,0,0.042044,0
13590,0.755036,76,3,1,2,0,0,0.495766,-1
...,...,...,...,...,...,...,...,...,...
9988,0.191825,486,4,4,0,0,1,0.216350,0
16246,0.755036,72,2,1,3,0,0,0.495766,0
9252,0.755036,165,3,2,2,0,0,0.216350,0
1954,0.755036,67,3,2,2,0,0,0.495766,-1


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train[["size", "rooms", "bathrooms", "floor", "hasLift", "parkingSpace", "house_status"]] = scaler.fit_transform(x_train[["size", "rooms", "bathrooms", "floor", "hasLift", "parkingSpace", "house_status"]])
x_test[["size", "rooms", "bathrooms", "floor", "hasLift", "parkingSpace", "house_status"]] = scaler.transform(x_test[["size", "rooms", "bathrooms", "floor", "hasLift", "parkingSpace", "house_status"]])

In [10]:
x_train

Unnamed: 0,propertyType,size,rooms,bathrooms,floor,hasLift,parkingSpace,city,house_status
1959,0.764929,-0.241531,-0.506375,-0.936881,-0.922653,-0.872033,-0.732234,0.500292,-2.574575
16208,0.764929,-0.317338,-0.506375,-0.193605,0.830138,-0.872033,1.365683,0.500292,-0.243982
10072,0.180099,-0.214998,0.157770,-0.193605,-0.922653,-0.872033,-0.732234,0.202730,-0.243982
1740,0.764929,-0.332500,-0.506375,-0.936881,-0.046257,-0.872033,-0.732234,0.500292,-0.243982
1957,0.764929,-0.139190,-0.506375,-0.193605,-0.046257,1.146745,1.365683,0.500292,-0.243982
...,...,...,...,...,...,...,...,...,...
11284,0.764929,-0.336290,0.157770,-0.936881,0.391940,-0.872033,-0.732234,0.047452,-0.243982
11964,0.764929,-0.127819,-0.506375,-0.193605,-0.046257,-0.872033,-0.732234,0.047452,-0.243982
5390,0.764929,-0.146771,0.821914,-0.193605,-0.922653,-0.872033,-0.732234,0.500292,-0.243982
860,0.180099,0.687114,1.486059,2.779497,-0.922653,-0.872033,-0.732234,0.084173,-0.243982


To evaluate the model we use the four methods seen below: Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error and R squared. We will focus more on the last one due to it's easiness of interpretability, but there's obviously pro's and con's to each of those metrics.

In [11]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In our brief experiment, Random Forrest Regressor emerged as the best option with a R squared score close to 0.75 meaning that our model can explain 3/4 of the variance of the data. Not bad for a start.

In [12]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
mae_list = []
mse_list = []
rmse_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    model_list.append(list(models.keys())[i])
    mae_list.append(model_test_mae)
    mse_list.append(model_test_mse)
    rmse_list.append(model_test_rmse)
    r2_list.append(model_test_r2)

pd.DataFrame(list(zip(model_list, mae_list, mse_list, r2_list)), columns=['Model Name', 'MAE_Score', 'MSE_Score', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,MAE_Score,MSE_Score,R2_Score
5,Random Forest Regressor,213811.16222,241063000000.0,0.73633
7,CatBoosting Regressor,234526.704328,293637800000.0,0.678825
4,Decision Tree,248660.783142,335413200000.0,0.633132
3,K-Neighbors Regressor,272393.829606,372229000000.0,0.592864
8,AdaBoost Regressor,352360.963223,387195200000.0,0.576494
6,XGBRegressor,265846.583004,410789200000.0,0.550687
0,Linear Regression,356856.074929,494098400000.0,0.459565
1,Lasso,356855.268978,494098900000.0,0.459565
2,Ridge,356824.80902,494100500000.0,0.459563


Here we quickly tweak some parameters to see if we can improve our results.

In [13]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [14]:
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, refit=True, cv=5, scoring='f1', verbose=3)

In [None]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV 1/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   2.4s
[CV 2/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   2.3s
[CV 3/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   2.3s
[CV 4/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   2.3s
[CV 5/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   2.3s
[CV 1/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=nan total time=   4.5s
[CV 2/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=nan total time=   4.5

In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
print("Best parameters found: ", best_params)
print("Best score found: ", best_score)

In [None]:
tuned_rf = RandomForestRegressor(max_depth=5, max_leaf_nodes=20, n_estimators=50)
tuned_rf.fit(x_train, y_train)
tuned_y_test_pred = tuned_rf.predict(x_test)
model_test_mae , model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
print({"MAE": model_test_mae , "MSE": model_test_mse, "RMSE": model_test_rmse, "R2": model_test_r2})