#REGRESSION - A basic template to compare different Regression models

##Objective - Given no of beds,square lot size and other attributes, our best model predicts the price it can sell for.

##Compared Models
##MLR - Multiple Linear Regression
##PR - Polynomial Regression
##SVR - Support Vector Machines
##DT - Decision Trees
##RFR - Random Forest Regressor
##ANN - Artificial Neural Network
##XBG - XG Boost
##CB - Cat Boost


##Importing libraries

In [1]:
import numpy as np
import pandas as pd

##Loading pre-processed Dataset

In [2]:
dataset = pd.read_csv('/content/House_Price_Data.csv')
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values
y = y.reshape(len(y),1)

##Checking for null values

In [3]:
dataset.isnull().sum()

tx_price                                       0
beds                                           0
baths                                          0
sqft                                           0
lot_size                                       0
restaurants                                    0
groceries                                      0
cafes                                          0
shopping                                       0
arts_entertainment                             0
beauty_spas                                    0
active_life                                    0
median_age                                     0
married                                        0
college_grad                                   0
median_school                                  0
num_schools                                    0
two_and_two                                    0
old_properties                                 0
tax_and_insurance                              0
during_recession    

In [4]:
print(X)

[[1.000e+00 1.000e+00 5.840e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.000e+00 1.000e+00 6.120e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.000e+00 1.000e+00 6.150e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [5.000e+00 6.000e+00 7.064e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [5.000e+00 6.000e+00 7.500e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [5.000e+00 6.000e+00 7.515e+03 ... 0.000e+00 0.000e+00 1.000e+00]]


In [5]:
print(y)

[[295850]
 [216500]
 [279900]
 ...
 [600000]
 [759900]
 [735000]]


## Splitting dataset into Training and Testing Splits

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [7]:
print(X_train)

[[3.000e+00 2.000e+00 2.155e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [4.000e+00 3.000e+00 3.700e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 2.000e+00 1.304e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [4.000e+00 2.000e+00 2.472e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [2.000e+00 2.000e+00 1.132e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 [4.000e+00 2.000e+00 2.067e+03 ... 0.000e+00 0.000e+00 1.000e+00]]


In [8]:
print(X_test)

[[3.000e+00 2.000e+00 9.690e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [3.000e+00 2.000e+00 2.432e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 [2.000e+00 3.000e+00 1.942e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [3.000e+00 3.000e+00 1.649e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 3.000e+00 1.163e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 3.000e+00 1.525e+03 ... 1.000e+00 0.000e+00 1.000e+00]]


In [9]:
print(y_train)

[[699000]
 [434990]
 [295000]
 ...
 [375000]
 [290000]
 [630000]]


## Apply Scaling to our features to make all features between 0 and 1



In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


sc1 = StandardScaler()
y_train = sc1.fit_transform(y_train)
y_test = sc1.transform(y_test)

## Applying Dimensionality Reduction with Kernel PCA(To reduce Features/complexity)

In [12]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

#Models

##Multi Linear Regression

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# Create a LinearRegression model
MLR_regressor = LinearRegression()

# Define the parameter grid
param_grid = {
    'fit_intercept': [True, False]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(MLR_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)

In [14]:
best_model = grid_search.best_estimator_
# Use the best model to make predictions
y_pred = best_model.predict(X_test)

# y_pred now contains the predicted values for the test data


In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate the MSE (Mean Squared Error)
MLR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
MLR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
MLR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
MLR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", MLR_rmse)
print("Mean Absolute Error (MAE):", MLR_mae)
print("R-squared (R²):", MLR_r2)



Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.6956534160547754
R-squared (R²): 0.21215341668155274


##Polynomial Regression

In [16]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge  # You can choose Ridge or Lasso
from sklearn.pipeline import make_pipeline

# Create a Polynomial Regression model with Ridge regularization
poly_reg = PolynomialFeatures(degree=2)
poly_regressor = Ridge()

# Create a pipeline for Polynomial Regression with Ridge regularization
model = make_pipeline(poly_reg, poly_regressor)

# Define the parameter grid
param_grid = {
    'polynomialfeatures__degree': [2, 3, 4],  # Try different polynomial degrees
    'ridge__alpha': [0.01, 0.1, 1.0]  # Try different alpha values for Ridge
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5)  # cv is the number of cross-validation folds(divided data into 5 folds and every train and test split )

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_


Best Hyperparameters: {'polynomialfeatures__degree': 4, 'ridge__alpha': 0.1}


In [17]:
y_pred=best_model.predict(X_test)

In [18]:
# Calculate the MSE (Mean Squared Error)
PR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
PR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
PR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
PR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", PR_rmse)
print("Mean Absolute Error (MAE):", PR_mae)
print("R-squared (R²):", PR_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.6854861489483645
R-squared (R²): 0.2200551157615841


##Support vector machine

In [20]:
from sklearn.svm import SVR  # For Support Vector Regression, or you can use SVC for classification

# Create an SVM model
svm_regressor = SVR()  # For classification, you can use SVC()

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],          # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type: linear or radial basis function (RBF)
    'gamma': [0.1, 1, 10]        # Kernel coefficient for 'rbf'
}

# Create the GridSearchCV object
grid_search = GridSearchCV(svm_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Hyperparameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [21]:
y_pred = best_model.predict(X_test)

In [22]:
# Calculate the MSE (Mean Squared Error)
SVR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
SVR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
SVR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
SVR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", SVR_rmse)
print("Mean Absolute Error (MAE):", SVR_mae)
print("R-squared (R²):", SVR_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.680293465846848
R-squared (R²): 0.20697995309119166


##Decision Trees

In [23]:
from sklearn.ensemble import RandomForestRegressor

# Create a RandomForestRegressor model
rf_regressor = RandomForestRegressor()

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required for a leaf node
}

# Create the GridSearchCV object
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5)

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}


In [24]:
y_pred = best_model.predict(X_test)

In [25]:
# Calculate the MSE (Mean Squared Error)
DT_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
DT_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
DT_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
DT_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", DT_rmse)
print("Mean Absolute Error (MAE):", DT_mae)
print("R-squared (R²):", DT_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.6944320944658741
R-squared (R²): 0.1980132690218971


##Random Forest


In [26]:
from sklearn.ensemble import RandomForestRegressor  # For regression tasks, or use RandomForestClassifier for classification

# Create a Random Forest model
rf_regressor = RandomForestRegressor()  # For classification, use RandomForestClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required for a leaf node
}

# Create the GridSearchCV object
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [27]:
y_pred = best_model.predict(X_test)

In [28]:
# Calculate the MSE (Mean Squared Error)
RFR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
RFR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
RFR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
RFR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", RFR_rmse)
print("Mean Absolute Error (MAE):", RFR_mae)
print("R-squared (R²):", RFR_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.6933150765709142
R-squared (R²): 0.19821772266282323


##Artificial Neural Networks

In [29]:
import tensorflow as tf

In [30]:
ann = tf.keras.models.Sequential()

In [31]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu')) #used 6 nodes

In [32]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [33]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [34]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))# used sigmoid to get probabilities

In [35]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [36]:
ann.fit(X_train, y_train, batch_size = 2, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x782fd51737c0>

In [37]:
y_pred = ann.predict(X_test)



In [38]:
# Calculate the MSE (Mean Squared Error)
ANN_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
ANN_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
ANN_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
ANN_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", ANN_rmse)
print("Mean Absolute Error (MAE):", ANN_mae)
print("R-squared (R²):", ANN_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.7945174488597184
R-squared (R²): 0.05801194458836367


##XGBOOST

In [39]:
from xgboost import XGBRegressor  # For classification tasks, use XGBClassifier

# Create an XGBoost model
xgb_regressor = XGBRegressor()  # For classification, use XGBClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds (trees)
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'max_depth': [3, 4, 5],  # Maximum depth of each tree
    'min_child_weight': [1, 2, 3]  # Minimum sum of instance weight (hessian) needed in a child
}

# Create the GridSearchCV object
grid_search = GridSearchCV(xgb_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_


Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 300}


In [40]:
y_pred = best_model.predict(X_test)

In [41]:
# Calculate the MSE (Mean Squared Error)
XGB_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
XGB_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
XGB_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
XGB_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", XGB_rmse)
print("Mean Absolute Error (MAE):", XGB_mae)
print("R-squared (R²):", XGB_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.6852485150267849
R-squared (R²): 0.2255205044826225


##Catboost

In [42]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [43]:
from catboost import CatBoostRegressor

# Define your parameter grid for hyperparameter tuning
param_grid = {
    'iterations': [100, 200, 300],  # Number of boosting rounds (trees)
    'depth': [6, 8, 10],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'l2_leaf_reg': [3, 5, 7]  # L2 regularization coefficient
}

# Create a CatBoost regressor
catboost_regressor = CatBoostRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(catboost_regressor, param_grid, cv=5, verbose=1, n_jobs=-1)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator (best model)
best_model = grid_search.best_estimator_

# You can use best_model for predictions


Fitting 5 folds for each of 81 candidates, totalling 405 fits
0:	learn: 0.9803707	total: 48.5ms	remaining: 4.8s
1:	learn: 0.9627306	total: 50ms	remaining: 2.45s
2:	learn: 0.9506665	total: 51.2ms	remaining: 1.65s
3:	learn: 0.9369664	total: 53.4ms	remaining: 1.28s
4:	learn: 0.9264448	total: 55.7ms	remaining: 1.06s
5:	learn: 0.9172156	total: 58.1ms	remaining: 910ms
6:	learn: 0.9093378	total: 60.4ms	remaining: 802ms
7:	learn: 0.9024151	total: 62.9ms	remaining: 723ms
8:	learn: 0.8965233	total: 65.2ms	remaining: 659ms
9:	learn: 0.8920803	total: 67.4ms	remaining: 607ms
10:	learn: 0.8871354	total: 69.8ms	remaining: 565ms
11:	learn: 0.8836143	total: 73.5ms	remaining: 539ms
12:	learn: 0.8800440	total: 75ms	remaining: 502ms
13:	learn: 0.8760316	total: 76ms	remaining: 467ms
14:	learn: 0.8734150	total: 77.6ms	remaining: 440ms
15:	learn: 0.8701569	total: 79.2ms	remaining: 416ms
16:	learn: 0.8681823	total: 80.2ms	remaining: 391ms
17:	learn: 0.8661721	total: 81.2ms	remaining: 370ms
18:	learn: 0.864301

In [44]:
y_pred = best_model.predict(X_test)

# Calculate the MSE (Mean Squared Error)
CB_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
CB_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
CB_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
CB_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", CB_rmse)
print("Mean Absolute Error (MAE):", CB_mae)
print("R-squared (R²):", CB_r2)

Root Mean Squared Error (RMSE): 0.8706639653628561
Mean Absolute Error (MAE): 0.685967396736435
R-squared (R²): 0.20658800905226637


#Comparing the metrics

In [45]:

# Define your model names and their corresponding RMSE, MAE, and R² values
models = ["MLR", "PR", "SVR", "DT", "RFR", "ANN", "XGB", "CB"]
rmse_values = [MLR_rmse, PR_rmse, SVR_rmse, DT_rmse, RFR_rmse, ANN_rmse, XGB_rmse, CB_rmse]
mae_values = [MLR_mae, PR_mae, SVR_mae, DT_mae, RFR_mae, ANN_mae, XGB_mae, CB_mae]
r2_values = [MLR_r2, PR_r2, SVR_r2, DT_r2, RFR_r2, ANN_r2, XGB_r2, CB_r2]

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    "Model": models,
    "RMSE": rmse_values,
    "MAE": mae_values,
    "R²": r2_values
})

# Print the DataFrame
print(results_df)


  Model      RMSE       MAE        R²
0   MLR  0.870664  0.695653  0.212153
1    PR  0.870664  0.685486  0.220055
2   SVR  0.870664  0.680293  0.206980
3    DT  0.870664  0.694432  0.198013
4   RFR  0.870664  0.693315  0.198218
5   ANN  0.870664  0.794517  0.058012
6   XGB  0.870664  0.685249  0.225521
7    CB  0.870664  0.685967  0.206588


# So the best model is SVR for this dataset, based on low MAE, average R^2 and same RMSE values