#REGRESSION - A basic template to compare different Regression models

##AIM - Given no of beds,square lot size and other attributes, our model predicts the price it can sell for.

##Compared Models
##MLR - Multiple Linear Regression
##PR - Polynomial Regression
##SVR - Support Vector Machines
##DT - Decision Trees
##RFR - Random Forest Regressor
##ANN - Artificial Neural Network
##XBG - XG Boost
##CB - Cat Boost


##Importing libraries

In [None]:
import numpy as np
import pandas as pd

##Loading pre-processed Dataset

In [None]:
dataset = pd.read_csv('/content/analytical_base_table_real_estate.csv')
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values
y = y.reshape(len(y),1)

##Checking for null values

In [None]:
dataset.isnull().sum()

tx_price                                       0
beds                                           0
baths                                          0
sqft                                           0
lot_size                                       0
restaurants                                    0
groceries                                      0
cafes                                          0
shopping                                       0
arts_entertainment                             0
beauty_spas                                    0
active_life                                    0
median_age                                     0
married                                        0
college_grad                                   0
median_school                                  0
num_schools                                    0
two_and_two                                    0
old_properties                                 0
tax_and_insurance                              0
during_recession    

In [None]:
print(X)

[[1.000e+00 1.000e+00 5.840e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.000e+00 1.000e+00 6.120e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.000e+00 1.000e+00 6.150e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [5.000e+00 6.000e+00 7.064e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [5.000e+00 6.000e+00 7.500e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [5.000e+00 6.000e+00 7.515e+03 ... 0.000e+00 0.000e+00 1.000e+00]]


In [None]:
print(y)

[[295850]
 [216500]
 [279900]
 ...
 [600000]
 [759900]
 [735000]]


## Splitting dataset into Training and Testing Splits

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [None]:
print(X_train)

[[1.000e+00 1.000e+00 7.810e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [5.000e+00 3.000e+00 4.081e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 2.000e+00 1.360e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [5.000e+00 3.000e+00 3.384e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 2.000e+00 1.682e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [3.000e+00 2.000e+00 2.766e+03 ... 0.000e+00 1.000e+00 0.000e+00]]


In [None]:
print(X_test)

[[4.000e+00 1.000e+00 1.365e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [2.000e+00 2.000e+00 1.170e+03 ... 0.000e+00 1.000e+00 0.000e+00]
 [4.000e+00 3.000e+00 3.513e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 ...
 [3.000e+00 2.000e+00 1.404e+03 ... 0.000e+00 0.000e+00 1.000e+00]
 [1.000e+00 1.000e+00 7.030e+02 ... 0.000e+00 1.000e+00 0.000e+00]
 [4.000e+00 2.000e+00 2.394e+03 ... 0.000e+00 0.000e+00 1.000e+00]]


In [None]:
print(y_train)

[[345000]
 [392500]
 [250000]
 ...
 [345000]
 [485000]
 [392235]]


In [None]:
print(y_test)

[[325000]
 [245000]
 [483820]
 [560000]
 [450000]
 [678000]
 [469500]
 [363700]
 [614000]
 [230000]
 [471200]
 [550000]
 [433000]
 [330000]
 [560000]
 [353140]
 [385000]
 [465000]
 [520000]
 [450000]
 [647500]
 [585000]
 [630000]
 [299000]
 [785000]
 [525000]
 [255000]
 [350000]
 [420000]
 [538430]
 [279000]
 [507500]
 [585000]
 [240000]
 [642500]
 [285000]
 [310100]
 [305000]
 [715000]
 [230000]
 [425000]
 [347644]
 [462330]
 [226000]
 [653931]
 [214900]
 [475000]
 [335000]
 [422500]
 [290000]
 [485000]
 [508000]
 [218250]
 [275000]
 [650000]
 [332815]
 [459900]
 [402500]
 [279000]
 [710000]
 [395000]
 [300000]
 [342500]
 [649000]
 [306484]
 [235000]
 [347000]
 [415000]
 [375000]
 [486990]
 [269000]
 [239000]
 [560000]
 [640000]
 [421770]
 [299000]
 [355000]
 [257000]
 [340000]
 [345000]
 [778500]
 [279000]
 [305000]
 [750000]
 [510000]
 [327500]
 [495000]
 [374900]
 [215000]
 [296000]
 [313500]
 [559900]
 [595000]
 [701900]
 [377400]
 [304000]
 [390000]
 [384406]
 [489000]
 [455900]


## Apply Scaling to our features

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


sc1 = StandardScaler()
y_train = sc1.fit_transform(y_train)
y_test = sc1.transform(y_test)

## Applying Dimensionality Reduction with Kernel PCA(Feature Reduction)

In [None]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

#Models

##Multi Linear Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# Create a LinearRegression model
MLR_regressor = LinearRegression()

# Define the parameter grid
param_grid = {
    'fit_intercept': [True, False]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(MLR_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)

In [None]:
best_model = grid_search.best_estimator_
# Use the best model to make predictions
y_pred = best_model.predict(X_test)

# y_pred now contains the predicted values for the test data


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate the MSE (Mean Squared Error)
MLR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
MLR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
MLR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
MLR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", MLR_rmse)
print("Mean Absolute Error (MAE):", MLR_mae)
print("R-squared (R²):", MLR_r2)



Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.7360220386880357
R-squared (R²): 0.18890075883045587


##Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge  # You can choose Ridge or Lasso
from sklearn.pipeline import make_pipeline

# Create a Polynomial Regression model with Ridge regularization
poly_reg = PolynomialFeatures(degree=2)
poly_regressor = Ridge()

# Create a pipeline for Polynomial Regression with Ridge regularization
model = make_pipeline(poly_reg, poly_regressor)

# Define the parameter grid
param_grid = {
    'polynomialfeatures__degree': [2, 3, 4],  # Try different polynomial degrees
    'ridge__alpha': [0.01, 0.1, 1.0]  # Try different alpha values for Ridge
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

# You can use best_model for predictions


Best Hyperparameters: {'polynomialfeatures__degree': 4, 'ridge__alpha': 0.01}


In [None]:
y_pred=best_model.predict(X_test)

In [None]:
# Calculate the MSE (Mean Squared Error)
PR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
PR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
PR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
PR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", PR_rmse)
print("Mean Absolute Error (MAE):", PR_mae)
print("R-squared (R²):", PR_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.707641625475943
R-squared (R²): 0.22182494187124424


In [None]:

np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))


[[-3.62e-01 -6.55e-01]
 [-2.03e-01 -1.18e+00]
 [ 3.31e-01  3.90e-01]
 [ 1.73e-01  8.91e-01]
 [ 5.94e-02  1.68e-01]
 [ 5.45e-01  1.67e+00]
 [ 3.99e-03  2.96e-01]
 [-2.13e-01 -4.00e-01]
 [ 4.29e-01  1.25e+00]
 [-6.38e-01 -1.28e+00]
 [ 1.05e+00  3.07e-01]
 [ 8.89e-01  8.26e-01]
 [ 4.72e-01  5.57e-02]
 [-7.08e-01 -6.22e-01]
 [ 3.59e-01  8.91e-01]
 [-2.90e-01 -4.70e-01]
 [-4.12e-01 -2.60e-01]
 [ 5.72e-01  2.66e-01]
 [ 1.98e-01  6.28e-01]
 [-3.99e-01  1.68e-01]
 [-7.68e-02  1.47e+00]
 [-7.11e-02  1.06e+00]
 [-1.16e-01  1.35e+00]
 [ 1.44e-03 -8.26e-01]
 [ 7.76e-01  2.37e+00]
 [-2.28e-01  6.61e-01]
 [-3.86e-01 -1.12e+00]
 [-5.34e-01 -4.90e-01]
 [-2.25e-02 -2.99e-02]
 [-1.53e-01  7.49e-01]
 [ 6.59e-01 -9.58e-01]
 [ 1.11e+00  5.46e-01]
 [ 5.57e-01  1.06e+00]
 [-9.18e-02 -1.21e+00]
 [ 5.72e-01  1.43e+00]
 [-9.08e-02 -9.18e-01]
 [-1.38e-01 -7.53e-01]
 [-5.48e-01 -7.87e-01]
 [ 9.05e-01  1.91e+00]
 [-2.72e-01 -1.28e+00]
 [-5.98e-01  3.04e-03]
 [-2.72e-01 -5.06e-01]
 [-5.96e-01  2.49e-01]
 [-8.91e-01

##Support vector machine

In [None]:
from sklearn.svm import SVR  # For Support Vector Regression, or you can use SVC for classification

# Create an SVM model
svm_regressor = SVR()  # For classification, you can use SVC()

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],          # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type: linear or radial basis function (RBF)
    'gamma': [0.1, 1, 10]        # Kernel coefficient for 'rbf'
}

# Create the GridSearchCV object
grid_search = GridSearchCV(svm_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

# You can use best_model for predictions


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Hyperparameters: {'C': 1, 'gamma': 10, 'kernel': 'rbf'}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
# Calculate the MSE (Mean Squared Error)
SVR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
SVR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
SVR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
SVR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", SVR_rmse)
print("Mean Absolute Error (MAE):", SVR_mae)
print("R-squared (R²):", SVR_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.6980711850119169
R-squared (R²): 0.22514981173301263


##Decision Trees

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create a RandomForestRegressor model
rf_regressor = RandomForestRegressor()

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required for a leaf node
}

# Create the GridSearchCV object
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5)

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

# You can use best_model for predictions



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
# Calculate the MSE (Mean Squared Error)
DT_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
DT_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
DT_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
DT_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", DT_rmse)
print("Mean Absolute Error (MAE):", DT_mae)
print("R-squared (R²):", DT_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.7099355911485635
R-squared (R²): 0.19348769656098985


##Random Forest


In [None]:
from sklearn.ensemble import RandomForestRegressor  # For regression tasks, or use RandomForestClassifier for classification

# Create a Random Forest model
rf_regressor = RandomForestRegressor()  # For classification, use RandomForestClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required for a leaf node
}

# Create the GridSearchCV object
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

# You can use best_model for predictions


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


  self.best_estimator_.fit(X, y, **fit_params)


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
# Calculate the MSE (Mean Squared Error)
RFR_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
RFR_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
RFR_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
RFR_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", RFR_rmse)
print("Mean Absolute Error (MAE):", RFR_mae)
print("R-squared (R²):", RFR_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.7092531207901281
R-squared (R²): 0.1912195618702065


##Artificial Neural Networks

In [None]:
import tensorflow as tf

In [None]:
ann = tf.keras.models.Sequential()

In [None]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
ann.fit(X_train, y_train, batch_size = 2, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7ccbbe361870>

In [None]:
y_pred = ann.predict(X_test)



In [None]:
# Calculate the MSE (Mean Squared Error)
ANN_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
ANN_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
ANN_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
ANN_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", ANN_rmse)
print("Mean Absolute Error (MAE):", ANN_mae)
print("R-squared (R²):", ANN_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.8112536008103742
R-squared (R²): 0.03957189442208642


##XGBOOST

In [None]:
from xgboost import XGBRegressor  # For classification tasks, use XGBClassifier

# Create an XGBoost model
xgb_regressor = XGBRegressor()  # For classification, use XGBClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds (trees)
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'max_depth': [3, 4, 5],  # Maximum depth of each tree
    'min_child_weight': [1, 2, 3]  # Minimum sum of instance weight (hessian) needed in a child
}

# Create the GridSearchCV object
grid_search = GridSearchCV(xgb_regressor, param_grid, cv=5)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator
best_model = grid_search.best_estimator_

# You can use best_model for predictions


Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 300}


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
# Calculate the MSE (Mean Squared Error)
XGB_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
XGB_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
XGB_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
XGB_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", XGB_rmse)
print("Mean Absolute Error (MAE):", XGB_mae)
print("R-squared (R²):", XGB_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.7027252428395259
R-squared (R²): 0.22885117986944892


##Catboost

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
from catboost import CatBoostRegressor

# Define your parameter grid for hyperparameter tuning
param_grid = {
    'iterations': [100, 200, 300],  # Number of boosting rounds (trees)
    'depth': [6, 8, 10],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'l2_leaf_reg': [3, 5, 7]  # L2 regularization coefficient
}

# Create a CatBoost regressor
catboost_regressor = CatBoostRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(catboost_regressor, param_grid, cv=5, verbose=1, n_jobs=-1)  # cv is the number of cross-validation folds

# Fit the model to the data
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator (best model)
best_model = grid_search.best_estimator_

# You can use best_model for predictions


Fitting 5 folds for each of 81 candidates, totalling 405 fits
0:	learn: 0.9979691	total: 52.1ms	remaining: 15.6s
1:	learn: 0.9959153	total: 56ms	remaining: 8.35s
2:	learn: 0.9938255	total: 60ms	remaining: 5.94s
3:	learn: 0.9919062	total: 63.9ms	remaining: 4.73s
4:	learn: 0.9898859	total: 67.9ms	remaining: 4s
5:	learn: 0.9878997	total: 71.9ms	remaining: 3.52s
6:	learn: 0.9859993	total: 75.8ms	remaining: 3.17s
7:	learn: 0.9840646	total: 79.6ms	remaining: 2.91s
8:	learn: 0.9822568	total: 83.2ms	remaining: 2.69s
9:	learn: 0.9805155	total: 87.2ms	remaining: 2.53s
10:	learn: 0.9784763	total: 91.1ms	remaining: 2.39s
11:	learn: 0.9768311	total: 92.5ms	remaining: 2.22s
12:	learn: 0.9752325	total: 96.4ms	remaining: 2.13s
13:	learn: 0.9735927	total: 96.9ms	remaining: 1.98s
14:	learn: 0.9718096	total: 101ms	remaining: 1.92s
15:	learn: 0.9701214	total: 105ms	remaining: 1.86s
16:	learn: 0.9685235	total: 109ms	remaining: 1.81s
17:	learn: 0.9669230	total: 113ms	remaining: 1.77s
18:	learn: 0.9652886	to

In [None]:
y_pred = best_model.predict(X_test)

# Calculate the MSE (Mean Squared Error)
CB_mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE (Root Mean Squared Error)
CB_rmse = np.sqrt(MLR_mse)

# Calculate the MAE (Mean Absolute Error)
CB_mae = mean_absolute_error(y_test, y_pred)

# Calculate the R-squared (R²) score
CB_r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Root Mean Squared Error (RMSE):", CB_rmse)
print("Mean Absolute Error (MAE):", CB_mae)
print("R-squared (R²):", CB_r2)

Root Mean Squared Error (RMSE): 0.8940075912139515
Mean Absolute Error (MAE): 0.7057090659083248
R-squared (R²): 0.22480097988223413


#Comparing the metrics

In [None]:

# Define your model names and their corresponding RMSE, MAE, and R² values
models = ["MLR", "PR", "SVR", "DT", "RFR", "ANN", "XGB", "CB"]
rmse_values = [MLR_rmse, PR_rmse, SVR_rmse, DT_rmse, RFR_rmse, ANN_rmse, XGB_rmse, CB_rmse]
mae_values = [MLR_mae, PR_mae, SVR_mae, DT_mae, RFR_mae, ANN_mae, XGB_mae, CB_mae]
r2_values = [MLR_r2, PR_r2, SVR_r2, DT_r2, RFR_r2, ANN_r2, XGB_r2, CB_r2]

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    "Model": models,
    "RMSE": rmse_values,
    "MAE": mae_values,
    "R²": r2_values
})

# Print the DataFrame
print(results_df)


  Model      RMSE       MAE        R²
0   MLR  0.894008  0.736022  0.188901
1    PR  0.894008  0.707642  0.221825
2   SVR  0.894008  0.698071  0.225150
3    DT  0.894008  0.709936  0.193488
4   RFR  0.894008  0.709253  0.191220
5   ANN  0.894008  0.811254  0.039572
6   XGB  0.894008  0.702725  0.228851
7    CB  0.894008  0.705709  0.224801


### So the best model is SVR for this dataset, based on low RMSE and MAE values