In [1]:
import os

In [2]:
%pwd

'c:\\Users\\harik\\OneDrive\\Desktop\\HARIKRISHNAN_DETAILS\\Real_Estate_Predictor_Web_App\\Real_Estate_Price_Predictor_Web_App\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\harik\\OneDrive\\Desktop\\HARIKRISHNAN_DETAILS\\Real_Estate_Predictor_Web_App\\Real_Estate_Price_Predictor_Web_App'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    X_train_scaled_data_file: Path
    X_test_scaled_data_file: Path
    Y_train_data_file: Path
    Y_test_data_file: Path
    performance_metrics_file_path: Path
    params_list_of_models:list
    params_polynomial_type:str
    params_polynomial_model:str
    params_polynomial_degree:int
    params_kernel:str
    params_C:int
    params_n_estimators:int
    params_max_depth:int
    params_number_of_iteration: int
    params_cv: int
    params_verbose: int
    params_random_state_for_randomised_cv: int
    params_n_jobs: int
    params_list_of_models_for_hyper_parameter_tuning: list
    params_ridge_regression_solver: list
    params_tol: list
    params_SVR_gamma: list
    params_max_features: list
    params_hyper_n_estimators: list
    params_gradient_boost_learning_rate: list
    params_subsample: list
    params_xgboost_learning_rate: list


In [6]:
from real_estate_price_predictor.constants import *
from real_estate_price_predictor.utils.common import read_yaml, create_directories,save_object

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_model_training(self) -> ModelTrainingConfig:
        config = self.config.model_training

        create_directories([config.root_dir])

        model_training_config= ModelTrainingConfig(
            root_dir = config.root_dir,
            X_train_scaled_data_file = config.X_train_scaled_data_file,
            X_test_scaled_data_file = config.X_test_scaled_data_file,
            Y_train_data_file = config.Y_train_data_file,
            Y_test_data_file = config.Y_test_data_file,
            performance_metrics_file_path = config.performance_metrics_file_path,
            params_list_of_models = self.params.list_of_models,
            params_polynomial_type = self.params.polynomial_type,
            params_polynomial_model = self.params.polynomial_model,
            params_polynomial_degree = self.params.polynomial_degree,
            params_kernel = self.params.kernel,
            params_C = self.params.C,
            params_n_estimators = self.params.n_estimators,
            params_max_depth = self.params.max_depth,
            params_number_of_iteration = self.params.number_of_iteration,
            params_cv = self.params.cv,
            params_verbose = self.params.verbose,
            params_random_state_for_randomised_cv = self.params.random_state_for_randomised_cv,
            params_n_jobs = self.params.n_jobs,
            params_list_of_models_for_hyper_parameter_tuning = self.params.list_of_models_for_hyper_parameter_tuning,
            params_ridge_regression_solver = self.params.ridge_regression_solver,
            params_tol = self.params.tol,
            params_SVR_gamma = self.params.SVR_gamma,
            params_max_features = self.params.max_features,
            params_hyper_n_estimators = self.params.hyper_n_estimators,
            params_gradient_boost_learning_rate = self.params.gradient_boost_learning_rate,
            params_subsample = self.params.subsample,
            params_xgboost_learning_rate = self.params.xgboost_learning_rate

        )

        return model_training_config

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

In [9]:
class ModelTraining:
    def __init__(self,config=ModelTrainingConfig):
        self.config = config
        self.best_params ={}
    
    def model_training(self):
        X_test_data = pd.read_csv(self.config.X_test_scaled_data_file)
        X_train_data  = pd.read_csv(self.config.X_train_scaled_data_file)
        y_train = pd.read_csv(self.config.Y_train_data_file)
        y_test = pd.read_csv(self.config.Y_test_data_file)
        list_of_models = self.config.params_list_of_models
        r2_score_of_models=[]
        adjusted_r2_score =[]
        mse=[]
        for i  in list_of_models:
            if i == 'Linear Regression':
                model = LinearRegression()
            elif i == 'Ridge Regression':
                model = Ridge()
            elif i == 'Polynomial Regression':
                model = Pipeline([(self.config.params_polynomial_type, PolynomialFeatures(degree=self.config.params_polynomial_degree)),(self.config.params_polynomial_model, LinearRegression())])
            elif i == 'SVR':
                model = SVR(kernel=self.config.params_kernel, C=self.config.params_C)
            elif i == 'Random Forrest Regressor':
                model = RandomForestRegressor(n_estimators=self.config.params_n_estimators)
            elif i == 'AdaBoost Regressor':
                model = AdaBoostRegressor()
            elif i == 'Gradient Boosting Regressor':
                model = GradientBoostingRegressor()
            elif i == 'XGBRegressor':
                model = XGBRegressor()
            else:
                model = DecisionTreeRegressor(max_depth=self.config.params_max_depth)
        # Train the model on the training data

            model.fit(X_train_data, y_train)

        # Make predictions on the testing data

            y_pred = model.predict(X_test_data)

        # Evaluate the model performance (e.g., R-squared, Mean Squared Error)
            
            r2 = r2_score(y_test, y_pred)
            r2_score_of_models.append(r2)

        # Calculate the adjusted R²

            n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
            p = X_test_data.shape[1]  # Number of features in the model
            adjusted_r2_score.append(1 - (1 - r2) * (n - 1) / (n - p - 1))
            mse.append(mean_squared_error(y_test, y_pred))
        data = {'Models': list_of_models, 'Adjusted_R2_Score': adjusted_r2_score, 'R2_Score': r2_score_of_models , 'Mean_Squared_Error': mse}
        performance_metrics = pd.DataFrame.from_dict(data)
        performance_metrics.set_index('Models', inplace = False)
        return performance_metrics
    
    def hyperparameter_tuning(self,performance_metrics:pd.DataFrame):
        X_test_data = pd.read_csv(self.config.X_test_scaled_data_file)
        X_train_data  = pd.read_csv(self.config.X_train_scaled_data_file)
        y_train = pd.read_csv(self.config.Y_train_data_file)
        y_test = pd.read_csv(self.config.Y_test_data_file)
        list_of_models_for_hyper_parameter_tuning = self.config.params_list_of_models_for_hyper_parameter_tuning
        r2_score_of_models_hyper=[]
        adjusted_r2_score_hyper =[]
        mse_hyper = []
        for i in list_of_models_for_hyper_parameter_tuning:
            if i == 'Hyper Parameter Ridge Regression':
                random_grid = {'alpha': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)],
                            'solver': self.config.params_ridge_regression_solver,
                            'tol': self.config.params_tol}
                model = Ridge()
                rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=self.config.params_number_of_iteration,cv=self.config.params_cv,verbose=self.config.params_verbose,
                                            random_state=self.config.params_random_state_for_randomised_cv,n_jobs=self.config.params_n_jobs)
            elif i == 'Hyper Parameter Support Vector Regression':
                random_grid = {'kernel': [self.config.params_kernel],
                            'C': [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)] + [int(x) for x in np.arange(1, 11)],
                            'epsilon': [float(x) for x in np.linspace(start = 0.01, stop = 0.1, num = 10)] + [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)],
                            'gamma': self.config.params_SVR_gamma
                            }
                model = SVR()
                rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=self.config.params_number_of_iteration,cv=self.config.params_cv,verbose=self.config.params_verbose,
                                            random_state=self.config.params_random_state_for_randomised_cv,n_jobs=self.config.params_n_jobs)
            elif i == 'Hyper Parameter Randomn Forrest Regression':
                random_grid = {'max_features':self.config.params_max_features,
                    'n_estimators': self.config.params_hyper_n_estimators
                    }
                model = RandomForestRegressor()
                rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=self.config.params_number_of_iteration,cv=self.config.params_cv,verbose=self.config.params_verbose,
                                            random_state=self.config.params_random_state_for_randomised_cv,n_jobs=self.config.params_n_jobs)
            elif i == 'Hyper Parameter Gradient Boost Regression':
                random_grid = {
                            'learning_rate':self.config.params_gradient_boost_learning_rate,
                            'subsample':self.config.params_subsample,
                            'n_estimators': self.config.params_hyper_n_estimators
                    }
                model = GradientBoostingRegressor()
                rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=self.config.params_number_of_iteration,cv=self.config.params_cv,verbose=self.config.params_verbose,
                                            random_state=self.config.params_random_state_for_randomised_cv,n_jobs=self.config.params_n_jobs)
            elif i == 'Hyper Parameter XGBoost Regression':
                random_grid = {
                            'learning_rate':self.config.params_xgboost_learning_rate,
                            'n_estimators': self.config.params_hyper_n_estimators
                    }
                model = XGBRegressor()
                rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=self.config.params_number_of_iteration,cv=self.config.params_cv,verbose=self.config.params_verbose,
                                            random_state=self.config.params_random_state_for_randomised_cv,n_jobs=self.config.params_n_jobs)
            rf_randomcv.fit(X_train_data,y_train)
            best_random_grid=rf_randomcv.best_estimator_
            y_pred=best_random_grid.predict(X_test_data)
            self.best_params[i] = rf_randomcv.best_estimator_
            # Evaluate the model performance (e.g., R-squared, Mean Squared Error)

            r2_hyper = r2_score(y_test, y_pred)
            r2_score_of_models_hyper.append(r2_hyper)

            # Calculate the adjusted R²

            n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
            p = X_test_data.shape[1]  # Number of features in the model
            adjusted_r2_hyper = 1 - (1 - r2_hyper) * (n - 1) / (n - p - 1)
            adjusted_r2_score_hyper.append(adjusted_r2_hyper)
            mse = mean_squared_error(y_test, y_pred)
            mse_hyper.append(mse)
        data = {'Models': list_of_models_for_hyper_parameter_tuning, 'Adjusted_R2_Score': adjusted_r2_score_hyper, 'R2_Score': r2_score_of_models_hyper , 'Mean_Squared_Error': mse_hyper}
        performance_metrics_hyper = pd.DataFrame.from_dict(data)
        performance_metrics_hyper.set_index('Models', inplace = False)
        combined_results = pd.concat([performance_metrics,performance_metrics_hyper],axis=0)
        combined_results = combined_results.sort_values(['Adjusted_R2_Score'],ascending=False)
        return combined_results
    
    def save_the_best_model(self,combined_performance_metrics:pd.DataFrame):
        best_model_name = combined_performance_metrics.sort_values(['Adjusted_R2_Score'],ascending=False).head(1)['Models'].values[0]
        if best_model_name == 'Hyper Parameter Ridge Regression':
            model = self.best_params[best_model_name]
        elif best_model_name == 'Hyper Parameter Support Vector Regression':
            model = self.best_params[best_model_name]
        elif best_model_name == 'Hyper Parameter Randomn Forrest Regression':
            model = self.best_params[best_model_name]
        elif best_model_name == 'Hyper Parameter Gradient Boost Regression':
            model = self.best_params[best_model_name]
        elif best_model_name == 'Hyper Parameter XGBoost Regression':
            model = self.best_params[best_model_name]
        elif best_model_name == 'linear Regression':
            model = LinearRegression()
        elif best_model_name == 'Ridge Regression':
            model = Ridge()
        elif best_model_name == 'Polynomial Regression':
            model = Pipeline([(self.config.params_polynomial_type, PolynomialFeatures(degree=self.config.params_polynomial_degree)),(self.config.params_polynomial_model, LinearRegression())])
        elif best_model_name == 'SVR':
            model = SVR(kernel=self.config.params_kernel, C=self.config.params_C)
        elif best_model_name == 'Random Forrest Regressor':
            model = RandomForestRegressor(n_estimators=self.config.params_n_estimators)
        elif best_model_name == 'AdaBoost Regressor':
            model = AdaBoostRegressor()
        elif best_model_name == 'Gradient Boosting Regressor':
            model = GradientBoostingRegressor()
        elif best_model_name == 'XGBRegressor':
            model = XGBRegressor()
        else:
            model = DecisionTreeRegressor(max_depth=self.config.params_max_depth)

        combined_performance_metrics.to_csv(self.config.performance_metrics_file_path)
        best_model_name = best_model_name.replace(" ","") + '.pkl'
        file_path = os.path.join(self.config.root_dir,best_model_name)
        save_object(file_path,model)

In [10]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training()
    model_training = ModelTraining(config=model_training_config)
    results = model_training.model_training()
    combined_performance_metrics = model_training.hyperparameter_tuning(results)
    model_training.save_the_best_model(combined_performance_metrics)
except Exception as e:
    raise e

[2024-10-03 20:19:13,887: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-03 20:19:13,901: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-03 20:19:13,905: INFO: common: created directory at: artifacts]
[2024-10-03 20:19:13,907: INFO: common: created directory at: artifacts/model_trainer]


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
13 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [12]:
combined_performance_metrics

Unnamed: 0,Models,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
2,Hyper Parameter Randomn Forrest Regression,0.879479,0.898596,0.013885
1,Hyper Parameter Support Vector Regression,0.874726,0.894597,0.014433
4,Random Forrest Regressor,0.873196,0.89331,0.014609
0,Linear Regression,0.871083,0.891532,0.014853
1,Ridge Regression,0.868095,0.889018,0.015197
0,Hyper Parameter Ridge Regression,0.868095,0.889018,0.015197
7,Gradient Boosting Regressor,0.864571,0.886053,0.015603
3,SVR,0.858088,0.880598,0.01635
3,Hyper Parameter Gradient Boost Regression,0.854211,0.877336,0.016797
4,Hyper Parameter XGBoost Regression,0.846933,0.871213,0.017635


In [42]:
combined_performance_metrics

{'Hyper Parameter Ridge Regression': Ridge(alpha=1, solver='lsqr', tol=1e-06),
 'Hyper Parameter Support Vector Regression': SVR(C=0.6, epsilon=0.01),
 'Hyper Parameter Randomn Forrest Regression': RandomForestRegressor(max_features='sqrt', n_estimators=256),
 'Hyper Parameter Gradient Boost Regression': GradientBoostingRegressor(learning_rate=0.05, n_estimators=256, subsample=0.6),
 'Hyper Parameter XGBoost Regression': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_w

In [43]:
x = combined_performance_metrics[1]['Hyper Parameter Gradient Boost Regression']
y = GradientBoostingRegressor(learning_rate=0.05, n_estimators=256, subsample=0.6)
print(type(x))
print(type(y))

<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>


In [54]:
best_model_name = combined_performance_metrics[0].sort_values(['Adjusted_R2_Score'],ascending=False).head(1)['Models'].values[0]
best_model_name
if best_model_name == 'Hyper Parameter Ridge Regression':
    model = combined_performance_metrics[1][best_model_name]
elif best_model_name == 'Hyper Parameter Support Vector Regression':
    model = combined_performance_metrics[1][best_model_name]
elif best_model_name == 'Hyper Parameter Randomn Forrest Regression':
    model = combined_performance_metrics[1][best_model_name]
elif best_model_name == 'Hyper Parameter Gradient Boost Regression':
    model = combined_performance_metrics[1][best_model_name]
elif best_model_name == 'Hyper Parameter XGBoost Regression':
    model = combined_performance_metrics[1][best_model_name]
elif best_model_name == 'linear Regression':
    model = LinearRegression()
elif best_model_name == 'Ridge Regression':
    model = Ridge()
elif best_model_name == 'Polynomial Regression':
    model = Pipeline([(self.config.params_polynomial_type, PolynomialFeatures(degree=self.config.params_polynomial_degree)),(self.config.params_polynomial_model, LinearRegression())])
elif best_model_name == 'SVR':
    model = SVR(kernel=self.config.params_kernel, C=self.config.params_C)
elif best_model_name == 'Random Forrest Regressor':
    model = RandomForestRegressor(n_estimators=self.config.params_n_estimators)
elif best_model_name == 'AdaBoost Regressor':
    model = AdaBoostRegressor()
elif best_model_name == 'Gradient Boosting Regressor':
    model = GradientBoostingRegressor()
elif best_model_name == 'XGBRegressor':
    model = XGBRegressor()
else:
    model = DecisionTreeRegressor(max_depth=self.config.params_max_depth)

combined_performance_metrics[0].to_csv(self.config.performance_metrics_file_path)
best_model_name = best_model_name.replace(" ","") + '.pkl'
file_path = os.path.join(self.config.root_dir,best_model_name)
save_object(file_path,model)




'Hyper Parameter Randomn Forrest Regression'

In [15]:
results

Unnamed: 0,Models,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
0,Linear Regression,0.871083,0.891532,0.014853
1,Ridge Regression,0.868095,0.889018,0.015197
2,Polynomial Regression,0.773495,0.809423,0.026096
3,SVR,0.858088,0.880598,0.01635
4,Random Forrest Regressor,0.874312,0.894249,0.014481
5,Decision Tree Regressor,0.702408,0.749612,0.034286
6,AdaBoost Regressor,0.753011,0.792189,0.028456
7,Gradient Boosting Regressor,0.860381,0.882528,0.016086
8,XGBRegressor,0.839446,0.864913,0.018498


In [16]:
X_test_data = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data_scaled\X_test_scaled.csv')
X_train_data  = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data_scaled\X_train_scaled.csv')
y_train = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_train.csv')
y_test = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_test.csv')


In [36]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
list_of_models_for_hyper_parameter_tuning = ['Hyper Parameter Ridge Regression', 'Hyper Parameter Support Vector Regression', 'Hyper Parameter Randomn Forrest Regression', 'Hyper Parameter Gradient Boost Regression','Hyper Parameter XGBoost Regression']
best_params ={}
r2_score_of_models_hyper=[]
adjusted_r2_score_hyper =[]
mse_hyper = []
for i in list_of_models_for_hyper_parameter_tuning:
    if i == 'Ridge Regression':
        random_grid = {'alpha': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
                    'tol': [1e-4, 1e-5, 1e-6]}
        model = Ridge()
        rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                                    random_state=100,n_jobs=-1)
    elif i == 'Hyper Parameter Support Vector Regression':
        random_grid = {'kernel': ['rbf'],
                    'C': [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)] + [int(x) for x in np.arange(1, 11)],
                    'epsilon': [float(x) for x in np.linspace(start = 0.01, stop = 0.1, num = 10)] + [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)],
                    'gamma': ['scale','auto']
                    }
        model = SVR()
        rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                                    random_state=100,n_jobs=-1)
    elif i == 'Hyper Parameter Randomn Forrest Regression':
        random_grid = {'max_features':['sqrt','log2',None],
               'n_estimators': [8,16,32,64,128,256]
              }
        model = RandomForestRegressor()
        rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                                    random_state=100,n_jobs=-1)
    elif i == 'Hyper Parameter Gradient Boost Regression':
        random_grid = {
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    'n_estimators': [8,16,32,64,128,256]
              }
        model = GradientBoostingRegressor()
        rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                                    random_state=100,n_jobs=-1)
    elif i == 'Hyper Parameter XGBoost Regression':
        random_grid = {
                    'learning_rate':[.1,.01,.05,.001],
                    'n_estimators': [8,16,32,64,128,256]
              }
        model = XGBRegressor()
        rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                                    random_state=100,n_jobs=-1)
    rf_randomcv.fit(X_train_data,y_train)
    best_random_grid=rf_randomcv.best_estimator_
    y_pred=best_random_grid.predict(X_test_data)
    best_params[i] = rf_randomcv.best_estimator_
    # Evaluate the model performance (e.g., R-squared, Mean Squared Error)

    r2_hyper = r2_score(y_test, y_pred)
    r2_score_of_models_hyper.append(r2_hyper)

    # Calculate the adjusted R²

    n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
    p = X_test_data.shape[1]  # Number of features in the model
    adjusted_r2_hyper = 1 - (1 - r2_hyper) * (n - 1) / (n - p - 1)
    adjusted_r2_score_hyper.append(adjusted_r2_hyper)
    mse = mean_squared_error(y_test, y_pred)
    mse_hyper.append(mse)
data = {'Models': list_of_models_for_hyper_parameter_tuning, 'Adjusted_R2_Score': adjusted_r2_score_hyper, 'R2_Score': r2_score_of_models_hyper , 'Mean_Squared_Error': mse_hyper}
performance_metrics_hyper = pd.DataFrame.from_dict(data)
performance_metrics_hyper.set_index('Models', inplace = False)
combined_results = pd.concat([results,performance_metrics_hyper],axis=0)
combined_results = combined_results.sort_values(['Adjusted_R2_Score'],ascending=False)
combined_results



Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


  return fit_method(estimator, *args, **kwargs)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Fitting 3 folds for each of 24 candidates, totalling 72 fits


Unnamed: 0,Models,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
0,Hyper Parameter Ridge Regression,0.846933,0.871213,0.017635
1,Hyper Parameter Support Vector Regression,0.874726,0.894597,0.014433
2,Hyper Parameter Randomn Forrest Regression,0.880867,0.899764,0.013726
3,Hyper Parameter Gradient Boost Regression,0.861119,0.883148,0.016001
4,Hyper Parameter XGBoost Regression,0.846933,0.871213,0.017635


In [40]:
combined_results = pd.concat([results,performance_metrics_hyper],axis=0)
combined_results = combined_results.sort_values(['Adjusted_R2_Score'],ascending=False)
combined_results

Unnamed: 0,Models,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
2,Hyper Parameter Randomn Forrest Regression,0.880867,0.899764,0.013726
1,Hyper Parameter Support Vector Regression,0.874726,0.894597,0.014433
4,Random Forrest Regressor,0.874312,0.894249,0.014481
0,Linear Regression,0.871083,0.891532,0.014853
1,Ridge Regression,0.868095,0.889018,0.015197
3,Hyper Parameter Gradient Boost Regression,0.861119,0.883148,0.016001
7,Gradient Boosting Regressor,0.860381,0.882528,0.016086
3,SVR,0.858088,0.880598,0.01635
0,Hyper Parameter Ridge Regression,0.846933,0.871213,0.017635
4,Hyper Parameter XGBoost Regression,0.846933,0.871213,0.017635


In [32]:
best_params

{'Hyper Parameter Ridge Regression': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=128, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 'Hyper Parameter Support Vector Regression': SVR(C=0.6, epsilon=0.01),
 'Hyper Parameter Randomn Forrest Regression': RandomForestRegressor(max_features='sqrt', n_estimators=64),
 'Hyper Parameter Gradient 

In [17]:
# Choosing the best parameter for Ridge Regression
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Alpha value in the Ridge Regrssion
alpha = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
# Solver for the ridge regression
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
# Tolereance the stopping criteria for the Ridge Regression
tol = [1e-4, 1e-5, 1e-6]
# Create the random grid
random_grid = {'alpha': alpha,
               'solver': solver,
               'tol': tol}
model = Ridge()
rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_data,y_train)
best_random_grid=rf_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test_data)
# Evaluate the model performance (e.g., R-squared, Mean Squared Error)

r2_linear = r2_score(y_test, y_pred)

# Calculate the adjusted R²

n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
p = X_test_data.shape[1]  # Number of features in the model
adjusted_r2_linear = 1 - (1 - r2_linear) * (n - 1) / (n - p - 1)
mse_linear = mean_squared_error(y_test, y_pred)
print("The R2 score is ",r2_linear)
print("The Mean squared error is ",mse_linear)
print("The adjusted R2 score is ",adjusted_r2_linear)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
The R2 score is  0.8890182488758518
The Mean squared error is  0.015196984496418655
The adjusted R2 score is  0.868095459729496


33 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harik\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1251, in fit
    return super().fit(X, y, sam

In [20]:
rf_randomcv.best_params_

{'tol': 1e-06, 'solver': 'lsqr', 'alpha': 1}

In [21]:
# Choosing the best parameter for Support Vector Regression
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Kernel for the Support Vector regression
kernel = ['rbf']
# C Value for the Support Vector regression
C = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)] + [int(x) for x in np.arange(1, 11)]
# Epsilon value for the Support Vector regression
epsilon = [float(x) for x in np.linspace(start = 0.01, stop = 0.1, num = 10)] + [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)]
# Gamma value for the Support Vector regression
gamma = ['scale','auto']

# Create the random grid
random_grid = {'kernel': kernel,
               'C': C,
               'epsilon': epsilon,
               'gamma': gamma
              }
model = SVR()
rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_data,y_train)
best_random_grid=rf_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test_data)
# Evaluate the model performance (e.g., R-squared, Mean Squared Error)

r2_linear = r2_score(y_test, y_pred)

# Calculate the adjusted R²

n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
p = X_test_data.shape[1]  # Number of features in the model
adjusted_r2_linear = 1 - (1 - r2_linear) * (n - 1) / (n - p - 1)
mse_linear = mean_squared_error(y_test, y_pred)
print("The R2 score is ",r2_linear)
print("The Mean squared error is ",mse_linear)
print("The adjusted R2 score is ",adjusted_r2_linear)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
The R2 score is  0.8945971419238966
The Mean squared error is  0.014433053937569613
The adjusted R2 score is  0.8747261113029918


  y = column_or_1d(y, warn=True)


In [22]:
rf_randomcv.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.01, 'C': 0.6}

In [23]:
random_grid = {'max_features':['sqrt','log2',None],
               'n_estimators': [8,16,32,64,128,256]
              }
model = RandomForestRegressor()
rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_data,y_train)
best_random_grid=rf_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test_data)
# Evaluate the model performance (e.g., R-squared, Mean Squared Error)

r2_linear = r2_score(y_test, y_pred)

# Calculate the adjusted R²

n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
p = X_test_data.shape[1]  # Number of features in the model
adjusted_r2_linear = 1 - (1 - r2_linear) * (n - 1) / (n - p - 1)
mse_linear = mean_squared_error(y_test, y_pred)
print("The R2 score is ",r2_linear)
print("The Mean squared error is ",mse_linear)
print("The adjusted R2 score is ",adjusted_r2_linear)



Fitting 3 folds for each of 18 candidates, totalling 54 fits


  return fit_method(estimator, *args, **kwargs)


The R2 score is  0.9001948426994395
The Mean squared error is  0.013666547993665778
The adjusted R2 score is  0.8813791163231043


In [24]:
rf_randomcv.best_params_

{'n_estimators': 256, 'max_features': 'sqrt'}

In [25]:
random_grid = {
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    'n_estimators': [8,16,32,64,128,256]
              }
model = GradientBoostingRegressor()
rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_data,y_train)
best_random_grid=rf_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test_data)
# Evaluate the model performance (e.g., R-squared, Mean Squared Error)

r2_linear = r2_score(y_test, y_pred)

# Calculate the adjusted R²

n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
p = X_test_data.shape[1]  # Number of features in the model
adjusted_r2_linear = 1 - (1 - r2_linear) * (n - 1) / (n - p - 1)
mse_linear = mean_squared_error(y_test, y_pred)
print("The R2 score is ",r2_linear)
print("The Mean squared error is ",mse_linear)
print("The adjusted R2 score is ",adjusted_r2_linear)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


The R2 score is  0.8773038118882264
The Mean squared error is  0.016801069091246075
The adjusted R2 score is  0.8541725633097772


In [26]:
rf_randomcv.best_params_

{'subsample': 0.6, 'n_estimators': 256, 'learning_rate': 0.05}

In [27]:
random_grid = {
                    'learning_rate':[.1,.01,.05,.001],
                    'n_estimators': [8,16,32,64,128,256]
              }
model = XGBRegressor()
rf_randomcv=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=50,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_data,y_train)
best_random_grid=rf_randomcv.best_estimator_
y_pred=best_random_grid.predict(X_test_data)
# Evaluate the model performance (e.g., R-squared, Mean Squared Error)

r2_linear = r2_score(y_test, y_pred)

# Calculate the adjusted R²

n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
p = X_test_data.shape[1]  # Number of features in the model
adjusted_r2_linear = 1 - (1 - r2_linear) * (n - 1) / (n - p - 1)
mse_linear = mean_squared_error(y_test, y_pred)
print("The R2 score is ",r2_linear)
print("The Mean squared error is ",mse_linear)
print("The adjusted R2 score is ",adjusted_r2_linear)



Fitting 3 folds for each of 24 candidates, totalling 72 fits
The R2 score is  0.8712127208709717
The Mean squared error is  0.017635147817695408
The adjusted R2 score is  0.8469331518548434


In [28]:
rf_randomcv.best_params_

{'n_estimators': 128, 'learning_rate': 0.1}