In [26]:
import pandas as pd
import numpy as np
import panel as pn
import mlflow
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_percentage_error   



In [30]:
class Regression:

    def __init__(self, data:pd.DataFrame, target_column:str, test_size:float=0.2, datetime_column:str=None):

        if datetime_column== None:
            data = data.reset_index(drop=True).reset_index()
            datetime_column= 'index'
        self.datetime_column = datetime_column
        self.target_column = target_column

        # Dropping nan values
        data= data.dropna()

        # Raising exception if data is not present
        if data.shape[0] == 0:
            raise NoDataPresentException
        
        # sorting values accoriding to datetime
        data = data.sort_values(by = self.datetime_column, ascending = True)
        self.data_test = data.tail(int(test_size*len(data)))

        data = data.drop(self.data_test.index)
        self.y = data[target_column]
        self.X = data.drop(target_column, axis = 1)
        self.X_test = self.data_test.drop(target_column, axis = 1)
        self.y_test = self.data_test[target_column]

    
    @staticmethod
    def evaluate(test:pd.Series, preds: pd.Series, process:str)->tuple:
        score=r2_score(test,preds)
        mse = mean_squared_error(test,preds)
        rmse = np.sqrt(mean_squared_error(test,preds))
        mape = mean_absolute_percentage_error(test,preds)
        mae= mean_absolute_error(test, preds)

        print('For {}'.format(process))
        print('r2_score: {}\nmse:{}\nrmse:{}\nmape:{}\nmae:{}'.format(score, mse, rmse, mape, mae))

        return (score, mse, rmse, mape, mae)


    def LinearRegression(self, train_size:int=20, random_state:int=0, **kwargs)->dict:
        session = mlflow.start_run()
        
        with session : 
            
            data_for_graphs= {}

            self.train_size = train_size       
            self.random_state = random_state
            
            self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(self.X, self.y, train_size = train_size, random_state = random_state)
            
            self.x_valid1 = self.x_valid.drop(self.datetime_column, axis = 1)
            self.x_train1 = self.x_train.drop(self.datetime_column, axis = 1) 
            
            ## Building a model
            mlflow.log_param("Model Type", 'LinearRegressor' )
            self.model =  LinearRegression(**kwargs)
            self.model.fit(self.x_train1,self.y_train)
            
            ## On Training Data:
            y_pred_train = self.model.predict(self.x_train1)
            (self.train_score, self.train_mse, self.train_rmse, self.train_mape, self.train_mae) = Regression.evaluate(self.y_train, y_pred_train, 'Train')
            data_for_graph['train'] = pd.DataFrame({'Actual': self.y_train, 'Predictions': y_pred_train, 'Date_time': self.x_train[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

            ## On Validation Data:
            y_pred_valid = self.model.predict(self.x_valid1)
            (self.valid_score, self.valid_mse, self.valid_rmse, self.valid_mape, self.valid_mae) = Regression.evaluate(self.y_valid, y_pred_valid, 'Validation')
            data_for_graph['validation'] = pd.DataFrame({'Actual': self.y_valid, 'Predictions': y_pred_valid, 'Date_time': self.x_valid[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

            ## On Test data: 
            y_pred_test = self.model.predict(self.X_test.drop(self.datetime_column, axis = 1))
            (self.test_score, self.test_mse, self.test_rmse, self.test_mape, self.test_mae) = Regression.evaluate(self.y_test, y_pred_test, 'OOT')
            self.result_OOT = pd.DataFrame({'Actual': self.y_test, 'Predictions': y_pred_test, 'Date_time': self.X_test[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

            

            # Log parameters and metrics to MLflow
            mlflow.log_param("train_size", train_size)
            mlflow.log_param("random_state", random_state)
            mlflow.log_param("model_type", "Linear Regressor")
            mlflow.log_params(kwargs)

            mlflow.log_metric("r2_score_train", self.train_score)
            mlflow.log_metric("mse_train", self.train_mse)
            mlflow.log_metric("rmse_train", self.train_rmse)
            mlflow.log_metric("mape_train", self.train_mape)

            mlflow.log_metric("r2_score_test", self.test_score)
            mlflow.log_metric("mse_test", self.test_mse)
            mlflow.log_metric("rmse_test", self.test_rmse)
            mlflow.log_metric("mape_test", self.test_mape)

            mlflow.log_metric("r2_score_OOT", self.OOT_score)
            mlflow.log_metric("mse_OOT", self.OOT_mse)
            mlflow.log_metric("rmse_OOT", self.OOT_rmse)
            mlflow.log_metric("mape_OOT", self.OOT_mape)

            mlflow.sklearn.log_model(self.model, "model")
            
            return self 

In [7]:
df= pd.read_csv('test_data.csv')

In [8]:
df

Unnamed: 0.1,Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,...,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,WELL,GROUP,FORMATION,FORCE_2020_LITHOFACIES_LITHOLOGY,FORCE_2020_LITHOFACIES_CONFIDENCE
0,0,1518.2800,433906.7500,6460000.5,-1493.241821,15.506232,2.237042,0.950333,0.878615,2.072248,...,0.109706,0.275208,0.853690,88.968864,0.822429,15/9-23,HORDALAND GP.,Skade Fm.,65000,3.0
1,1,1518.4320,433906.7500,6460000.5,-1493.393799,18.524611,2.198390,0.946200,0.874237,2.049179,...,-0.006418,0.543080,0.831179,92.287186,0.826374,15/9-23,HORDALAND GP.,Skade Fm.,65000,3.0
2,2,1518.5840,433906.7500,6460000.5,-1493.545776,18.855669,2.114124,0.929856,0.869858,2.038348,...,0.022769,0.610418,0.835320,95.605499,0.819632,15/9-23,HORDALAND GP.,Skade Fm.,65000,3.0
3,3,1518.7360,433906.7500,6460000.5,-1493.697754,19.163353,1.946680,0.927579,0.865479,2.020606,...,0.024972,0.538517,0.868298,98.923820,0.841034,15/9-23,HORDALAND GP.,Skade Fm.,65000,3.0
4,4,1518.8880,433906.7500,6460000.5,-1493.849609,18.489744,1.193619,0.849849,0.863804,2.130803,...,0.024527,0.359408,0.851085,102.242142,0.814464,15/9-23,HORDALAND GP.,Skade Fm.,65000,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122392,122392,2973.2988,536096.0625,6793022.0,-2943.444580,8.276272,2.644125,2.820439,3.158570,2.464931,...,0.502458,5.594451,2.311106,24.306124,2.972307,35/9-7,BAAT GP.,Etive Fm.,65000,2.0
122393,122393,2973.4508,536096.0625,6793022.0,-2943.595947,8.267273,2.201899,3.020778,3.332977,2.470371,...,0.374753,4.898014,1.853418,22.201078,2.516858,35/9-7,BAAT GP.,Etive Fm.,65000,2.0
122394,122394,2973.6028,536096.0625,6793022.0,-2943.747559,8.250099,1.715108,2.795711,3.044179,2.472233,...,0.211487,3.569129,1.325961,20.096741,1.723662,35/9-7,BAAT GP.,Etive Fm.,65000,2.0
122395,122395,2973.7548,536096.0625,6793022.0,-2943.899170,8.695217,1.575916,2.658694,2.847681,2.518067,...,0.147950,0.342615,1.260347,17.992323,0.330439,35/9-7,BAAT GP.,Etive Fm.,65000,2.0


In [9]:
cols= df.dtypes[(df.dtypes == 'category') | (df.dtypes == 'str') | (df.dtypes == 'object')].index
df= df.drop(cols, axis= 1)

In [31]:
Regression(df, 'FORCE_2020_LITHOFACIES_LITHOLOGY').LinearRegression()

NameError: name 'r2_score' is not defined

In [None]:
    class fit_model():

        def __init__(self, data, column, OOT_shape = 20, datetime_column = None, cleaned=False):
            if cleaned:
                data = data[data['Switch']==1]
                if data.shape[0]==0:
                    print('No non-outlying data present.')
                    return None
                data = data.drop(['Comment', 'Switch'], axis = 1)
            if datetime_column== None:
                data = data.reset_index(drop=True).reset_index()
                datetime_column= 'index'
            self.datetime_column = datetime_column
            self.target_column = column
            data= data.dropna()
            if data.shape[0] == 0:
                raise NoDataPresentException
            data = data.sort_values(by = self.datetime_column, ascending = True)
            self.data_OOT = data.tail(OOT_shape)
            data = data.drop(self.data_OOT.index)
            self.y = data[column]
            self.X = data.drop(column, axis = 1)
            self.X_OOT = self.data_OOT.drop(column, axis = 1)
            self.y_OOT = self.data_OOT[column]

        def evaluate(self, test, preds, process='Train'):
            score=r2_score(test,preds)
            mse = mean_squared_error(test,preds)
            rmse = np.sqrt(mean_squared_error(test,preds))
            mape = mean_absolute_percentage_error(test,preds)

            print('For {}'.format(process))
            print('r2_score: {}\nmse:{}\nrmse:{}\nmape:{}\n'.format(score, mse, rmse, mape))

            return score, mse, rmse, mape
            
            
        def linear_regression(self, train_size = 0.8, random_state = 0, plot=True, **kwargs):
            session = mlflow.start_run()
            
            with session : 
                
                self.train_size = train_size       
                self.random_state = random_state
                
                
                self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, train_size = train_size, random_state = random_state)
                
                self.x_test1 = self.x_test.drop(self.datetime_column, axis = 1)
                self.x_train1 = self.x_train.drop(self.datetime_column, axis = 1) 
                
                ## Building a model
                self.model_type = 'LinearRegressor'       
                self.model =  LinearRegression(**kwargs)
                self.model.fit(self.x_train1,self.y_train)
                
                #Plotting feature importance
                feat_importances = pd.Series(self.model.coef_, index=self.x_train1.columns)
                feat_importances.nlargest(20).plot(kind='barh', figsize = (30, 30))
                plt.show()
                
                ## On Training Data:
                y_pred_train = self.model.predict(self.x_train1)
                self.train_score, self.train_mse, self.train_rmse, self.train_mape = self.evaluate(self.y_train, y_pred_train, 'Train')
                self.result_train = pd.DataFrame({'Actual': self.y_train, 'Predictions': y_pred_train, 'Date_time': self.x_train[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                ## On Testing Data:
                y_pred_test = self.model.predict(self.x_test1)
                self.test_score, self.test_mse, self.test_rmse, self.test_mape = self.evaluate(self.y_test, y_pred_test, 'Test')
                self.result_test = pd.DataFrame({'Actual': self.y_test, 'Predictions': y_pred_test, 'Date_time': self.x_test[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

                ## On OOT data: 
                y_pred_OOT = self.model.predict(self.X_OOT.drop(self.datetime_column, axis = 1))
                self.OOT_score, self.OOT_mse, self.OOT_rmse, self.OOT_mape = self.evaluate(self.y_OOT, y_pred_OOT, 'OOT')
                self.result_OOT = pd.DataFrame({'Actual': self.y_OOT, 'Predictions': y_pred_OOT, 'Date_time': self.X_OOT[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                if plot:
                    self.result_train.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_test.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_OOT.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    plt.show()

                # Log parameters and metrics to MLflow
                mlflow.log_param("train_size", train_size)
                mlflow.log_param("random_state", random_state)
                mlflow.log_param("model_type", "Linear Regressor")
                mlflow.log_params(kwargs)

                mlflow.log_metric("r2_score_train", self.train_score)
                mlflow.log_metric("mse_train", self.train_mse)
                mlflow.log_metric("rmse_train", self.train_rmse)
                mlflow.log_metric("mape_train", self.train_mape)

                mlflow.log_metric("r2_score_test", self.test_score)
                mlflow.log_metric("mse_test", self.test_mse)
                mlflow.log_metric("rmse_test", self.test_rmse)
                mlflow.log_metric("mape_test", self.test_mape)

                mlflow.log_metric("r2_score_OOT", self.OOT_score)
                mlflow.log_metric("mse_OOT", self.OOT_mse)
                mlflow.log_metric("rmse_OOT", self.OOT_rmse)
                mlflow.log_metric("mape_OOT", self.OOT_mape)

                mlflow.sklearn.log_model(self.model, "model")
                
                return self 

        def RandomForestRegressor(self, train_size = 0.8, random_state = 0, plot = True, **kwargs):
            session = mlflow.start_run()
            
            with session : 
                
                self.train_size = train_size       
                self.random_state = random_state
                
                
                self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, train_size = train_size, random_state = random_state)
                
                self.x_test1 = self.x_test.drop(self.datetime_column, axis = 1)
                self.x_train1 = self.x_train.drop(self.datetime_column, axis = 1) 
                
                ## Building a model
                self.model_type = 'RandomForestRegressor'       
                self.model =  RandomForestRegressor(**kwargs)
                self.model.fit(self.x_train1,self.y_train)
                
                #Plotting feature importance
                feat_importances = pd.Series(self.model.feature_importances_, index=self.x_train1.columns)
                feat_importances.nlargest(20).plot(kind='barh', figsize = (30, 30))
                plt.show()
                
                ## On Training Data:
                y_pred_train = self.model.predict(self.x_train1)
                self.train_score, self.train_mse, self.train_rmse, self.train_mape = self.evaluate(self.y_train, y_pred_train, 'Train')
                self.result_train = pd.DataFrame({'Actual': self.y_train, 'Predictions': y_pred_train, 'Date_time': self.x_train[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                ## On Testing Data:
                y_pred_test = self.model.predict(self.x_test1)
                self.test_score, self.test_mse, self.test_rmse, self.test_mape = self.evaluate(self.y_test, y_pred_test, 'Test')
                self.result_test = pd.DataFrame({'Actual': self.y_test, 'Predictions': y_pred_test, 'Date_time': self.x_test[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

                ## On OOT data: 
                y_pred_OOT = self.model.predict(self.X_OOT.drop(self.datetime_column, axis = 1))
                self.OOT_score, self.OOT_mse, self.OOT_rmse, self.OOT_mape = self.evaluate(self.y_OOT, y_pred_OOT, 'OOT')
                self.result_OOT = pd.DataFrame({'Actual': self.y_OOT, 'Predictions': y_pred_OOT, 'Date_time': self.X_OOT[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                if plot:
                    self.result_train.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_test.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_OOT.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    plt.show()

                # Log parameters and metrics to MLflow
                mlflow.log_param("train_size", train_size)
                mlflow.log_param("random_state", random_state)
                mlflow.log_param("model_type", "Random Forest Regressor")
                mlflow.log_params(kwargs)

                mlflow.log_metric("r2_score_train", self.train_score)
                mlflow.log_metric("mse_train", self.train_mse)
                mlflow.log_metric("rmse_train", self.train_rmse)
                mlflow.log_metric("mape_train", self.train_mape)

                mlflow.log_metric("r2_score_test", self.test_score)
                mlflow.log_metric("mse_test", self.test_mse)
                mlflow.log_metric("rmse_test", self.test_rmse)
                mlflow.log_metric("mape_test", self.test_mape)

                mlflow.log_metric("r2_score_OOT", self.OOT_score)
                mlflow.log_metric("mse_OOT", self.OOT_mse)
                mlflow.log_metric("rmse_OOT", self.OOT_rmse)
                mlflow.log_metric("mape_OOT", self.OOT_mape)

                mlflow.sklearn.log_model(self.model, "model")
                
                return self  
 
        def XGBRegressor(self, train_size = 0.8, random_state = 0, plot = True, **kwargs):
            session = mlflow.start_run()
            
            with session : 
                
                self.train_size = train_size       
                self.random_state = random_state
                
                
                self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, train_size = train_size, random_state = random_state)
                
                self.x_test1 = self.x_test.drop(self.datetime_column, axis = 1)
                self.x_train1 = self.x_train.drop(self.datetime_column, axis = 1) 
                
                ## Building a model
                self.model_type = 'XGBRegressor'       
                self.model =  XGBRegressor(**kwargs)
                self.model.fit(self.x_train1,self.y_train)
                
                #Plotting feature importance
                feat_importances = pd.Series(self.model.feature_importances_, index=self.x_train1.columns)
                feat_importances.nlargest(20).plot(kind='barh', figsize = (30, 30))
                plt.show()
                
                ## On Training Data:
                y_pred_train = self.model.predict(self.x_train1)
                self.train_score, self.train_mse, self.train_rmse, self.train_mape = self.evaluate(self.y_train, y_pred_train, 'Train')
                self.result_train = pd.DataFrame({'Actual': self.y_train, 'Predictions': y_pred_train, 'Date_time': self.x_train[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                ## On Testing Data:
                y_pred_test = self.model.predict(self.x_test1)
                self.test_score, self.test_mse, self.test_rmse, self.test_mape = self.evaluate(self.y_test, y_pred_test, 'Test')
                self.result_test = pd.DataFrame({'Actual': self.y_test, 'Predictions': y_pred_test, 'Date_time': self.x_test[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)

                ## On OOT data: 
                y_pred_OOT = self.model.predict(self.X_OOT.drop(self.datetime_column, axis = 1))
                self.OOT_score, self.OOT_mse, self.OOT_rmse, self.OOT_mape = self.evaluate(self.y_OOT, y_pred_OOT, 'OOT')
                self.result_OOT = pd.DataFrame({'Actual': self.y_OOT, 'Predictions': y_pred_OOT, 'Date_time': self.X_OOT[self.datetime_column]}).sort_values(by = 'Date_time', ascending=True).reset_index(drop = True)


                if plot:
                    self.result_train.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_test.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    self.result_OOT.plot(y=['Actual', 'Predictions'], x='Date_time', figsize = (30, 5))
                    plt.show()

                # Log parameters and metrics to MLflow
                mlflow.log_param("train_size", train_size)
                mlflow.log_param("random_state", random_state)
                mlflow.log_param("model_type", "XGBRegressor")
                mlflow.log_params(kwargs)

                mlflow.log_metric("r2_score_train", self.train_score)
                mlflow.log_metric("mse_train", self.train_mse)
                mlflow.log_metric("rmse_train", self.train_rmse)
                mlflow.log_metric("mape_train", self.train_mape)

                mlflow.log_metric("r2_score_test", self.test_score)
                mlflow.log_metric("mse_test", self.test_mse)
                mlflow.log_metric("rmse_test", self.test_rmse)
                mlflow.log_metric("mape_test", self.test_mape)

                mlflow.log_metric("r2_score_OOT", self.OOT_score)
                mlflow.log_metric("mse_OOT", self.OOT_mse)
                mlflow.log_metric("rmse_OOT", self.OOT_rmse)
                mlflow.log_metric("mape_OOT", self.OOT_mape)

                mlflow.sklearn.log_model(self.model, "model")
                
                return self  
            
        def PolynomialRegression(self, degree = 2, train_size = 0.8, random_state = 0, plot = True, **kwargs):
            from sklearn.preprocessing import PolynomialFeatures  
            from sklearn.model_selection import train_test_split  
            from sklearn.linear_model import LinearRegression 
            from sklearn.metrics import r2_score  
            from sklearn.metrics import mean_squared_error     
            from sklearn.metrics import mean_absolute_percentage_error 
            with mlflow.start_run():
                self.train_size = train_size    
                self.random_state = random_state
                self.model_type = 'PolynomialRegression' 
                x_train, x_test, y_train, y_test = train_test_split(self.X, self.y, train_size = train_size, random_state = random_state)
                self.x_train = x_train    
                self.x_test = x_test 
                self.y_train = y_train   
                self.y_test = y_test   
                print('Degree=',degree)
                poly = PolynomialFeatures(degree = degree, include_bias = False)
                x_train1 = poly.fit_transform(x_train)
                self.model = LinearRegression()
                self.model.fit(x_train1, y_train)
                x_test1 = poly.fit_transform(x_test)
                sc = self.model.score(x_test1, y_test)
                y_prediction = self.model.predict(x_test1)
                score=r2_score(y_test,y_prediction)
                print('r2 socre is ',score)
                print("Model Score: ", sc)
                # Log parameters and metrics to MLflow
                mlflow.log_param("train_size", train_size)
                mlflow.log_param("random_state", random_state)
                mlflow.log_param("model_type", "LinearRegression")
                mlflow.log_params(kwargs)
                mlflow.log_metric("r2_score", score)
                mlflow.log_metric("mse", self.mse)
                mlflow.log_metric("rmse", self.rmse)
                mlflow.log_metric("mape", self.mape)
                mlflow.sklearn.log_model(self.model, "model")
            return self 
        def tune_hyperparameters_XGBRegressor(self, n_estimators_range, max_depth_range, reg_lambda_range, eta_range, alpha_range, scoring = 'neg_mean_absolute_error', n_trials = 100, cv=5, plot=True):
            def objective(trial):
                n_estimators = trial.suggest_int('n_estimators', n_estimators_range[0], n_estimators_range[1])
                max_depth = trial.suggest_int('max_depth', max_depth_range[0], max_depth_range[1])
                reg_lambda = trial.suggest_loguniform('reg_lambda', reg_lambda_range[0], reg_lambda_range[1])
                eta = trial.suggest_float("eta", eta_range[0], eta_range[1])
                alpha = trial.suggest_float("alpha", alpha_range[0], alpha_range[1])
                regressor = XGBRegressor(n_estimators = n_estimators, eta= eta, max_depth = max_depth, alpha = alpha, reg_lambda= reg_lambda)
                return np.absolute(sklearn.model_selection.cross_val_score(regressor, self.X, self.y, scoring=scoring, n_jobs=-1, cv=cv)).mean()
            study = optuna.create_study(direction='minimize')
            study.optimize(objective, n_trials=n_trials)
            trial = study.best_trial        
            lg.info('Accuracy: {}'.format(trial.value))
            lg.info("Best hyperparameters: {}".format(trial.params))
            optuna.visualization.plot_optimization_history(study)
            self.model = XGBRegressor(n_estimators= trial.params['n_estimators'], eta= trial.params['eta'], max_depth= trial.params['max_depth'], alpha= trial.params['alpha'])
            # Cross Validating the model        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            scores = cross_val_score(self.model, self.X, self.y, scoring=scoring, cv=cv, n_jobs=-1)
            scores = absolute(scores)
            print('Displaying Cross Validation Scores.')
            print('mean_MAE {}'.format(scores.mean()))
            print('std_MAE: {}'.format(scores.std()))
            self.model.fit(self.x_train, self.y_train)
            y_prediction =  self.model.predict(self.x_test)
            if plot:
                result = pd.DataFrame({'Actual': self.y_test,
                                    'Predictions': y_prediction})
                result = result.reset_index(drop = True)
                result.reset_index().plot(y=['Actual', 'Predictions'], figsize = (30, 5))
                plt.show()
            score=r2_score(self.y_test,y_prediction)
            print('r2 socre is ',score)
            self.r2_score = score       
            print('mean_sqrd_error is==',mean_squared_error(self.y_test,y_prediction))
            self.mse = mean_squared_error(self.y_test,y_prediction)
            print('root_mean_squared error of is==',np.sqrt(mean_squared_error(self.y_test,y_prediction)))
            self.rmse = np.sqrt(mean_squared_error(self.y_test,y_prediction))
            print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(self.y_test,y_prediction))
            self.mape = mean_absolute_percentage_error(self.y_test,y_prediction)
            return self    
        def tune_hyperparameters_RFRegressor(self, n_estimators_range, max_depth_range, scoring = 'neg_mean_absolute_error', n_trials = 100, cv= 3,plot=True):

            def objective(trial):
                n_estimators = trial.suggest_int('n_estimators', n_estimators_range[0], n_estimators_range[1])
                max_depth = int(trial.suggest_loguniform('max_depth', max_depth_range[0], max_depth_range[1]))
                regressor = RandomForestRegressor(n_estimators = n_estimators,  max_depth = max_depth)
                return np.absolute(sklearn.model_selection.cross_val_score(regressor, self.X, self.y, scoring=scoring, n_jobs=-1, cv=cv)).mean()
            # OPTUNA APPLICATION     
            study = optuna.create_study(direction='minimize')
            study.optimize(objective, n_trials=100)
            trial = study.best_trial   
            lg.info('\t\tAccuracy: {}'.format(trial.value))
            lg.info("\t\tBest hyperparameters: {}".format(trial.params))
            optuna.visualization.plot_optimization_history(study)
            #Random Forest    
            self.model = RandomForestRegressor(n_estimators=trial.params['n_estimators'], max_depth= trial.params['max_depth'])
            # Cross Validating the model   
            cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            scores = cross_val_score(self.model, self.X, self.y, scoring=scoring, cv=cv, n_jobs=-1)
            scores = absolute(scores)
            print('Displaying Cross Validation Scores.')
            print('mean_MAE {}'.format(scores.mean()))
            print('std_MAE: {}'.format(scores.std()))
            self.model.fit(self.x_train, self.y_train)
            y_prediction =  self.model.predict(self.x_test)
            if plot:
                result = pd.DataFrame({'Actual': self.y_test,
                                    'Predictions': y_prediction})
                result.reset_index().plot(x = result.index, y=['Actual', 'Predictions'], figsize = (30, 5))
                plt.show()
            score=r2_score(self.y_test,y_prediction)
            print('r2 socre is ',score)
            self.r2_score = score       
            print('mean_sqrd_error is==',mean_squared_error(self.y_test,y_prediction))
            self.mse = mean_squared_error(self.y_test,y_prediction)
            print('root_mean_squared error of is==',np.sqrt(mean_squared_error(self.y_test,y_prediction)))
            self.rmse = np.sqrt(mean_squared_error(self.y_test,y_prediction))
            print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(self.y_test,y_prediction))
            self.mape = mean_absolute_percentage_error(self.y_test,y_prediction)
            return self   
        def tune_hyperparameters_LinReg(self, ridge_alphas, ):
            
            #Applying Ridge Regression   
            model1 =  RidgeCV(alphas=ridge_alphas).fit(self.X, self.y)
            score1 = model1.score(self.X, self.y)


        def apply_shap(self, feature_col_names, target_col_name):

            data = pd.concat([self.X, self.y], axis = 1)
            y, X = dmatrices( "{}~ {} -1".format(target_col_name, " + ".join(feature_col_names)),
                        data=data)
            X_frame = pd.DataFrame(data=X, columns=X.design_info.column_names)

            explainer = shap.Explainer(self.model)
            shap_values = explainer(X_frame)
            shap.plots.waterfall(shap_values[0])
            plt.show()