In [1]:
cd /home/ml/Jupyter_root/123/git

/home/ml/Jupyter_root/123/git


In [2]:
# from ML.regression.utils.metrics.RegressionMetrics import RegressionMetrics
from ML.regression.utils.data_preprocessing.DataFramePreprocessing import DataFramePreprocessing

In [3]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

class RegressionMetrics:
    """
    A class for computing and printing regression evaluation metrics.

    Args:
        pipeline: The regression pipeline model.
        X_test: The test features.
        y_test: The test labels.
        X_val: The validation features.
        y_val: The validation labels.
        style: Flag indicating whether to apply styling to the output. Default is False.
    """

    def __init__(self, pipeline, X_test, y_test, X_val, y_val, style=False):
        self.pipeline = pipeline
        self.X_test = X_test
        self.y_test = y_test
        self.X_val = X_val
        self.y_val = y_val
        self.y_pred = None
        self.style = style

    def mean_absolute_error(self):
        """
        Calculates the mean absolute error (MAE).

        Returns:
            The mean absolute error value.
        """
        return mean_absolute_error(self.y_val, self.y_pred)

    def mean_absolute_percentage_error(self):
        """
        Calculates the mean absolute percentage error (MAPE).

        Returns:
            The mean absolute percentage error value.
        """
        return mean_absolute_percentage_error(self.y_val, self.y_pred)

    def mean_squared_error(self):
        """
        Calculates the mean squared error (MSE).

        Returns:
            The mean squared error value.
        """
        return mean_squared_error(self.y_val, self.y_pred)

    def mean_root_mean_squared_error(self):
        """
        Calculates the root mean squared error (RMSE).

        Returns:
            The root mean squared error value.
        """
        return mean_squared_error(self.y_val, self.y_pred, squared=False)

    def r2_test(self):
        """
        Calculates the R-squared score for the test set.

        Returns:
            The R-squared score for the test set.
        """
        return self.pipeline.score(self.X_test, self.y_test)

    def r2_val(self):
        """
        Calculates the R-squared score for the validation set.

        Returns:
            The R-squared score for the validation set.
        """
        return r2_score(self.y_val, self.y_pred)

    def aic(self):
        """
        Calculates the Akaike Information Criterion (AIC).

        Returns:
            The Akaike Information Criterion value.
        """
        y_pred = self.pipeline.predict(self.X_val)
        n_params = len(self.pipeline.named_steps)
        n = len(self.y_val)
        mse = mean_squared_error(self.y_val, y_pred)
        aic = 2 * n_params - 2 * np.log(mse) + n_params * np.log(n)
        return aic

    def bic(self):
        """
        Calculates the Bayesian Information Criterion (BIC).

        Returns:
            The Bayesian Information Criterion value.
        """
        y_pred = self.pipeline.predict(self.X_val)
        n_params = len(self.pipeline.named_steps)
        n = len(self.y_val)
        mse = mean_squared_error(self.y_val, y_pred)
        bic = -2 * np.log(mse) + n_params * np.log(n)
        return bic

    def std(self):
        """
        Calculates the standard deviation of the predicted values.

        Returns:
            The standard deviation value.
        """
        return self.y_pred.std()

    def mean(self):
        """
        Calculates the mean of the predicted values.

        Returns:
            The mean value.
        """
        return self.y_pred.mean()

    def predict(self):
        """
        Performs prediction on the validation set.
        """
        self.y_pred = self.pipeline.predict(self.X_val)

    def set_frame_style(self, df, caption=""):
        """
        Helper function to set dataframe presentation style.

        Args:
            df: The DataFrame to style.
            caption: The caption for the styled DataFrame. Default is an empty string.

        Returns:
            The styled DataFrame.
        """
        return df.style.background_gradient(
            cmap='coolwarm').set_caption(caption).set_table_styles([{
                'selector':
                'caption',
                'props': [('color', 'Blue'), ('font-size', '28px'),
                          ('font-weight', 'bold')]
            }])

    def run(self):
        """
        Runs the regression metrics calculation and printing.
        """
        try:
            self.predict()
            metrics = {
                "Test R-squared": self.r2_test(),
                "Val R-squared": self.r2_val(),
                "MAE": self.mean_absolute_error(),
                "MSE": self.mean_squared_error(),
                "RMSE": self.mean_root_mean_squared_error(),
                "MAPE": self.mean_absolute_percentage_error(),
                "AIC": self.aic(),
                "BIC": self.bic(),
                "Std Deviation": self.std(),
                "Mean": self.mean()
            }

            df_metrics = pd.DataFrame.from_dict(metrics,
                                                orient="index",
                                                columns=["Value"])
            df_metrics.index.name = "Metric"

            if self.style:
                return self.set_frame_style(df_metrics)
            else:
                return df_metrics
        except Exception as e:
            print("An error occurred:", str(e))

In [4]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

class RegressionMetrics:
    """
    A class for computing and printing regression evaluation metrics.

    Args:
        pipeline_or_model: The regression pipeline or model.
        X_test: The test features.
        y_test: The test labels.
        X_val: The validation features.
        y_val: The validation labels.
        style: Flag indicating whether to apply styling to the output. Default is False.
        is_model: Flag indicating whether the provided object is a regression model. Default is False.
    """

    def __init__(self, pipeline_or_model, X_test, y_test, X_val, y_val, style=False, is_model=False):
        if is_model:
            self.model = pipeline_or_model
        else:
            self.pipeline = pipeline_or_model
        self.X_test = X_test
        self.y_test = y_test
        self.X_val = X_val
        self.y_val = y_val
        self.y_pred = None
        self.style = style

    def mean_absolute_error(self):
        """
        Calculates the mean absolute error (MAE).

        Returns:
            The mean absolute error value.
        """
        return mean_absolute_error(self.y_val, self.y_pred)

    def mean_absolute_percentage_error(self):
        """
        Calculates the mean absolute percentage error (MAPE).

        Returns:
            The mean absolute percentage error value.
        """
        return mean_absolute_percentage_error(self.y_val, self.y_pred)

    def mean_squared_error(self):
        """
        Calculates the mean squared error (MSE).

        Returns:
            The mean squared error value.
        """
        return mean_squared_error(self.y_val, self.y_pred)

    def mean_root_mean_squared_error(self):
        """
        Calculates the root mean squared error (RMSE).

        Returns:
            The root mean squared error value.
        """
        return mean_squared_error(self.y_val, self.y_pred, squared=False)

    def r2_test(self):
        """
        Calculates the R-squared score for the test set.

        Returns:
            The R-squared score for the test set.
        """
        if hasattr(self, 'model'):
            return self.model.score(self.X_test, self.y_test)
        else:
            return self.pipeline.score(self.X_test, self.y_test)

    def r2_val(self):
        """
        Calculates the R-squared score for the validation set.

        Returns:
            The R-squared score for the validation set.
        """
        return r2_score(self.y_val, self.y_pred)

    def aic(self):
        """
        Calculates the Akaike Information Criterion (AIC).

        Returns:
            The Akaike Information Criterion value.
        """
        y_pred = self.predict()
        n_params = len(self.pipeline.named_steps) if hasattr(self, 'pipeline') else 0
        n = len(self.y_val)
        mse = mean_squared_error(self.y_val, y_pred)
        aic = 2 * n_params - 2 * np.log(mse) + n_params * np.log(n)
        return aic

    def bic(self):
        """
        Calculates the Bayesian Information Criterion (BIC).

        Returns:
            The Bayesian Information Criterion value.
        """
        y_pred = self.predict()
        n_params = len(self.pipeline.named_steps) if hasattr(self, 'pipeline') else 0
        n = len(self.y_val)
        mse = mean_squared_error(self.y_val, y_pred)
        bic = -2 * np.log(mse) + n_params * np.log(n)
        return bic

    def std(self):
        """
        Calculates the standard deviation of the predicted values.

        Returns:
            The standard deviation value.
        """
        return self.y_pred.std()

    def mean(self):
        """
        Calculates the mean of the predicted values.

        Returns:
            The mean value.
        """
        return self.y_pred.mean()

    def predict(self):
        """
        Performs prediction on the validation set.

        Returns:
            The predicted values.
        """
        if hasattr(self, 'model'):
            self.y_pred = self.model.predict(self.X_val)
        else:
            self.y_pred = self.pipeline.predict(self.X_val)
        return self.y_pred

    def set_frame_style(self, df, caption=""):
        """
        Helper function to set dataframe presentation style.

        Args:
            df: The DataFrame to style.
            caption: The caption for the styled DataFrame. Default is an empty string.

        Returns:
            The styled DataFrame.
        """
        return df.style.background_gradient(
            cmap='coolwarm').set_caption(caption).set_table_styles([{
                'selector':
                'caption',
                'props': [('color', 'Blue'), ('font-size', '28px'),
                          ('font-weight', 'bold')]
            }])

    def run(self):
        """
        Runs the regression metrics calculation and printing.

        Returns:
            The DataFrame containing the metric values.
        """
        try:
            self.predict()
            metrics = {
                "Test R-squared": self.r2_test(),
                "Val R-squared": self.r2_val(),
                "MAE": self.mean_absolute_error(),
                "MSE": self.mean_squared_error(),
                "RMSE": self.mean_root_mean_squared_error(),
                "MAPE": self.mean_absolute_percentage_error(),
                "AIC": self.aic(),
                "BIC": self.bic(),
                "Std Deviation": self.std(),
                "Mean": self.mean()
            }

            df_metrics = pd.DataFrame.from_dict(metrics,
                                                orient="index",
                                                columns=["Value"])
            df_metrics.index.name = "Metric"

            if self.style:
                return self.set_frame_style(df_metrics)
            else:
                return df_metrics
        except Exception as e:
            print("An error occurred:", str(e))


In [5]:
from sklearn.datasets import fetch_california_housing
# Split
from sklearn.model_selection import train_test_split
# Model
import xgboost as xgb

from sklearn.pipeline import Pipeline

## Config

In [6]:
class CFG:
    #Random seed
    SEED = 42

## train test validate split

In [7]:
X, y = fetch_california_housing(as_frame=True,return_X_y =True)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20,
                                                    random_state=CFG.SEED,
                                                    shuffle = True,
                                                   )

In [9]:
X_train.shape, y_train.shape,  X_val.shape, y_val.shape

((16512, 8), (16512,), (4128, 8), (4128,))

In [10]:
import pandas as pd
from sklearn.model_selection import KFold

class Splitter:
    """
    A class used to split data into train/test sets.
    """
    def __init__(self, X, y, n_splits=5, random_state=42, shuffle=True):
        """
        The constructor for Splitter class.

        Parameters:
           X (DataFrame): Features
           y (Series/DataFrame): Target variable
           n_splits (int): Number of folds. Must be at least 2.
           random_state (int): Random state for reproducibility.
           shuffle (bool): Whether to shuffle the data before splitting into batches.
        """
        self.X = X
        self.y = y
        self.n_splits = n_splits
        self.random_state = random_state
        self.shuffle = shuffle
        self.kfold = KFold(n_splits=self.n_splits, random_state=self.random_state, shuffle=self.shuffle)
        
    def split_data(self):
        X_train, X_test, y_train, y_test = [], [], [], []
        
        for train_index, test_index in self.kfold.split(self.X):
            X_train_fold, X_test_fold = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train_fold, y_test_fold = self.y.iloc[train_index], self.y.iloc[test_index]
            
            X_train.append(X_train_fold)
            X_test.append(X_test_fold)
            y_train.append(y_train_fold)
            y_test.append(y_test_fold)
        
        return  X_train, X_test, y_train, y_test

In [11]:
def modeling(X_train_fold,y_train_fold, model):
    models = []
    for fold, data in enumerate(zip(X_train_fold, y_train_fold)):
        X_train_fold, y_train_fold = data
    
        # Train your model on the training data
        pipeline.fit(X_train_fold, y_train_fold)
        models.append(pipeline)
    return models


In [12]:
model = xgb.XGBRegressor()

preprocessing = DataFramePreprocessing()

steps = [('preprocessing', preprocessing)]# Create a list of tuples with the steps
steps.append(('model', model))
pipeline = Pipeline(steps=steps)

In [13]:
splitter = Splitter(X_train, y_train,n_splits=3, random_state=42)
X_train_fold, X_test_fold, y_train_fold, y_test_fold = splitter.split_data()

In [14]:
models = modeling(X_train_fold,y_train_fold, pipeline)

In [15]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
import xgboost as xgb

class ModelingPipeline:
    """
    A class to preprocess, train and evaluate a model on different folds of the data.
    """
    def __init__(self, X_train, y_train, model, preprocessing, n_splits=3, random_state=42):
        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        self.preprocessing = preprocessing
        self.n_splits = n_splits
        self.random_state = random_state
        self.splitter = KFold(n_splits=self.n_splits, random_state=self.random_state, shuffle=True)
        self.models = []
        self.steps = [('preprocessing', self.preprocessing), ('model', self.model)]
        self.pipeline = Pipeline(steps=self.steps)

    def fit_and_evaluate(self, X_val, y_val):
        for train_index, test_index in self.splitter.split(self.X_train):
            X_train_fold, y_train_fold = self.X_train.iloc[train_index], self.y_train.iloc[train_index]
            
            # Train your model on the training data
            self.pipeline.fit(X_train_fold, y_train_fold)
            self.models.append(self.pipeline)
            
            # Evaluate the model
            print(f'Fold {len(self.models)}:', end=' ')
            metrics = RegressionMetrics(self.pipeline, X_train_fold, y_train_fold, X_val, y_val, style=True)
            output = metrics.run()
            display(output)

        return self.models


In [61]:
model = xgb.XGBRegressor()
preprocessing = DataFramePreprocessing()

pipeline = ModelingPipeline(X_train, y_train, model, preprocessing, n_splits=2, random_state=42)
models = pipeline.fit_and_evaluate(X_val, y_val)



Fold 1: 

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.963452
Val R-squared,0.809246
MAE,0.336536
MSE,0.249966
RMSE,0.499966
MAPE,0.196848
AIC,23.42396
BIC,19.42396
Std Deviation,1.047575
Mean,2.0625


Fold 2: 

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.963209
Val R-squared,0.805525
MAE,0.33384
MSE,0.254841
RMSE,0.504818
MAPE,0.193445
AIC,23.385325
BIC,19.385325
Std Deviation,1.062548
Mean,2.068632


In [46]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [59]:
# Combine all the models into a single list
all_models = [(f'model_{i}', model) for i, model in enumerate(models_1 + models_2 + models_3)]

# Create the voting regressor
voting_regressor = VotingRegressor(estimators=all_models,)

# Fit the voting regressor to the data
voting_regressor.fit(X_train, y_train)

# Use the voting regressor for prediction
y_pred = voting_regressor.predict(X_val)

In [60]:
print("mae   ",mean_absolute_error(y_val, y_pred))
print("mape: ",mean_absolute_percentage_error(y_val, y_pred))
print("rmse: ",mean_squared_error(y_val, y_pred,squared=False))
print("r2:   ",r2_score(y_val, y_pred))
print('y_pred std', y_pred.std())

mae    0.31130193403396494
mape:  0.18034528238011363
rmse:  0.47390178084380813
r2:    0.8286161789059061
y_pred std 1.0591562


In [48]:
metrics = RegressionMetrics(voting_regressor, X_train, y_train, X_val, y_val, style = True,is_model=True)
metrics.run()

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.943187
Val R-squared,0.828616
MAE,0.311302
MSE,0.224583
RMSE,0.473902
MAPE,0.180345
AIC,2.987021
BIC,2.987021
Std Deviation,1.059156
Mean,2.064616


In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
models_1.extend(models_2)

In [None]:
models_1.extend(models_3)

In [None]:
VotingRegressor(models_1)

In [None]:
def modeling(X_train_fold,y_train_fold, model):
    models = []
    for fold, data in enumerate(zip(X_train_fold, y_train_fold)):
        X_train_fold, y_train_fold = data
    
        # Train your model on the training data
        pipeline.fit(X_train_fold, y_train_fold)
        models.append(pipeline)
    return models
model = xgb.XGBRegressor()

preprocessing = DataFramePreprocessing()

steps = [('preprocessing', preprocessing)]# Create a list of tuples with the steps
steps.append(('model', model))
pipeline = Pipeline(steps=steps)
splitter = Splitter(X_train, y_train,n_splits=3, random_state=42)
X_train_fold, X_test_fold, y_train_fold, y_test_fold = splitter.split_data()
for fold, model in enumerate(models):
    print(f'Fold {fold}:', end=' ')
    
    metrics = RegressionMetrics(model, X_train_fold[fold], y_train_fold[fold], X_val, y_val, style = True)
    output = metrics.run()
        
    display(output)

In [None]:
for fold, model in enumerate(models):
    print(f'Fold {fold}:', end=' ')
    
    metrics = RegressionMetrics(model, X_train_fold[fold], y_train_fold[fold], X_val, y_val, style = True)
    output = metrics.run()
        
    display(output)
    