In [1]:
from ML.regression.utils.metrics.RegressionMetrics import RegressionMetrics
from ML.regression.utils.data_preprocessing.DataFramePreprocessing import DataFramePreprocessing

In [2]:
from sklearn.datasets import fetch_california_housing
# Split
from sklearn.model_selection import train_test_split
# Model
import xgboost as xgb

from sklearn.pipeline import Pipeline

## Config

In [3]:
class CFG:
    #Random seed
    SEED = 42

## train test validate split

In [4]:
X, y = fetch_california_housing(as_frame=True,return_X_y =True)

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20,
                                                    random_state=CFG.SEED,
                                                    shuffle = True,
                                                   )

In [6]:
X_train.shape, y_train.shape,  X_val.shape, y_val.shape

((16512, 8), (16512,), (4128, 8), (4128,))

In [7]:
import pandas as pd
from sklearn.model_selection import KFold

class Splitter:
    """
    A class used to split data into train/test sets.
    """
    def __init__(self, X, y, n_splits=5, random_state=42, shuffle=True):
        """
        The constructor for Splitter class.

        Parameters:
           X (DataFrame): Features
           y (Series/DataFrame): Target variable
           n_splits (int): Number of folds. Must be at least 2.
           random_state (int): Random state for reproducibility.
           shuffle (bool): Whether to shuffle the data before splitting into batches.
        """
        self.X = X
        self.y = y
        self.n_splits = n_splits
        self.random_state = random_state
        self.shuffle = shuffle
        self.kfold = KFold(n_splits=self.n_splits, random_state=self.random_state, shuffle=self.shuffle)
        
    def split_data(self):
        X_train, X_test, y_train, y_test = [], [], [], []
        
        for train_index, test_index in self.kfold.split(self.X):
            X_train_fold, X_test_fold = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train_fold, y_test_fold = self.y.iloc[train_index], self.y.iloc[test_index]
            
            X_train.append(X_train_fold)
            X_test.append(X_test_fold)
            y_train.append(y_train_fold)
            y_test.append(y_test_fold)
        
        return  X_train, X_test, y_train, y_test

In [8]:
def modeling(X_train_fold,y_train_fold, model):
    models = []
    for fold, data in enumerate(zip(X_train_fold, y_train_fold)):
        X_train_fold, y_train_fold = data
    
        # Train your model on the training data
        pipeline.fit(X_train_fold, y_train_fold)
        models.append(pipeline)
    return models


In [9]:
model = xgb.XGBRegressor()

preprocessing = DataFramePreprocessing()

steps = [('preprocessing', preprocessing)]# Create a list of tuples with the steps
steps.append(('model', model))
pipeline = Pipeline(steps=steps)

In [10]:
splitter = Splitter(X_train, y_train,n_splits=3, random_state=42)
X_train_fold, X_test_fold, y_train_fold, y_test_fold = splitter.split_data()

In [11]:
models = modeling(X_train_fold,y_train_fold, pipeline)

In [12]:
for fold, model in enumerate(models):
    print(f'Fold {fold}:', end=' ')
    
    metrics = RegressionMetrics(model, X_train_fold[fold], y_train_fold[fold], X_val, y_val, style = True)
    output = metrics.run()
        
    display(output)
    

Fold 0: 

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.88721
Val R-squared,0.817397
MAE,0.326664
MSE,0.239285
RMSE,0.489167
MAPE,0.188783
AIC,23.5113
BIC,19.5113
Std Deviation,1.058499
Mean,2.061726


Fold 1: 

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.888519
Val R-squared,0.817397
MAE,0.326664
MSE,0.239285
RMSE,0.489167
MAPE,0.188783
AIC,23.5113
BIC,19.5113
Std Deviation,1.058499
Mean,2.061726


Fold 2: 

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.955586
Val R-squared,0.817397
MAE,0.326664
MSE,0.239285
RMSE,0.489167
MAPE,0.188783
AIC,23.5113
BIC,19.5113
Std Deviation,1.058499
Mean,2.061726
