# Example

## Libs

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper

class DataFramePreprocessing(TransformerMixin, BaseEstimator):
    '''
    Impute and encode/scale the columns of a pandas DataFrame according to their dtype.

    Columns are grouped by dtype into numerical, categorical (object) and
    boolean features. Each group gets its own per-column transformation list,
    and all of them are combined in a single DataFrameMapper created with
    df_out=True. Columns whose dtype falls in none of the three groups are not
    given to the mapper.

    Attributes set by `fit`:
    - X (pandas.DataFrame): Copy of the DataFrame passed to `fit`.
    - numerical_features / categorical_features / boolean_features: Column
      labels of each dtype group.
    - mapper_ (DataFrameMapper): The fitted mapper used by `transform`.
    - n_features_in_ / n_features_ (int): Number of columns seen in `fit`.
    - is_fitted_ (bool): Marker checked by `transform`.
    '''

    def __init__(self):
        '''
        Constructor for DataFramePreprocessing class.

        Takes no parameters; the attributes initialised here are placeholders
        that `fit` overwrites.
        '''
        self.X = None
        self.numerical_features = []
        self.categorical_features = []
        self.boolean_features = []

    def get_features(self):
        '''
        Extracts the numeric, categorical, and boolean column labels from `self.X`.

        `self.X` must already hold a DataFrame (it is set by `fit`).

        Returns:
        - None: The results are stored on `numerical_features`,
          `categorical_features` and `boolean_features`.
        '''
        self.numerical_features = self.X.select_dtypes(include=['int16', 'float16', 'int32', 'float32', 'int64', 'float64']).columns
        self.categorical_features = self.X.select_dtypes(include=['object']).columns
        self.boolean_features = self.X.select_dtypes(include=['bool']).columns

    def categorical_transformer(self):
        '''
        Creates a list of tuples specifying the transformations for categorical features.

        Each column is imputed with its most frequent value and then ordinal
        encoded; categories unseen during fit are encoded as -999.

        Returns:
        - list: List of tuples, where each tuple contains a feature name and a transformation pipeline.
        '''
        return [([feature], [SimpleImputer(strategy='most_frequent'), OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)]) for feature in self.categorical_features]

    def numerical_transformer(self):
        '''
        Creates a list of tuples specifying the transformations for numerical features.

        Each column is imputed with its most frequent value and then standardised.

        Returns:
        - list: List of tuples, where each tuple contains a feature name and a transformation pipeline.
        '''
        return [([feature], [SimpleImputer(strategy='most_frequent'), StandardScaler()]) for feature in self.numerical_features]

    def boolean_transformer(self):
        '''
        Creates a list of tuples specifying the transformations for boolean features.

        Uses the same imputer/encoder pair as the categorical features.

        Returns:
        - list: List of tuples, where each tuple contains a feature name and a transformation pipeline.
        '''
        # NOTE(review): SimpleImputer may reject bool-dtype input in some
        # scikit-learn versions - confirm with a DataFrame that has bool columns.
        return [([feature], [SimpleImputer(strategy='most_frequent'), OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)]) for feature in self.boolean_features]

    def mapper(self):
        '''
        Creates a DataFrameMapper object that combines all the feature transformations.

        A new, unfitted mapper (with new transformer instances) is built on
        every call, ordered numerical, categorical, then boolean features.

        Returns:
        - DataFrameMapper: DataFrameMapper object that applies the specified transformations to the input data.
        '''
        return DataFrameMapper(self.numerical_transformer() + self.categorical_transformer() + self.boolean_transformer(), df_out=True)

    def fit(self, X, y=None):
        '''
        Fits the DataFramePreprocessing transformer on the input data.

        Parameters:
        - X (pandas.DataFrame): Input data to fit the transformer on.
        - y (array-like or None): Target values, forwarded to the mapper's fit.

        Returns:
        - self: Returns the instance itself.
        '''
        X = X.copy()
        self.X = X
        self.get_features()

        # Keep the fitted mapper: `transform` needs the learned imputation
        # values, category codes and scaling statistics.
        self.mapper_ = self.mapper()
        self.mapper_.fit(X, y)

        # No check_array here: it would reject the NaN and object columns that
        # the imputers and encoders are there to handle.
        self.n_features_in_ = X.shape[1]
        self.n_features_ = X.shape[1]
        self.is_fitted_ = True

        return self

    def transform(self, X):
        '''
        Transforms the input data using the fitted DataFramePreprocessing transformer.

        Parameters:
        - X (pandas.DataFrame): Input data to transform; must have the same
          number of columns as the data seen in `fit`.

        Returns:
        - pandas.DataFrame: Output of the fitted mapper.

        Raises:
        - ValueError: If the number of columns differs from what was seen in `fit`.
        '''
        check_is_fitted(self, ['is_fitted_'])

        if X.shape[1] != self.n_features_:
            raise ValueError('Shape of input is different from what was seen in `fit`')

        # Work on a copy so the caller's DataFrame is never modified.
        return self.mapper_.transform(X.copy())

In [2]:
from ML.regression.utils.metrics.RegressionMetrics import RegressionMetrics

In [3]:
from sklearn.datasets import fetch_california_housing
# Split
from sklearn.model_selection import train_test_split
# Model
import xgboost as xgb

from sklearn.pipeline import Pipeline

## Config

In [4]:
class CFG:
    '''Notebook-wide configuration constants.'''

    # Seed passed as random_state so the data splits are reproducible.
    SEED = 42

## Train / test / validation split

In [5]:
# Load the California housing data as pandas objects: X holds the features, y the target.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

In [6]:
# Hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=CFG.SEED,
)
# Split the held-out 20% in half: one half stays the test set, the other
# becomes the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.50,
    shuffle=True,
    random_state=CFG.SEED,
)

## Shape

In [7]:
# Display the shape of every split, in the order train / test / validation pairs.
tuple(part.shape for part in (X_train, X_test, y_train, y_test, X_val, y_val))

((16512, 8), (2064, 8), (16512,), (2064,), (2064, 8), (2064,))

In [8]:
# XGBoost regressor constructed with no arguments, i.e. the library defaults.
model = xgb.XGBRegressor()

In [9]:
# Preprocessing transformer; it takes no constructor arguments.
preprocessing = DataFramePreprocessing()

In [10]:
# Pipeline steps as (name, estimator) tuples: preprocessing first, then the model.
steps = [
    ('preprocessing', preprocessing),
    ('model', model),
]
pipeline = Pipeline(steps=steps)

## Modeling

In [11]:
# Fit the pipeline steps in order (preprocessing, then model) on the training split.
pipeline.fit(X_train, y_train)

## Metrics

In [12]:
# Build the metrics report from the fitted pipeline and the test/validation
# splits, then run it; the resulting table is shown as the cell output.
metrics = RegressionMetrics(pipeline, X_test, y_test, X_val, y_val)
metrics.run()

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.84035
Val R-squared,0.816672
MAE,0.317786
MSE,0.238944
RMSE,0.488819
MAPE,0.182462
AIC,22.127852
BIC,18.127852
Std Deviation,1.058918
Mean,2.08217


# With style


In [13]:
# Same report with style=True (presumably a styled rendering of the table;
# confirm in RegressionMetrics).
metrics = RegressionMetrics(pipeline, X_test, y_test, X_val, y_val, style = True)
metrics.run()

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Test R-squared,0.84035
Val R-squared,0.816672
MAE,0.317786
MSE,0.238944
RMSE,0.488819
MAPE,0.182462
AIC,22.127852
BIC,18.127852
Std Deviation,1.058918
Mean,2.08217
