In [None]:
# to handle datasets
import numpy as np
import pandas as pd

# base or mother object export from sickit-learn
# for gettting and setting variables compatible to sickit-learn variables
from sklearn.base import BaseEstimator

# for fit_transform object compatible to sickit-learn
# you need to write fit and transform method in child object contruction
from sklearn.base import TransformerMixin

In [None]:
# Extraction of the letter and keep it only in the variable Cabin

class VariableCabinTransform(BaseEstimator, TransformerMixin):
    """Reduce string columns to their first character.

    Every column named in ``variable_list`` is overwritten with the first
    character of each of its values (a value such as ``"C85"`` becomes
    ``"C"``).

    Parameters
    ----------
    variable_list : list
        Names of the columns to shorten.

    Raises
    ------
    ValueError
        If ``variable_list`` is not a list.
    """

    def __init__(self, variable_list):
        if isinstance(variable_list, list):
            self.variable_list = variable_list
        else:
            raise ValueError('variable_list should be a list')

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns hold one character.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError('X should be a dataframe')

        # Work on a copy so the caller's dataframe is left untouched.
        shortened = X.copy()
        for column in self.variable_list:
            shortened[column] = shortened[column].str.get(0)
        return shortened

In [None]:
# Create new features that capture information about presence or abscence of Outliers in data set

class OutliersFeatureCreation(BaseEstimator, TransformerMixin):
    """Add a 0/1 outlier-indicator column for each selected numeric column.

    For every column ``c`` in ``outliers_num_vars_list`` a new column
    ``c + '_outliers'`` is created. A value is marked ``1`` when it lies
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` and ``0``
    otherwise.

    Parameters
    ----------
    outliers_num_vars_list : list
        Names of the numeric columns to inspect.

    Raises
    ------
    ValueError
        If ``outliers_num_vars_list`` is not a list.

    Notes
    -----
    ``fit`` learns nothing: the quartiles are recomputed from whichever
    dataframe is passed to ``transform``.
    """

    def __init__(self, outliers_num_vars_list):

        if not isinstance(outliers_num_vars_list, list):
            raise ValueError('outliers_num_vars should be a list')

        self.outliers_num_vars_list = outliers_num_vars_list

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present for sklearn pipeline support."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` extended with the outlier-indicator columns.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError('X should be a dataframe')

        # so that we do not over-write the original dataframe
        X = X.copy()

        for var in self.outliers_num_vars_list:
            column = X[var]

            # Series.quantile skips missing values, whereas np.percentile
            # returns NaN as soon as one value is missing, which would turn
            # both bounds into NaN and flag nothing at all.
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            # Comparisons against NaN are False, so missing values get 0.
            outliers = (column < lower_bound) | (column > upper_bound)

            # add outliers indicator for each column
            X[var + '_outliers'] = np.where(outliers, 1, 0)

        return X

In [None]:
# Temporal elapsed time transformer

class TemporalVariableTransformer(BaseEstimator, TransformerMixin):
    """Replace columns by their difference from a reference column.

    Each column listed in ``variables`` is overwritten with
    ``reference_variable - column``.

    Parameters
    ----------
    variables : list
        Names of the columns to replace.
    reference_variable
        Name of the column the listed columns are subtracted from.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables, reference_variable):
        if isinstance(variables, list):
            self.variables = variables
            self.reference_variable = reference_variable
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present for sklearn pipeline support."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns replaced."""
        # Operate on a copy to leave the caller's dataframe intact.
        elapsed = X.copy()

        for column in self.variables:
            # The reference is re-read on every pass, so it reflects any
            # overwrite made by an earlier iteration.
            reference = elapsed[self.reference_variable]
            elapsed[column] = reference - elapsed[column]

        return elapsed

In [None]:
# for mapping categorical variable like quality, ... : when we have mapping dictionnary

class Mapper(BaseEstimator, TransformerMixin):
    """Recode columns through one shared mapping.

    Parameters
    ----------
    variables : list
        Names of the columns to recode.
    mappings
        Mapping handed unchanged to ``Series.map`` for every listed column.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables, mappings):
        if isinstance(variables, list):
            self.variables = variables
            self.mappings = mappings
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present for sklearn pipeline support."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns recoded."""
        recoded = X.copy()
        lookup = self.mappings
        for column in self.variables:
            recoded[column] = recoded[column].map(lookup)
        return recoded

In [None]:
# for imputation of missing numerical variables (replaced by the mean or mediane or ...)

class MeanImputer(BaseEstimator, TransformerMixin):
    """Numerical missing value imputer.

    Learns the mean of each listed column during ``fit`` and uses it to
    fill missing values during ``transform``.

    Parameters
    ----------
    variables : list
        Names of the columns to impute.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')
        self.variables = variables

    def fit(self, X, y=None):
        """Store the mean of every configured column in ``imputer_dict_``."""
        # persist mean values in a dictionary
        self.imputer_dict_ = X[self.variables].mean().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the means."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection: the in-place form works on an
            # intermediate object and leaves X unchanged under pandas
            # Copy-on-Write.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X

In [None]:
# for encoding Rare labels (categorical variable)

class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Collapse infrequent categories into the single label ``"Rare"``.

    Parameters
    ----------
    variables : list
        Names of the categorical columns to encode.
    tol : float, default 0.05
        Minimum relative frequency a category needs in the fitted data to
        be kept as is.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables, tol=0.05):
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        self.tol, self.variables = tol, variables

    def fit(self, X, y=None):
        """Record, per column, the categories whose share is at least ``tol``."""
        self.encoder_dict_ = {}

        for column in self.variables:
            # Relative frequency of every category seen in this column.
            shares = X[column].value_counts(normalize=True)
            frequent = shares[shares >= self.tol]
            self.encoder_dict_[column] = list(frequent.index)

        return self

    def transform(self, X):
        """Return a copy of ``X`` where non-frequent values read ``"Rare"``."""
        encoded = X.copy()

        for column in self.variables:
            is_frequent = encoded[column].isin(self.encoder_dict_[column])
            encoded[column] = np.where(is_frequent, encoded[column], "Rare")

        return encoded

In [None]:
# for regression problem
# one way for encoding categorical variable to capture monotonic relationship between categorical variables and target

# this object will assign discrete values to the strings of the variables,
# so that the smaller value corresponds to the category that shows the smaller
# mean house sale price

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categories as integers ordered by mean target value.

    Within each column the category with the smallest mean target receives
    0, the next one 1, and so on.

    Parameters
    ----------
    variables : list
        Names of the categorical columns to encode.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y):
        """Learn the category-to-integer mapping of every configured column."""
        # Put the features and the target side by side; the target is the
        # last column and is renamed to "target".
        joined = pd.concat([X, y], axis=1)
        joined.columns = [*X.columns, "target"]

        self.encoder_dict_ = {}

        for column in self.variables:
            target_means = joined.groupby([column])["target"].mean()
            ordered_labels = target_means.sort_values(ascending=True).index
            self.encoder_dict_[column] = {
                label: rank for rank, label in enumerate(ordered_labels)
            }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns mapped to integers."""
        encoded = X.copy()
        for column in self.variables:
            encoded[column] = encoded[column].map(self.encoder_dict_[column])
        return encoded