In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder



class MyPipeline(BaseEstimator, TransformerMixin):
    '''
    Parent pipeline holding the transformations that several child pipelines reuse.

    The frame being prepared is kept on ``self.df``. Every transformation works on a
    copy of that frame, stores the transformed copy back on ``self.df`` and returns it,
    so the DataFrame originally handed in by the caller is never modified.
    '''

    def __init__(self, df: pd.DataFrame):
        # Store the frame that was passed in; discarding it left ``self.df`` as None
        # and made every method below fail on its first access.
        self.df = df

    def _working_frame(self, df):
        '''Return a copy of the stored frame, falling back to ``df`` if none is stored.'''
        source = df if self.df is None else self.df
        if source is None:
            raise ValueError("No DataFrame available: pass one to the constructor or the method")
        return source.copy()

    def map_cat_by_statmeasure(self, df: pd.DataFrame, columns=None, mapby='mean', prediction_var='price'):
        '''
        Encode categorical columns by a per-category statistic of the prediction variable.

        df: DataFrame the per-category statistics are computed from (it should not contain
            null values). It must contain ``prediction_var`` and every column to encode.
        columns: list of categorical columns to encode. When None, every 'object' or
            'category' column of the stored frame is encoded.
        mapby: 'mean' or 'median' - the statistic of ``prediction_var`` used as the code.
        prediction_var: name of the column whose statistic replaces each category.

        Returns the transformed frame (also stored on ``self.df``). Categories that do
        not occur in ``df`` are mapped to NaN.
        Raises ValueError when ``mapby`` is not 'mean' or 'median'.
        '''
        # Validate the 'mapby' parameter
        if mapby not in ('mean', 'median'):
            raise ValueError("mapby must be either 'mean' or 'median'")

        frame = self._working_frame(df)

        if columns is None:
            columns = frame.select_dtypes(include=['object', 'category']).columns.tolist()

        for col in columns:
            # ``frame`` is a copy, so ``df`` still holds the raw categories here even
            # when the caller passes the very same object that is stored on ``self.df``.
            group = df.groupby(col)[prediction_var].agg(mapby)
            frame[col] = frame[col].map(group)

        self.df = frame
        return self.df

    def OHE(self, df: pd.DataFrame, columns=None):
        '''
        One-hot encode categorical columns of the stored frame.

        df: fallback DataFrame, used only when no frame is stored on the instance.
        columns: list of columns to encode. When None, every 'object' or 'category'
            column of the frame is encoded.

        Returns the transformed frame (also stored on ``self.df``): the index is reset to
        0..n-1, the encoded columns are dropped and the indicator columns are appended.
        '''
        # A fresh 0..n-1 index guarantees the encoded block lines up row by row.
        frame = self._working_frame(df).reset_index(drop=True)

        if columns is None:
            columns = frame.select_dtypes(include=['object', 'category']).columns.tolist()

        if len(columns) == 0:
            # Nothing to encode; OneHotEncoder rejects an input with zero features.
            self.df = frame
            return self.df

        # Create the encoder
        encoder = OneHotEncoder(sparse_output=False)

        # The result is a numpy array with one indicator column per category
        encoded_columns = encoder.fit_transform(frame[columns])
        encoded_df = pd.DataFrame(
            encoded_columns,
            columns=encoder.get_feature_names_out(),
            index=frame.index,
        )

        # Concatenate the original DataFrame and the encoded DataFrame
        combined = pd.concat([frame, encoded_df], axis=1)

        # Check for any null values (done before the source columns are dropped so that
        # nulls in the categorical columns themselves are reported as well)
        null_rows = combined.isnull().any(axis=1)
        if null_rows.any():
            print("Null values found in the DataFrame after encoding:")
            print(combined[null_rows])

        self.df = combined.drop(columns=columns)
        return self.df
    
    
        

        
                
            
            
    

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class MyOrdinalEncoder(BaseEstimator, TransformerMixin):
    '''
    Replace each category of the selected columns by a statistic of the target column.

    columns: column name or list of column names to encode. When None, every 'object'
        or 'category' column of the frame passed to ``fit`` (except the target column)
        is encoded.
    target_column: name of the column in ``X`` whose per-category statistic is used.
    by: 'mean' or 'median' - the statistic computed per category.

    Attributes set by ``fit``:
        columns_: list of the columns that are actually encoded.
        encoding_dict_: ``{column: {category: statistic}}``.
    '''

    def __init__(self, columns=None, target_column=None, by='mean'):
        self.columns = columns
        self.target_column = target_column
        self.by = by
        # Kept for callers that read the attribute before fitting; ``fit`` rebuilds it.
        self.encoding_dict_ = {}
        # Validate the 'by' parameter
        if by not in ('mean', 'median'):
            raise ValueError("by must be either 'mean' or 'median'")

    def fit(self, X, y=None):
        '''
        Learn the per-category statistic of ``target_column`` for every encoded column.

        X: DataFrame containing the columns to encode and the target column.
        y: ignored; present for scikit-learn API compatibility.

        Returns ``self``.
        Raises ValueError when ``target_column`` is None or ``by`` is invalid.
        '''
        if self.target_column is None:
            raise ValueError("Columns to encode and target column must be specified.")

        # ``by`` may have been changed after construction (e.g. via ``set_params``).
        if self.by not in ('mean', 'median'):
            raise ValueError("by must be either 'mean' or 'median'")

        if self.columns is None:
            detected = X.select_dtypes(include=['object', 'category']).columns.tolist()
            # Never encode the target with its own statistic.
            columns = [col for col in detected if col != self.target_column]
        elif isinstance(self.columns, str):
            columns = [self.columns]
        else:
            columns = list(self.columns)

        # Build a fresh mapping so that re-fitting does not keep stale columns around.
        self.encoding_dict_ = {
            col: X.groupby(col)[self.target_column].agg(self.by).to_dict()
            for col in columns
        }
        self.columns_ = columns

        return self

    def transform(self, X, y=None):
        '''
        Return a copy of ``X`` with every fitted column replaced by its learned statistic.

        Categories that were not seen during ``fit`` become NaN.
        Raises NotFittedError when called before ``fit``.
        '''
        columns = getattr(self, 'columns_', None)
        if columns is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This MyOrdinalEncoder instance is not fitted yet; call 'fit' first.")

        X = X.copy()

        for col in columns:
            # Replace each category with the statistic calculated during fit
            X[col] = X[col].map(self.encoding_dict_[col])

        return X
    



In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class MyOHE(BaseEstimator, TransformerMixin):
    '''
    One-hot encode the selected columns of a DataFrame.

    columns: column name or list of column names to encode. When None, every 'object'
        or 'category' column of the frame passed to ``fit`` (except the target column)
        is encoded.
    target_column: optional name of the target column; it is only used to keep the
        target out of the automatically detected columns.

    Attributes set by ``fit``:
        columns_: list of the columns that are actually encoded.
        encoding_dict_: ``{column: [category, ...]}`` - the categories learned per column.
    '''

    def __init__(self, columns=None, target_column=None):
        self.columns = columns
        self.target_column = target_column
        # Kept for callers that read the attribute before fitting; ``fit`` rebuilds it.
        self.encoding_dict_ = {}

    def fit(self, X, y=None):
        '''
        Learn the categories of every encoded column.

        X: DataFrame containing the columns to encode.
        y: ignored; present for scikit-learn API compatibility.

        Returns ``self``.
        '''
        if self.columns is None:
            detected = X.select_dtypes(include=['object', 'category']).columns.tolist()
            columns = [col for col in detected if col != self.target_column]
        elif isinstance(self.columns, str):
            columns = [self.columns]
        else:
            columns = list(self.columns)

        learned = {}
        for col in columns:
            # Missing values get no indicator column of their own.
            categories = X[col].dropna().unique().tolist()
            try:
                # Sorted categories make the output column order independent of row order.
                categories.sort()
            except TypeError:
                # Mixed, non-comparable values: keep the order of first appearance.
                pass
            learned[col] = categories

        self.encoding_dict_ = learned
        self.columns_ = columns

        return self

    def transform(self, X, y=None):
        '''
        Return a new DataFrame with the fitted columns replaced by indicator columns.

        Each learned category yields one float column named ``"<column>_<category>"``
        holding 1.0 where the row has that category and 0.0 elsewhere. Rows holding a
        missing value or a category unseen during ``fit`` are 0.0 in every indicator
        column of that feature. The index of ``X`` is preserved.
        Raises NotFittedError when called before ``fit``.
        '''
        columns = getattr(self, 'columns_', None)
        if columns is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This MyOHE instance is not fitted yet; call 'fit' first.")

        indicators = {}
        for col in columns:
            values = X[col]
            for category in self.encoding_dict_[col]:
                indicators[f"{col}_{category}"] = (values == category).astype(float)

        encoded = pd.DataFrame(indicators, index=X.index)

        # ``drop`` returns a new frame, so the caller's DataFrame is left untouched.
        return pd.concat([X.drop(columns=columns), encoded], axis=1)
    

