In [31]:
import pandas as  pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [32]:
df = pd.DataFrame({'a': np.random.randint(10,100, size=10),
                   'b': np.random.randint(10,100, size=10),
                   'c': np.random.randint(10,100, size=10)})
df

Unnamed: 0,a,b,c
0,92,98,81
1,25,63,45
2,39,52,27
3,90,44,82
4,71,84,47
5,60,76,42
6,13,75,75
7,59,74,14
8,79,53,91
9,44,21,69


In [33]:
class StandardScaler(BaseEstimator,TransformerMixin):
    def __init__(self,variables: list|None = None):
        self.variables = variables
        
    def fit(self,X: pd.DataFrame, y = None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
        self.mean_ = X[self.numeric_columns_].mean()
        self.std_ = X[self.numeric_columns_].std()
        return self
    
    def transform(self,X: pd.DataFrame,y=None):
        data = X.copy()
        try:
            data[self.numeric_columns_] = (data[self.numeric_columns_] - self.mean_) / self.std_
        except Exception as e:
            self.errors_ = e
        return data

In [34]:
scaler = StandardScaler(['a','b'])
scaler.fit(df)

In [35]:
scaler.transform(df)

Unnamed: 0,a,b,c
0,1.298966,1.531456,81
1,-1.201917,-0.045043,45
2,-0.679344,-0.540514,27
3,1.224313,-0.900856,82
4,0.515107,0.900856,47
5,0.104514,0.540514,42
6,-1.649836,0.495471,75
7,0.067188,0.450428,14
8,0.81372,-0.495471,91
9,-0.492711,-1.936841,69


In [36]:
scaler.mean_
pd.concat([scaler.mean_,scaler.std_],axis=1,keys=['mean','std'])

Unnamed: 0,mean,std
a,57.2,26.790546
b,64.0,22.201101


In [37]:
class MinMaxScaler (BaseEstimator, TransformerMixin):
    def __init__(self, feature_range=(0,1), variables: list|None = None):
        self.feature_range = feature_range
        self.variables = variables 
    
    def fit (self,X : pd.DataFrame, y=None ):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
        self.min_ = X[self.numeric_columns_].min()
        self.max_ = X[self.numeric_columns_].max()
        return self
    
    def transform(self,X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            min_val, max_val = self.feature_range
            data[self.numeric_columns_] = (data[self.numeric_columns_] - self.min_) / (self.max_ - self.min_) * (max_val - min_val) + min_val
        except Exception as e:
            self.errors_ = e
        return data

In [38]:
minimax = MinMaxScaler()
minimax.fit(df)
minimax.transform(df)

Unnamed: 0,a,b,c
0,1.0,1.0,0.87013
1,0.151899,0.545455,0.402597
2,0.329114,0.402597,0.168831
3,0.974684,0.298701,0.883117
4,0.734177,0.818182,0.428571
5,0.594937,0.714286,0.363636
6,0.0,0.701299,0.792208
7,0.582278,0.688312,0.0
8,0.835443,0.415584,1.0
9,0.392405,0.0,0.714286


In [39]:
class winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self,variables: list|None = None,lower_quantile=0.25, upper_quantile=0.75, K=5):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.variables = variables
        self.K = K

    def fit(self, X: pd.DataFrame , y=None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
            
        self.q1=X[self.numeric_columns_].quantile(self.lower_quantile)
        self.q3=X[self.numeric_columns_].quantile(self.upper_quantile)
        self.iqr=self.q3-self.q1
            
        #calculate thresholds
        self.lower_threshold=self.q1-self.K*self.iqr
        self.upper_threshold=self.q3+self.K*self.iqr
        return self

    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            data[self.numeric_columns_]=data[self.numeric_columns_].clip(lower=self.lower_threshold,upper=self.upper_threshold)
        except Exception as e:
            self.errors_ = e
        return data

In [40]:
np.random.seed(0)
df1 = pd.DataFrame({
    'A': np.random.normal(loc=100, scale=20, size=1000),
    'B': np.random.normal(loc=50, scale=10, size=1000),
    'C': np.random.normal(loc=200, scale=30, size=1000),
    'D': np.random.normal(loc=300, scale=40, size=1000)
})

In [41]:
win = winsorizer(lower_quantile=0.05, upper_quantile=0.95, K=2)
win.fit(df1)

In [42]:
transformed = win.transform(df1)

In [43]:
class MeanMedianImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables: list|None = None,imputation_type='mean'):
        self.imputation_type = imputation_type
        self.variables = variables
        
    def fit(self, X : pd.DataFrame, y=None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns
        self.mean_ = X[self.numeric_columns_].mean()
        self.median_ = X[self.numeric_columns_].median()
        return self
    
    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        if self.imputation_type == 'mean':
            data[self.numeric_columns_] = data[self.numeric_columns_].fillna(self.mean_)
        else :
            data[self.numeric_columns_] = data[self.numeric_columns_].fillna(self.median_)
        return data

In [44]:
df2 = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [5, np.nan, 7, 8, 9],
    'C': [np.nan, 12, 13, 14, 15],
    'D': [16, 17, 18, np.nan, 20]
})

In [45]:
df2

Unnamed: 0,A,B,C,D
0,1.0,5.0,,16.0
1,2.0,,12.0,17.0
2,,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [46]:
MMI = MeanMedianImputer(['A','C'],imputation_type= 'median')

In [47]:
MMI.fit(df2)

In [48]:
t = MMI.transform(df2)

In [49]:
t

Unnamed: 0,A,B,C,D
0,1.0,5.0,13.5,16.0
1,2.0,,12.0,17.0
2,3.0,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [50]:
class categoricalImputer(BaseEstimator,TransformerMixin):
    def __init__(self,variables : list|None=None, strategy='most_frequent'):
        self.variables=variables
        self.strategy=strategy
        
    def fit(self,X: pd.DataFrame,y=None):
        if self.variables is not None:
            self.categorical_columns_ = pd.Index(self.variables)
        else:
            categorical_columns = X.select_dtypes(include=['object']).columns
            self.categorical_columns_ = categorical_columns
        self.fill_values_ = X[self.categorical_columns_].mode().iloc[0]
        return self
    def transform(self,X : pd.DataFrame):
        data = X.copy()
        data[self.categorical_columns_] = data[self.categorical_columns_].fillna(self.fill_values_)
        return data

In [51]:
df3 = pd.DataFrame({
    'A': ['apple', 'banana', 'apple', np.nan, 'banana'],
    'B': ['red', 'green', np.nan, 'red', 'green'],
    'C': ['small', 'large', 'medium', 'medium', np.nan]
})

In [52]:
CI = categoricalImputer()
CI.fit(df3)

In [53]:
CIT = CI.transform(df3)

In [54]:
CIT

Unnamed: 0,A,B,C
0,apple,red,small
1,banana,green,large
2,apple,green,medium
3,apple,red,medium
4,banana,green,medium


In [55]:
class count_frequency_encoder(BaseEstimator,TransformerMixin):
    def __init__(self, variables: list | None = None):
        self.variables = variables

    def fit(self, X, y=None):
        if self.variables is not None:
            self.categorical_variables = self.variables
        else:
            categorical_columns_ = X.select_dtypes(include=['object']).columns
            self.categorical_variables = categorical_columns_
        
        self.encoding_dict_ = {}
        for var in self.categorical_variables:
            value_counts = X[var].value_counts()
            total_count = value_counts.sum()
            self.encoding_dict_[var] = value_counts / total_count 
        return self

    def transform(self, X):
        data = X.copy()
        for var in self.categorical_variables:
            data[var] = data[var].map(self.encoding_dict_[var])
        return data

In [56]:
CFE = count_frequency_encoder()
CFE.fit(df3)

In [57]:
CFET = CFE.transform(CIT)
CFET

Unnamed: 0,A,B,C
0,0.5,0.5,0.25
1,0.5,0.5,0.25
2,0.5,0.5,0.5
3,0.5,0.5,0.5
4,0.5,0.5,0.5


In [80]:
class OneHotEncorder (BaseEstimator,TransformerMixin):
    def __init__(self, variables : list |None =None ):
        self.variables = variables
    
    def fit (self , X : pd.DataFrame ,y=None):
        if self.variables is not None:
            self.categorical_variables = self.variables
        else:
            categorical_columns_ = X.select_dtypes(include=['object']).columns
            self.categorical_variables = categorical_columns_
         
        self.categorical_variables = {}   
        for col in self.categorical_variables:
            self.categorical_variables[col] = X[col].unique()
        return self
    
    def transform (self,X: pd.DataFrame,y=None):
        data = X.copy()
        for col in self.categorical_variables:
            categories = self.categorical_variables[col]
            for category in categories:
                new_col_name = f'{col}_{category}'
        data[new_col_name] = (data[col] == category).astype(int)
        data.drop(col,axis=1)
        return data
            

In [81]:
df4 = pd.DataFrame({
    'A': ['apple', 'banana', 'apple','apple', 'banana'],
    'B': ['red', 'green', 'green', 'red', 'green'],
    'C': ['small', 'large', 'medium', 'medium', 'large']
})

In [82]:
ohe = OneHotEncorder()
ohe.fit(df4)

In [83]:
df4_ = ohe.transform(df4)
df4_

UnboundLocalError: cannot access local variable 'col' where it is not associated with a value