In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Seed NumPy's global RNG so the demo frame is identical on every
# Restart Kernel -> Run All.
np.random.seed(42)
df = pd.DataFrame({'a': np.random.randint(10,100, size=10),
                   'b': np.random.randint(10,100, size=10),
                   'c': np.random.randint(10,100, size=10)})
df

Unnamed: 0,a,b,c
0,76,21,81
1,75,18,63
2,35,90,42
3,54,60,50
4,97,43,63
5,15,88,15
6,18,94,73
7,17,39,69
8,17,10,19
9,58,93,74


In [3]:
class StandardScaler(BaseEstimator,TransformerMixin):
    """Standardise DataFrame columns to zero mean and unit standard deviation.

    Parameters
    ----------
    variables : list or None, default None
        Columns to scale. When None, every numeric column of the frame
        passed to ``fit`` is scaled.

    Attributes
    ----------
    numeric_columns_ : pd.Index
        Columns selected during ``fit``.
    mean_ : pd.Series
        Per-column mean of the training data.
    std_ : pd.Series
        Per-column standard deviation as returned by ``DataFrame.std``.
    scale_ : pd.Series
        Divisor actually used by ``transform``: ``std_`` with zero or
        missing entries replaced by 1.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self,variables: list|None = None):
        self.variables = variables

    def fit(self,X: pd.DataFrame, y = None):
        """Learn the mean and standard deviation of the selected columns.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        Returns ``self``.
        """
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            self.numeric_columns_ = X.select_dtypes(include=[np.number]).columns
        selected = X[self.numeric_columns_]
        self.mean_ = selected.mean()
        self.std_ = selected.std()
        # A constant column has std 0 (and a single-row frame has std NaN);
        # dividing by either would fill the column with inf/NaN. Dividing by
        # 1 instead leaves such columns centred at 0.
        self.scale_ = self.std_.mask(self.std_.isna() | (self.std_ == 0), 1.0)
        return self

    def transform(self,X: pd.DataFrame,y=None):
        """Return a scaled copy of ``X``; ``X`` itself is not modified.

        If scaling fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unscaled copy is returned.
        """
        data = X.copy()
        try:
            data[self.numeric_columns_] = (data[self.numeric_columns_] - self.mean_) / self.scale_
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"StandardScaler.transform failed; returning unscaled data: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [4]:
# Standardise only columns 'a' and 'b'; 'c' is not listed, so it is left as-is.
scaler = StandardScaler(['a','b'])
scaler.fit(df)

In [5]:
# Apply the fitted scaling; transform works on a copy, so `df` is not modified.
scaler.transform(df)

Unnamed: 0,a,b,c
0,0.993481,-1.025243,81
1,0.960142,-1.114137,63
2,-0.373389,1.019317,42
3,0.260039,0.130378,50
4,1.693584,-0.373355,63
5,-1.040154,0.960054,15
6,-0.940139,1.137842,73
7,-0.973478,-0.49188,69
8,-0.973478,-1.351188,19
9,0.393392,1.108211,74


In [6]:
# Fitted statistics side by side, one row per scaled column.
pd.concat([scaler.mean_,scaler.std_],axis=1,keys=['mean','std'])

Unnamed: 0,mean,std
a,46.2,29.995555
b,55.6,33.748086


In [7]:
class MinMaxScaler (BaseEstimator, TransformerMixin):
    """Linearly rescale DataFrame columns into ``feature_range``.

    Parameters
    ----------
    feature_range : tuple (min, max), default (0, 1)
        Target range of the scaled values.
    variables : list or None, default None
        Columns to scale. When None, every numeric column of the frame
        passed to ``fit`` is scaled.

    Attributes
    ----------
    numeric_columns_ : pd.Index
        Columns selected during ``fit``.
    min_, max_ : pd.Series
        Per-column minimum and maximum of the training data.
    data_range_ : pd.Series
        ``max_ - min_`` with zero entries replaced by 1.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, feature_range=(0,1), variables: list|None = None):
        self.feature_range = feature_range
        self.variables = variables

    def fit (self,X : pd.DataFrame, y=None ):
        """Learn the per-column minimum and maximum. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            self.numeric_columns_ = X.select_dtypes(include=[np.number]).columns
        selected = X[self.numeric_columns_]
        self.min_ = selected.min()
        self.max_ = selected.max()
        span = self.max_ - self.min_
        # A constant column has max == min; dividing by 0 would turn it into
        # NaN. Dividing by 1 maps it to the lower end of feature_range.
        self.data_range_ = span.mask(span == 0, 1)
        return self

    def transform(self,X : pd.DataFrame, y=None):
        """Return a rescaled copy of ``X``; ``X`` itself is not modified.

        If scaling fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unscaled copy is returned.
        """
        data = X.copy()
        try:
            min_val, max_val = self.feature_range
            unit_scaled = (data[self.numeric_columns_] - self.min_) / self.data_range_
            data[self.numeric_columns_] = unit_scaled * (max_val - min_val) + min_val
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"MinMaxScaler.transform failed; returning unscaled data: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [8]:
# Scale every numeric column of `df` with the default feature_range of (0, 1).
# fit() returns the estimator itself, so construction and fitting can be chained.
minimax = MinMaxScaler().fit(df)
minimax.transform(df)

Unnamed: 0,a,b,c
0,0.743902,0.130952,1.0
1,0.731707,0.095238,0.727273
2,0.243902,0.952381,0.409091
3,0.47561,0.595238,0.530303
4,1.0,0.392857,0.727273
5,0.0,0.928571,0.0
6,0.036585,1.0,0.878788
7,0.02439,0.345238,0.818182
8,0.02439,0.0,0.060606
9,0.52439,0.988095,0.893939


In [9]:
class winsorizer(BaseEstimator, TransformerMixin):
    """Cap extreme values at quantile-based bounds.

    The bounds are ``q_low - K * (q_high - q_low)`` and
    ``q_high + K * (q_high - q_low)``, where ``q_low`` / ``q_high`` are the
    ``lower_quantile`` / ``upper_quantile`` of each column in the training
    data.

    Parameters
    ----------
    variables : list or None, default None
        Columns to cap. When None, every numeric column of the frame passed
        to ``fit`` is used.
    lower_quantile : float, default 0.25
    upper_quantile : float, default 0.75
    K : float, default 5
        Multiplier applied to the inter-quantile range.

    Attributes
    ----------
    numeric_columns_ : pd.Index
        Columns selected during ``fit``.
    q1, q3, iqr : pd.Series
        Lower quantile, upper quantile and their difference, per column.
    lower_threshold, upper_threshold : pd.Series
        Per-column capping bounds.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self,variables: list|None = None,lower_quantile=0.25, upper_quantile=0.75, K=5):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.variables = variables
        self.K = K

    def fit(self, X: pd.DataFrame , y=None):
        """Learn the per-column capping bounds. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            self.numeric_columns_ = X.select_dtypes(include=[np.number]).columns

        selected = X[self.numeric_columns_]
        self.q1 = selected.quantile(self.lower_quantile)
        self.q3 = selected.quantile(self.upper_quantile)
        self.iqr = self.q3 - self.q1

        # calculate thresholds
        self.lower_threshold = self.q1 - self.K * self.iqr
        self.upper_threshold = self.q3 + self.K * self.iqr
        return self

    def transform(self, X : pd.DataFrame, y=None):
        """Return a copy of ``X`` with the selected columns clipped.

        If clipping fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unclipped copy is returned.
        """
        data = X.copy()
        try:
            # The bounds are Series indexed by column name. axis=1 tells clip
            # to align them with the column labels; without an explicit axis
            # pandas cannot align a Series bound against a DataFrame.
            data[self.numeric_columns_] = data[self.numeric_columns_].clip(
                lower=self.lower_threshold,
                upper=self.upper_threshold,
                axis=1,
            )
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"winsorizer.transform failed; returning unclipped data: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [10]:
# Reproducible synthetic data: four normally distributed columns of 1000 rows,
# drawn in the order A, B, C, D from the seeded global RNG.
np.random.seed(0)
df1 = pd.DataFrame({
    column: np.random.normal(loc=centre, scale=spread, size=1000)
    for column, centre, spread in [
        ('A', 100, 20),
        ('B', 50, 10),
        ('C', 200, 30),
        ('D', 300, 40),
    ]
})

In [11]:
# No `variables` given, so every numeric column of df1 is used.
# Bounds per column: q05 - 2 * (q95 - q05) and q95 + 2 * (q95 - q05).
win = winsorizer(lower_quantile=0.05, upper_quantile=0.95, K=2)
win.fit(df1)

In [12]:
# Apply the fitted winsorizer; `transformed` is a new frame, df1 is not modified.
transformed = win.transform(df1)

In [13]:
class MeanMedianImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the column mean or median learned in ``fit``.

    Parameters
    ----------
    variables : list or None, default None
        Columns to impute. When None, every numeric column of the frame
        passed to ``fit`` is used.
    imputation_type : {'mean', 'median'}, default 'mean'
        Statistic used as the replacement value.

    Attributes
    ----------
    numeric_columns_ : pd.Index
        Columns selected during ``fit``.
    mean_, median_ : pd.Series
        Per-column statistics of the training data (both are always stored).
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    _VALID_TYPES = ('mean', 'median')

    def __init__(self, variables: list|None = None,imputation_type='mean'):
        self.imputation_type = imputation_type
        self.variables = variables

    def fit(self, X : pd.DataFrame, y=None):
        """Learn the per-column mean and median. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.

        Raises
        ------
        ValueError
            If ``imputation_type`` is neither 'mean' nor 'median'.
        """
        # Reject typos up front instead of silently falling back to the median.
        if self.imputation_type not in self._VALID_TYPES:
            raise ValueError(
                f"imputation_type must be one of {self._VALID_TYPES}, "
                f"got {self.imputation_type!r}"
            )
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            self.numeric_columns_ = X.select_dtypes(include=[np.number]).columns
        selected = X[self.numeric_columns_]
        self.mean_ = selected.mean()
        self.median_ = selected.median()
        return self

    def transform(self, X : pd.DataFrame, y=None):
        """Return a copy of ``X`` with missing values in the selected columns filled.

        If imputation fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the un-imputed copy is returned.
        """
        data = X.copy()
        try:
            fill_values = self.mean_ if self.imputation_type == 'mean' else self.median_
            data[self.numeric_columns_] = data[self.numeric_columns_].fillna(fill_values)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"MeanMedianImputer.transform failed; returning un-imputed data: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [14]:
# Toy numeric frame, written row by row; every column has exactly one missing value.
df2 = pd.DataFrame(
    [
        [1, 5, np.nan, 16],
        [2, np.nan, 12, 17],
        [np.nan, 7, 13, 18],
        [4, 8, 14, np.nan],
        [5, 9, 15, 20],
    ],
    columns=['A', 'B', 'C', 'D'],
)

In [15]:
# Show the raw frame before imputation (one NaN per column).
df2

Unnamed: 0,A,B,C,D
0,1.0,5.0,,16.0
1,2.0,,12.0,17.0
2,,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [16]:
# Median imputation restricted to columns 'A' and 'C'; 'B' and 'D' are not touched.
MMI = MeanMedianImputer(['A','C'],imputation_type= 'median')

In [17]:
# Learn the mean and median of 'A' and 'C' from df2.
MMI.fit(df2)

In [18]:
# Impute df2 with the fitted medians; `t` is a new frame, df2 is not modified.
t = MMI.transform(df2)

In [19]:
# Inspect the imputed frame.
t

Unnamed: 0,A,B,C,D
0,1.0,5.0,13.5,16.0
1,2.0,,12.0,17.0
2,3.0,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [20]:
class categoricalImputer(BaseEstimator,TransformerMixin):
    """Fill missing values in categorical columns.

    Parameters
    ----------
    variables : list or None, default None
        Columns to impute. When None, every object-dtype column of the frame
        passed to ``fit`` is used.
    strategy : {'most_frequent', 'constant'}, default 'most_frequent'
        'most_frequent' fills with each column's mode from the training
        data; 'constant' fills every column with ``fill_value``.
    fill_value : scalar, default 'Missing'
        Replacement used when ``strategy='constant'``; ignored otherwise.

    Attributes
    ----------
    categorical_columns_ : pd.Index
        Columns selected during ``fit``.
    fill_values_ : pd.Series
        Replacement value per column.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    _VALID_STRATEGIES = ('most_frequent', 'constant')

    def __init__(self,variables : list|None=None, strategy='most_frequent', fill_value='Missing'):
        self.variables=variables
        self.strategy=strategy
        self.fill_value=fill_value

    def fit(self,X: pd.DataFrame,y=None):
        """Learn the replacement value of every selected column. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported values.
        """
        # Reject unsupported strategies instead of silently ignoring them.
        if self.strategy not in self._VALID_STRATEGIES:
            raise ValueError(
                f"strategy must be one of {self._VALID_STRATEGIES}, got {self.strategy!r}"
            )
        if self.variables is not None:
            self.categorical_columns_ = pd.Index(self.variables)
        else:
            self.categorical_columns_ = X.select_dtypes(include=['object']).columns

        if self.strategy == 'constant':
            self.fill_values_ = pd.Series(
                self.fill_value, index=self.categorical_columns_, dtype=object
            )
        else:
            modes = X[self.categorical_columns_].mode()
            # mode() has no rows when nothing is selected or every selected
            # value is missing; there is then nothing to impute with.
            self.fill_values_ = modes.iloc[0] if len(modes) else pd.Series(dtype=object)
        return self

    def transform(self,X : pd.DataFrame, y = None ):
        """Return a copy of ``X`` with missing categorical values filled.

        If imputation fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the un-imputed copy is returned.
        """
        data = X.copy()
        try:
            data[self.categorical_columns_] = data[self.categorical_columns_].fillna(self.fill_values_)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"categoricalImputer.transform failed; returning un-imputed data: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [21]:
# Toy categorical frame; each column contains exactly one missing value.
df3 = pd.DataFrame(
    dict(
        A=['apple', 'banana', 'apple', np.nan, 'banana'],
        B=['red', 'green', np.nan, 'red', 'green'],
        C=['small', 'large', 'medium', 'medium', np.nan],
    )
)

In [22]:
# Default settings: no `variables` given, so every object-dtype column of df3 is selected.
CI = categoricalImputer()
CI.fit(df3)

In [23]:
# Fill the missing entries of df3 with the fitted per-column values; CIT is a new frame.
CIT = CI.transform(df3)

In [24]:
# Inspect the imputed categorical frame.
CIT

Unnamed: 0,A,B,C
0,apple,red,small
1,banana,green,large
2,apple,green,medium
3,apple,red,medium
4,banana,green,medium


In [25]:
class count_frequency_encoder(BaseEstimator,TransformerMixin):
    """Replace each category with its relative frequency in the training data.

    Parameters
    ----------
    variables : list or None, default None
        Columns to encode. When None, every object-dtype column of the frame
        passed to ``fit`` is used.

    Attributes
    ----------
    categorical_variables : list or pd.Index
        Columns selected during ``fit``.
    encoding_dict_ : dict
        Maps each column name to a Series of ``category -> frequency``.
        Frequencies are relative to the non-missing rows of that column.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.

    Notes
    -----
    Values that were not seen during ``fit`` (including missing values) are
    encoded as NaN.
    """

    def __init__(self, variables: list | None = None):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the category frequencies of every selected column. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        if self.variables is not None:
            self.categorical_variables = self.variables
        else:
            self.categorical_variables = X.select_dtypes(include=['object']).columns

        # normalize=True divides each count by the total of all counts, i.e.
        # by the number of non-missing rows in the column.
        self.encoding_dict_ = {
            var: X[var].value_counts(normalize=True)
            for var in self.categorical_variables
        }
        return self

    def transform(self, X: pd.DataFrame , y = None):
        """Return a copy of ``X`` with the selected columns frequency-encoded.

        If encoding fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the copy is returned as far as it
        had been processed (columns encoded before the failure stay encoded).
        """
        data = X.copy()
        try:
            for var in self.categorical_variables:
                data[var] = data[var].map(self.encoding_dict_[var])
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"count_frequency_encoder.transform failed; output may be only partly encoded: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [26]:
# Fit on the raw df3 (which still contains NaN): no `variables` given, so all
# object-dtype columns are used.
CFE = count_frequency_encoder()
CFE.fit(df3)

In [27]:
# Encode the imputed frame CIT with the frequencies learned from df3.
CFET = CFE.transform(CIT)
CFET

Unnamed: 0,A,B,C
0,0.5,0.5,0.25
1,0.5,0.5,0.25
2,0.5,0.5,0.5
3,0.5,0.5,0.5
4,0.5,0.5,0.5


In [28]:

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Replace categorical columns with one 0/1 indicator column per category.

    Indicator columns are named ``f'{column}_{category}'`` and appended to
    the frame; the original categorical column is dropped.

    Parameters
    ----------
    variables : list or None, default None
        Columns to encode. When None, every object-dtype column of the frame
        passed to ``fit`` is used.

    Attributes
    ----------
    categorical_columns_ : list or pd.Index
        Columns selected during ``fit``.
    categorical_variables : dict
        Maps each selected column to the array of its distinct non-missing
        values, in order of first appearance.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, variables: list |None = None):
        self.variables = variables
        self.categorical_variables = {}

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the categories of every selected column. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        Prints a confirmation message once fitting is done.
        """
        if self.variables is not None:
            self.categorical_columns_ = self.variables
        else:
            self.categorical_columns_ = X.select_dtypes(include=['object']).columns

        # Build a fresh dict on every fit so columns learned by an earlier
        # fit do not linger. Missing values are skipped: NaN never compares
        # equal to anything, so its indicator column would be all zeros.
        self.categorical_variables = {
            col: X[col].dropna().unique()
            for col in self.categorical_columns_
        }

        print("OneHotEncoder fitted successfully!")
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return an encoded copy of ``X``; ``X`` itself is not modified.

        Rows whose value was not seen during ``fit`` (or is missing) get 0 in
        every indicator column of that variable.

        If encoding fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the copy is returned as far as it
        had been processed.
        """
        data = X.copy()

        try:
            for col, categories in self.categorical_variables.items():
                for category in categories:
                    data[f'{col}_{category}'] = (data[col] == category).astype(int)

                # Drop the original categorical column
                data = data.drop(columns=col)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"OneHotEncoder.transform failed; output may be only partly encoded: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data


In [29]:
# Complete (no missing values) categorical frame used to demo the encoders.
df4 = pd.DataFrame(
    dict(
        A=['apple', 'banana', 'apple', 'apple', 'banana'],
        B=['red', 'green', 'green', 'red', 'green'],
        C=['small', 'large', 'medium', 'medium', 'large'],
    )
)

In [31]:
# No `variables` given, so every object-dtype column of df4 is encoded.
ohe = OneHotEncoder()
ohe.fit(df4)

OneHotEncoder fitted successfully!


In [32]:
# Replace each categorical column of df4 with its 0/1 indicator columns.
df4_ = ohe.transform(df4)
df4_

Unnamed: 0,A_apple,A_banana,B_red,B_green,C_small,C_large,C_medium
0,1,0,1,0,1,0,0
1,0,1,0,1,0,1,0
2,1,0,0,1,0,0,1
3,1,0,1,0,0,0,1
4,0,1,0,1,0,1,0


In [33]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Replace categories with integer codes.

    Parameters
    ----------
    variables : list or None, default None
        Columns to encode when the mapping is generated. When None, every
        object-dtype column of the frame passed to ``fit`` is used.
    mapping : dict or None, default None
        Optional ``{column: {category: code}}`` mapping. When given it is
        used as-is; otherwise codes are generated in ``fit`` from the order
        in which the values first appear (0, 1, 2, ...).

    Attributes
    ----------
    categorical_columns_ : list or pd.Index
        Columns selected during ``fit``.
    ordinal_mapping : dict
        The ``{column: {category: code}}`` mapping applied by ``transform``.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, variables: list | None= None, mapping: dict = None):
        self.variables = variables
        self.mapping = mapping
        self.ordinal_mapping = {}

    def fit(self, X: pd.DataFrame, y=None):
        """Select the columns and build (or adopt) the mapping. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        if self.variables is not None:
            self.categorical_columns_ = self.variables
        else:
            self.categorical_columns_ = X.select_dtypes(include=['object']).columns

        if self.mapping is None:
            # Start from an empty dict so columns learned by an earlier fit
            # on a different frame do not linger in the mapping.
            self.ordinal_mapping = {}
            self.generate_mapping(X)
        else:
            self.ordinal_mapping = self.mapping

        return self

    def generate_mapping(self, X : pd.DataFrame , y = None):
        """Add a first-appearance ``{category: code}`` dict for every selected column.

        Entries are written into ``ordinal_mapping``; ``y`` is unused.
        """
        for col in self.categorical_columns_:
            unique_values = X[col].unique()
            self.ordinal_mapping[col] = {value: i for i, value in enumerate(unique_values)}

    def transform(self, X: pd.DataFrame, y=None):
        """Return an encoded copy of ``X``; ``X`` itself is not modified.

        Values without an entry in the mapping become NaN.

        If encoding fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the copy is returned as far as it
        had been processed.
        """
        data = X.copy()

        try:
            for col, mapping in self.ordinal_mapping.items():
                data[col] = data[col].map(mapping)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"OrdinalEncoder.transform failed; output may be only partly encoded: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data


In [34]:
# Bind the instance to its own name so the `OrdinalEncoder` class is not
# shadowed (re-running this cell would otherwise try to call an instance).
# fit_transform returns the encoded frame; fit alone returns the estimator.
ordinal_encoder = OrdinalEncoder()
df4_ordinal = ordinal_encoder.fit_transform(df4)

In [35]:
# Show what the previous cell stored in df4_ordinal.
df4_ordinal

In [36]:
class DropConstantFeatures(BaseEstimator, TransformerMixin):
    """Drop columns dominated by a single value.

    A column is dropped when its most frequent value accounts for at least
    ``threshold`` of its non-missing rows.

    Parameters
    ----------
    threshold : float, default 0.9
        Minimum share of the most frequent value for a column to be dropped.

    Attributes
    ----------
    columns_to_drop_ : list
        Names of the columns flagged during ``fit``.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, threshold=0.9):
        self.threshold = threshold

    @staticmethod
    def _dominant_share(values: pd.Series) -> float:
        """Return the share of the most frequent non-missing value of ``values``."""
        shares = values.value_counts(normalize=True)
        # A column without any observed value has an empty count table;
        # it carries no information, so treat it as fully constant.
        return shares.iloc[0] if len(shares) else 1.0

    def fit(self, X, y=None):
        """Find the (quasi-)constant columns of ``X``. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        self.columns_to_drop_ = [
            name
            for name, values in X.items()
            if self._dominant_share(values) >= self.threshold
        ]
        return self

    def transform(self, X : pd.DataFrame, y=None):
        """Return a copy of ``X`` without the flagged columns.

        If dropping fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unreduced copy is returned.
        """
        data = X.copy()
        try:
            data = data.drop(columns=self.columns_to_drop_)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"DropConstantFeatures.transform failed; returning all columns: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [37]:
# Mixed frame: 'B' and 'D' are constant, 'E' is dominated by 'x' (3 of 4 rows).
df5 = pd.DataFrame(
    dict(
        A=[1, 2, 3, 4],
        B=[1, 1, 1, 1],
        C=[2, 3, 2, 3],
        D=['a', 'a', 'a', 'a'],
        E=['x', 'x', 'x', 'y'],
    )
)

In [38]:
DropConstantFeatures = DropConstantFeatures()

In [39]:
df5_dropped = DropConstantFeatures.fit(df5)
df5_dropped

In [40]:
class HighCardinalityImputer(BaseEstimator, TransformerMixin):
    """Group the rarest categories of each column under a single label.

    For every selected column the categories are ordered from least to most
    frequent; the leading (rarest) categories are replaced by ``fill_value``
    for as long as their combined share of the non-missing rows stays below
    ``threshold``.

    Parameters
    ----------
    variables : list or None, default None
        Columns to process. When None, every object-dtype column of the
        frame passed to ``fit`` is used.
    threshold : float, default 0.3
        Upper bound (exclusive) on the combined share of the replaced
        categories.
    fill_value : scalar, default 'Other'
        Label written in place of the rare categories.

    Attributes
    ----------
    categories_to_replace : dict
        Maps each selected column to the list of categories that
        ``transform`` replaces.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, variables : list | None = None, threshold=0.3, fill_value='Other'):
        self.threshold = threshold
        self.variables = variables
        self.fill_value = fill_value
        self.categories_to_replace = {}

    def _rare_categories(self, values: pd.Series) -> list:
        """Return the rarest categories whose cumulative share is below ``threshold``."""
        # ascending=True puts the least frequent categories first, so the
        # running total accumulates the rare tail rather than the head.
        freq = values.value_counts(normalize=True, ascending=True)
        cumulative_freq = freq.cumsum()
        return cumulative_freq[cumulative_freq < self.threshold].index.tolist()

    def fit(self, X, y=None):
        """Determine the rare categories of every selected column. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        if self.variables is not None:
            columns = list(self.variables)
        else:
            # Same default as the other categorical transformers in this
            # notebook: only object-dtype columns are considered.
            columns = list(X.select_dtypes(include=['object']).columns)

        # Build a fresh dict so results of an earlier fit do not linger, and
        # process every selected column (not just the last one).
        self.categories_to_replace = {
            col: self._rare_categories(X[col]) for col in columns
        }
        return self

    def transform(self, X : pd.DataFrame, y=None):
        """Return a copy of ``X`` with the rare categories replaced by ``fill_value``.

        If replacement fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the copy is returned as far as it
        had been processed.
        """
        data = X.copy()
        try:
            for col, categories in self.categories_to_replace.items():
                data[col] = data[col].apply(lambda x: self.fill_value if x in categories else x)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"HighCardinalityImputer.transform failed; output may be only partly processed: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

In [41]:
# Seven-row categorical frame with columns of differing cardinality.
df6 = pd.DataFrame(
    dict(
        Color=['Red', 'Blue', 'Green', 'Yellow', 'Purple', 'Blue', 'Green'],
        Shape=['Circle', 'Square', 'Triangle', 'Circle', 'Circle', 'Square', 'Square'],
        Size=['Small', 'Medium', 'Large', 'Small', 'Small', 'Medium', 'Medium'],
    )
)

In [42]:
# threshold=0.7 overrides the default of 0.3; `variables` and `fill_value` keep their defaults.
hic = HighCardinalityImputer(threshold=0.7)

In [43]:
# fit_transform returns the processed frame; fit alone returns the estimator,
# which is not what a variable named like a DataFrame should hold.
df6_ = hic.fit_transform(df6)

In [44]:
class DropDuplicateFeatures(BaseEstimator, TransformerMixin):
    """Drop columns whose content duplicates an earlier column.

    The first column of every group of identical columns is kept; the later
    ones are dropped.

    Attributes
    ----------
    duplicate_columns : list
        Names of the columns flagged during ``fit``.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self):
        self.duplicate_columns = []

    def fit(self, X, y=None):
        """Find the duplicated columns of ``X``. Returns ``self``.

        ``y`` is ignored; it exists for scikit-learn API compatibility.
        """
        self.duplicate_columns = self.find_duplicate_columns(X)
        return self

    def transform(self, X : pd.DataFrame, y=None):
        """Return a copy of ``X`` without the flagged columns.

        If dropping fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unreduced copy is returned.
        """
        data = X.copy()
        try:
            data = data.drop(columns=self.duplicate_columns)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"DropDuplicateFeatures.transform failed; returning all columns: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

    def find_duplicate_columns(self, X : pd.DataFrame):
        """Return the names of columns equal (``Series.equals``) to an earlier column.

        Each duplicated column is reported once, in column order of the
        group it belongs to.
        """
        duplicate_columns = []
        flagged_positions = set()
        n_columns = X.shape[1]
        for i in range(n_columns):
            # A column already known to be a copy need not serve as a
            # reference: its twins were matched against the original.
            if i in flagged_positions:
                continue
            reference = X.iloc[:, i]
            for j in range(i + 1, n_columns):
                if j in flagged_positions:
                    continue
                if reference.equals(X.iloc[:, j]):
                    flagged_positions.add(j)
                    duplicate_columns.append(X.columns[j])

        return duplicate_columns


In [45]:
# Integer frame written row by row: 'B' repeats 'A' and 'D' repeats 'C'.
df7_ = pd.DataFrame(
    [
        [1, 1, 4, 4, 7],
        [2, 2, 5, 5, 8],
        [3, 3, 6, 6, 9],
    ],
    columns=['A', 'B', 'C', 'D', 'E'],
)

In [46]:
# Fit and transform in one call: 'B' duplicates 'A' and 'D' duplicates 'C',
# so the result keeps only 'A', 'C' and 'E'.
dropper = DropDuplicateFeatures()
df_fil = dropper.fit_transform(df7_)

In [47]:
# Inspect the frame after duplicate removal.
df_fil

Unnamed: 0,A,C,E
0,1,4,7
1,2,5,8
2,3,6,9


In [48]:
class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
    """Drop columns that are strongly correlated with an earlier column.

    A column is flagged when the absolute value of its correlation (as
    computed by ``DataFrame.corr``) with any column to its left exceeds
    ``threshold``.

    Parameters
    ----------
    threshold : float, default 0.95
        Absolute correlation above which a column is dropped.

    Attributes
    ----------
    correlated_features : list
        Names of the columns flagged during ``fit``, in column order.
    errors_ : Exception
        Only set when ``transform`` failed; holds the caught exception.
    """

    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.correlated_features = []

    def fit(self, X: pd.DataFrame, y=None):
        """Find the correlated columns of ``X``. Returns ``self``.

        Only numeric and boolean columns take part in the correlation;
        other columns are never flagged. ``y`` is ignored.
        """
        # Restrict to columns a correlation can be computed for, so a frame
        # that also holds text columns does not make corr() fail.
        numeric = X.select_dtypes(include=[np.number, 'bool'])
        corr_matrix = numeric.corr().abs()
        self.correlated_features = self.find_correlated_features(corr_matrix)

        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` without the flagged columns.

        If dropping fails, the exception is stored in ``errors_``, a
        ``RuntimeWarning`` is emitted and the unreduced copy is returned.
        """
        data = X.copy()
        try:
            data = data.drop(columns=self.correlated_features)
        except Exception as e:
            self.errors_ = e
            # Keep the best-effort contract, but never fail silently.
            warnings.warn(
                f"DropCorrelatedFeatures.transform failed; returning all columns: {e!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return data

    def find_correlated_features(self, corr_matrix):
        """Return the columns whose correlation with an earlier column exceeds ``threshold``.

        ``corr_matrix`` is a square correlation frame; only the entries
        below the diagonal are inspected. The result follows the column
        order of ``corr_matrix``, so it is the same on every run.
        """
        columns = corr_matrix.columns
        correlated_features = []
        for i in range(len(columns)):
            # Row i, columns 0..i-1: correlations with every earlier column.
            earlier = corr_matrix.iloc[i, :i].abs()
            if (earlier > self.threshold).any():
                correlated_features.append(columns[i])

        return correlated_features


In [49]:
# Integer frame written row by row: 'B' and 'D' are 2 * 'A', and 'C' equals 'A'.
df8 = pd.DataFrame(
    [
        [1, 2, 1, 2],
        [2, 4, 2, 4],
        [3, 6, 3, 6],
        [4, 8, 4, 8],
    ],
    columns=['A', 'B', 'C', 'D'],
)

In [50]:
# Default threshold: columns with absolute correlation above 0.95 to an earlier column are dropped.
dropcf = DropCorrelatedFeatures()


In [51]:
# 'B', 'C' and 'D' are exact linear functions of 'A', so only 'A' is kept.
dropcf.fit_transform(df8)

Unnamed: 0,A
0,1
1,2
2,3
3,4
