In [812]:
import pandas as  pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [813]:
df = pd.DataFrame({'a': np.random.randint(10,100, size=10),
                   'b': np.random.randint(10,100, size=10),
                   'c': np.random.randint(10,100, size=10)})
df

Unnamed: 0,a,b,c
0,92,98,81
1,25,63,45
2,39,52,27
3,90,44,82
4,71,84,47
5,60,76,42
6,13,75,75
7,59,74,14
8,79,53,91
9,44,21,69


In [814]:
class StandardScaler(BaseEstimator,TransformerMixin):
    def __init__(self,variables: list|None = None):
        self.variables = variables
        self.errors_ =None
        
    def fit(self,X: pd.DataFrame, y = None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
        self.mean_ = X[self.numeric_columns_].mean()
        self.std_ = X[self.numeric_columns_].std()
        return self
    
    def transform(self,X: pd.DataFrame,y=None):
        data = X.copy()
        try:
            data[self.numeric_columns_] = (data[self.numeric_columns_] - self.mean_) / self.std_
        except Exception as e:
            self.errors_ = e
        return data

In [815]:
scaler = StandardScaler(['a','b'])
scaler.fit(df)

In [816]:
scaler.transform(df)

Unnamed: 0,a,b,c
0,1.298966,1.531456,81
1,-1.201917,-0.045043,45
2,-0.679344,-0.540514,27
3,1.224313,-0.900856,82
4,0.515107,0.900856,47
5,0.104514,0.540514,42
6,-1.649836,0.495471,75
7,0.067188,0.450428,14
8,0.81372,-0.495471,91
9,-0.492711,-1.936841,69


In [817]:
scaler.mean_
pd.concat([scaler.mean_,scaler.std_],axis=1,keys=['mean','std'])

Unnamed: 0,mean,std
a,57.2,26.790546
b,64.0,22.201101


In [818]:
class MinMaxScaler (BaseEstimator, TransformerMixin):
    def __init__(self, feature_range=(0,1), variables: list|None = None):
        self.feature_range = feature_range
        self.variables = variables 
        self.errors_=None
    
    def fit (self,X : pd.DataFrame, y=None ):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
        self.min_ = X[self.numeric_columns_].min()
        self.max_ = X[self.numeric_columns_].max()
        return self
    
    def transform(self,X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            min_val, max_val = self.feature_range
            data[self.numeric_columns_] = (data[self.numeric_columns_] - self.min_) / (self.max_ - self.min_) * (max_val - min_val) + min_val
        except Exception as e:
            self.errors_ = e
        return data

In [819]:
minimax = MinMaxScaler()
minimax.fit(df)
minimax.transform(df)

Unnamed: 0,a,b,c
0,1.0,1.0,0.87013
1,0.151899,0.545455,0.402597
2,0.329114,0.402597,0.168831
3,0.974684,0.298701,0.883117
4,0.734177,0.818182,0.428571
5,0.594937,0.714286,0.363636
6,0.0,0.701299,0.792208
7,0.582278,0.688312,0.0
8,0.835443,0.415584,1.0
9,0.392405,0.0,0.714286


In [820]:
class winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self,variables: list|None = None,lower_quantile=0.25, upper_quantile=0.75, K=5):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.variables = variables
        self.K = K
        self.errors_ =None

    def fit(self, X: pd.DataFrame , y=None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns 
            
        self.q1=X[self.numeric_columns_].quantile(self.lower_quantile)
        self.q3=X[self.numeric_columns_].quantile(self.upper_quantile)
        self.iqr=self.q3-self.q1
            
        #calculate thresholds
        self.lower_threshold=self.q1-self.K*self.iqr
        self.upper_threshold=self.q3+self.K*self.iqr
        return self

    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            data[self.numeric_columns_]=data[self.numeric_columns_].clip(lower=self.lower_threshold,upper=self.upper_threshold)
        except Exception as e:
            self.errors_ = e
        return data

In [821]:
np.random.seed(0)
df1 = pd.DataFrame({
    'A': np.random.normal(loc=100, scale=20, size=1000),
    'B': np.random.normal(loc=50, scale=10, size=1000),
    'C': np.random.normal(loc=200, scale=30, size=1000),
    'D': np.random.normal(loc=300, scale=40, size=1000)
})

In [822]:
win = winsorizer(lower_quantile=0.05, upper_quantile=0.95, K=2)
win.fit(df1)

In [823]:
transformed = win.transform(df1)

In [824]:
class MeanMedianImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables: list|None = None,imputation_type='mean'):
        self.imputation_type = imputation_type
        self.variables = variables
        self.errors_ =None
        
    def fit(self, X : pd.DataFrame, y=None):
        if self.variables is not None:
            self.numeric_columns_ = pd.Index(self.variables)
        else:
            numeric_columns = X.select_dtypes(include=[np.number]).columns
            self.numeric_columns_ = numeric_columns
        self.mean_ = X[self.numeric_columns_].mean()
        self.median_ = X[self.numeric_columns_].median()
        return self
    
    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            if self.imputation_type == 'mean':
                data[self.numeric_columns_] = data[self.numeric_columns_].fillna(self.mean_)
            else :
                data[self.numeric_columns_] = data[self.numeric_columns_].fillna(self.median_)
        except Exception as e:
            self.errors_ = e
        return data

In [825]:
df2 = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [5, np.nan, 7, 8, 9],
    'C': [np.nan, 12, 13, 14, 15],
    'D': [16, 17, 18, np.nan, 20]
})

In [826]:
df2

Unnamed: 0,A,B,C,D
0,1.0,5.0,,16.0
1,2.0,,12.0,17.0
2,,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [827]:
MMI = MeanMedianImputer(['A','C'],imputation_type= 'median')

In [828]:
MMI.fit(df2)

In [829]:
t = MMI.transform(df2)

In [830]:
t

Unnamed: 0,A,B,C,D
0,1.0,5.0,13.5,16.0
1,2.0,,12.0,17.0
2,3.0,7.0,13.0,18.0
3,4.0,8.0,14.0,
4,5.0,9.0,15.0,20.0


In [831]:
class categoricalImputer(BaseEstimator,TransformerMixin):
    def __init__(self,variables : list|None=None, strategy='most_frequent'):
        self.variables=variables
        self.strategy=strategy
        self.errors_ =None
        
    def fit(self,X: pd.DataFrame,y=None):
        if self.variables is not None:
            self.categorical_columns_ = pd.Index(self.variables)
        else:
            categorical_columns = X.select_dtypes(include=['object']).columns
            self.categorical_columns_ = categorical_columns
        self.fill_values_ = X[self.categorical_columns_].mode().iloc[0]
        return self
    def transform(self,X : pd.DataFrame, y = None ):
        data = X.copy()
        try:
            data[self.categorical_columns_] = data[self.categorical_columns_].fillna(self.fill_values_)
        except Exception as e:
            self.errors_ = e
        return data

In [832]:
df3 = pd.DataFrame({
    'A': ['apple', 'banana', 'apple', np.nan, 'banana'],
    'B': ['red', 'green', np.nan, 'red', 'green'],
    'C': ['small', 'large', 'medium', 'medium', np.nan]
})

In [833]:
CI = categoricalImputer()
CI.fit(df3)

In [834]:
CIT = CI.transform(df3)

In [835]:
CIT

Unnamed: 0,A,B,C
0,apple,red,small
1,banana,green,large
2,apple,green,medium
3,apple,red,medium
4,banana,green,medium


In [836]:
class Count_frequency_encoder(BaseEstimator,TransformerMixin):
    def __init__(self, variables: list | None = None):
        self.variables = variables
        self.errors_ =None

    def fit(self, X, y=None):
        if self.variables is not None:
            self.categorical_variables = self.variables
        else:
            categorical_columns_ = X.select_dtypes(include=['object']).columns
            self.categorical_variables = categorical_columns_
        
        self.encoding_dict_ = {}
        for var in self.categorical_variables:
            value_counts = X[var].value_counts()
            total_count = value_counts.sum()
            self.encoding_dict_[var] = value_counts / total_count 
        return self

    def transform(self, X: pd.DataFrame , y = None):
        data = X.copy()
        try:
            for var in self.categorical_variables:
                data[var] = data[var].map(self.encoding_dict_[var])
        except Exception as e:
            self.errors_ = e
        return data

In [837]:
CFE = Count_frequency_encoder()
CFE.fit(df3)

In [838]:
CFET = CFE.transform(CIT)
CFET

Unnamed: 0,A,B,C
0,0.5,0.5,0.25
1,0.5,0.5,0.25
2,0.5,0.5,0.5
3,0.5,0.5,0.5
4,0.5,0.5,0.5


In [839]:

class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, variables: list |None = None):
        self.variables = variables
        self.categorical_variables = {}
        self.errors_= None
    
    def fit(self, X: pd.DataFrame, y=None):
        if self.variables is not None:
            self.categorical_columns_ = self.variables
        else:
            self.categorical_columns_ = X.select_dtypes(include=['object']).columns
        
        for col in self.categorical_columns_:
            self.categorical_variables[col] = X[col].unique()
        
        print("OneHotEncoder fitted successfully!")
        return self
    
    def transform(self, X: pd.DataFrame, y=None):
        data = X.copy()
        
        try:
            for col in self.categorical_variables:
                categories = self.categorical_variables[col]
                for category in categories:
                    new_col_name = f'{col}_{category}'
                    data[new_col_name] = (data[col] == category).astype(int)
                
                # Drop the original categorical column
                data.drop(col, axis=1, inplace=True)
        except Exception as e:
            self.errors_ = e    
        return data


In [840]:
df4 = pd.DataFrame({
    'A': ['apple', 'banana', 'apple','apple', 'banana'],
    'B': ['red', 'green', 'green', 'red', 'green'],
    'C': ['small', 'large', 'medium', 'medium', 'large']
})

In [841]:
ohe = OneHotEncoder()
ohe.fit(df4)

OneHotEncoder fitted successfully!


In [842]:
df4_ = ohe.transform(df4)
df4_

Unnamed: 0,A_apple,A_banana,B_red,B_green,C_small,C_large,C_medium
0,1,0,1,0,1,0,0
1,0,1,0,1,0,1,0
2,1,0,0,1,0,0,1
3,1,0,1,0,0,0,1
4,0,1,0,1,0,1,0


In [843]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, variables: list | None= None, mapping: dict = None):
        self.variables = variables
        self.mapping = mapping
        self.ordinal_mapping = {}
        self.errors_ =None
    
    def fit(self, X: pd.DataFrame, y=None):
        if self.variables is not None:
            self.categorical_columns_ = self.variables
        else:
            self.categorical_columns_ = X.select_dtypes(include=['object']).columns
        
        if self.mapping is None:
            self.generate_mapping(X)
        else:
            self.ordinal_mapping = self.mapping
        
        return self
    
    def generate_mapping(self, X : pd.DataFrame , y = None):
        for col in self.categorical_columns_:
            unique_values = X[col].unique()
            self.ordinal_mapping[col] = {value: i for i, value in enumerate(unique_values)}
    
    def transform(self, X: pd.DataFrame, y=None):
        data = X.copy()
        
        try:
            for col, mapping in self.ordinal_mapping.items():
                data[col] = data[col].map(mapping)
        except Exception as e:
            self.errors_ = e  
        return data


In [844]:
OrdinalEncoder = OrdinalEncoder()
df4_ordinal = OrdinalEncoder.fit(df4)

In [845]:
df4_ordinal

In [846]:
class DropConstantFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.errors_ =None
    
    def fit(self, X, y=None):
        
        self.columns_to_drop_ = X.columns[X.apply(lambda col: col.value_counts(normalize=True).values[0] >= self.threshold)].tolist()
        return self
    
    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            data = data.drop(columns=self.columns_to_drop_)
        
        except Exception as e:
            self.errors_ = e
        return data

In [847]:
df5 = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [1, 1, 1, 1],
    'C': [2, 3, 2, 3],
    'D': ['a', 'a', 'a', 'a'],
    'E': ['x', 'x', 'x', 'y']
})

In [848]:
DropConstantFeatures = DropConstantFeatures()

In [849]:
df5_dropped = DropConstantFeatures.fit(df5)
df5_dropped

In [850]:
class HighCardinalityImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables : list | None = None, threshold=0.3, fill_value='Other'):
        self.threshold = threshold
        self.variables = variables
        self.fill_value = fill_value
        self.categories_to_replace = {}
        self.errors_ =None
    
    def fit(self, X, y=None):
        if self.variables is not None:
              for col in self.variables:
                # Calculate the frequency of each category
                freq = X[col].value_counts(normalize=True)
                
                cumulative_freq = freq.cumsum()
                infrequent_categories = cumulative_freq[cumulative_freq < self.threshold].index.tolist()
                
                # Store these categories for replacement
                self.categories_to_replace[col] = infrequent_categories
        
        else:
            for col in X.columns:
           
                freq = X[col].value_counts(normalize=True)
            
            cumulative_freq = freq.cumsum()
            infrequent_categories = cumulative_freq[cumulative_freq < self.threshold].index.tolist()
            
            self.categories_to_replace[col] = infrequent_categories
        
        return self
    
    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            for col, categories in self.categories_to_replace.items():
                data[col] = data[col].apply(lambda x: self.fill_value if x in categories else x)
        except Exception as e:
            self.errors_ = e
        return data

In [851]:
df6 = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Yellow', 'Purple', 'Blue', 'Green'],
    'Shape': ['Circle', 'Square', 'Triangle', 'Circle', 'Circle', 'Square', 'Square'],
    'Size': ['Small', 'Medium', 'Large', 'Small', 'Small', 'Medium', 'Medium']
})

In [852]:
hic = HighCardinalityImputer(threshold=0.7)

In [853]:
df6_ = hic.fit(df6)

In [854]:
class DropDuplicateFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.duplicate_columns = []
        self.errors_ =None
    
    def fit(self, X, y=None):
        self.duplicate_columns = self.find_duplicate_columns(X)
        return self
    
    def transform(self, X : pd.DataFrame, y=None):
        data = X.copy()
        try:
            data = data.drop(columns=self.duplicate_columns)
        except Exception as e:
            self.errors_ = e
        return data
    
    def find_duplicate_columns(self, X : pd.DataFrame):
        duplicate_columns = []
        for i in range(X.shape[1]):
            col1 = X.iloc[:, i]
            for j in range(i+1, X.shape[1]):
                col2 = X.iloc[:, j]
                if col1.equals(col2):
                    duplicate_columns.append(X.columns[j])
                    
        return duplicate_columns


In [855]:
df7_ = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [1, 2, 3],
    'C': [4, 5, 6],
    'D': [4, 5, 6],
    'E': [7, 8, 9]
})

In [856]:
dropper = DropDuplicateFeatures()
df_fil = dropper.fit_transform(df7_)

In [857]:
df_fil

Unnamed: 0,A,C,E
0,1,4,7
1,2,5,8
2,3,6,9


In [858]:
class DropCorrelatedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.correlated_features = []
        self.errors_ =None
    
    def fit(self, X: pd.DataFrame, y=None):

        corr_matrix = X.corr().abs()
        self.correlated_features = self.find_correlated_features(corr_matrix)
        
        return self
    
    def transform(self, X: pd.DataFrame, y=None):
        data = X.copy()
        try:
            data = data.drop(columns=self.correlated_features)
        except Exception as e:
            self.errors_ = e
        return data
    
    def find_correlated_features(self, corr_matrix):
        correlated_features = set()
        
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > self.threshold:
                    colname = corr_matrix.columns[i]
                    correlated_features.add(colname)
        
        return list(correlated_features)


In [859]:
df8 = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [2, 4, 6, 8],
    'C': [1, 2, 3, 4],
    'D': [2, 4, 6, 8]
})

In [860]:
dropcf = DropCorrelatedFeatures()


In [861]:
dropcf.fit_transform(df8)

Unnamed: 0,A
0,1
1,2
2,3
3,4


In [862]:
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lambda_range=(0, 1.0)):
        self.lambda_range = lambda_range
        self.lambda_ = None
        self.log_likelihood_ = None
        self.errors_ = None
        
    def fit(self, X, y=None):
        try:
            best_lambda = None
            best_log_likelihood = float('-inf')

            for lambda_value in np.linspace(*self.lambda_range, num=100):
                transformed_data = self._boxcox_transform(X, lambda_value)
                log_likelihood = self._log_likelihood(transformed_data)

                if log_likelihood > best_log_likelihood:
                    best_log_likelihood = log_likelihood
                    best_lambda = lambda_value

            self.lambda_ = best_lambda
            self.log_likelihood_ = best_log_likelihood
        except Exception as e:
            self.errors_ = e
        return self
   
    def transform(self, X):
        try:
            return self._boxcox_transform(X, self.lambda_)
        except Exception as e:
            self.errors_ = e
            return None
   
    def _boxcox_transform(self, X, lambda_):
        try:
            if lambda_ == 0:
                return np.log(X)
            else:
                return (X**lambda_ - 1) / lambda_
        except Exception as e:
            self.errors_ = e
            return None
   
    def _log_likelihood(self, X):
        try:
            n = X.shape[0]
            if self.lambda_ is None or self.lambda_ == 0:
                return -n / 2 * (1 + np.log(2 * np.pi) + np.log(np.mean(X ** 2)))
            else:
                return n / 2 * (np.log(self.lambda_ / (2 * np.pi)) + (self.lambda_ - 1) * np.mean(np.log(X)))
        except Exception as e:
            self.errors_ = e
            return None

In [863]:
df9 = pd.DataFrame({
    'A': [1, 2, 3, None],  
    'B': [2, 4, 6, 8],
    'C': [1, 2, 3, 4],
    'D': [2, 4, 6, 8]
})

In [864]:
box= BoxCoxTransformer()

In [865]:
transformed=box.fit_transform(df9)
transformed

Unnamed: 0,A,B,C,D
0,0.0,0.693147,0.0,0.693147
1,0.693147,1.386294,0.693147,1.386294
2,1.098612,1.791759,1.098612,1.791759
3,,2.079442,1.386294,2.079442


In [866]:
class YeoJohnsonTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lambda_range=(-3.0, 3.0)):
        self.lambda_range = lambda_range
        self.lambda_ = None
        self.log_likelihood_ = None
        self.errors_ = None
        
    def fit(self, X, y=None):
        try:
            best_lambda = None
            best_log_likelihood = float('-inf')

            for lambda_value in np.linspace(*self.lambda_range, num=100):
                transformed_data = self._yeo_johnson_transform(X, lambda_value)
                log_likelihood = self._log_likelihood(transformed_data)

                if log_likelihood > best_log_likelihood:
                    best_log_likelihood = log_likelihood
                    best_lambda = lambda_value

            self.lambda_ = best_lambda
            self.log_likelihood_ = best_log_likelihood
        except Exception as e:
            self.errors_ = e
        return self
   
    def transform(self, X):
        try:
            return self._yeo_johnson_transform(X, self.lambda_)
        except Exception as e:
            self.errors_ = e
            return None
   
    def _yeo_johnson_transform(self, X, lambda_):
        try:
            if lambda_ == 0:
                return np.log1p(X)
            else:
                if lambda_ < 0:  
                    offset = 0.5 
                    X += offset  
                    transformed = ((X)**lambda_ - 1) / lambda_
                    transformed -= transformed.min() + 1  
                    return transformed
                else:  
                    return (X**lambda_ - 1) / lambda_
        except Exception as e:
            self.errors_ = e
            return None
   
    def _log_likelihood(self, X):
        try:
            n = X.shape[0]
            if self.lambda_ is None or self.lambda_ == 0:
                return -n / 2 * (1 + np.log(2 * np.pi) + np.log(np.mean(X ** 2)))
            else:
                return n / 2 * (np.log(self.lambda_ / (2 * np.pi)) + (self.lambda_ - 1) * np.mean(np.log(X)))
        except Exception as e:
            self.errors_ = e
            return None

In [867]:
df10 = pd.DataFrame({
    'A': [1, -2, 3, None],  
    'B': [2, 4, 6, 8],
    'C': [1, 2, 3, -4],
    'D': [2, 4, 6, 8]
})


In [868]:
yeo = YeoJohnsonTransformer()

In [869]:
yeo_fitted=yeo.fit_transform(df10)
yeo_fitted

Unnamed: 0,A,B,C,D
0,-0.891015,-1.0,-0.810074,-1.0
1,-1.0,-0.936571,-0.776553,-0.936571
2,-0.825207,-0.877427,-0.744265,-0.877427
3,,-0.822031,-1.0,-0.822031


In [870]:
class RareLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.1):
        self.threshold = threshold
        self.rare_category = "Rare"
        self.errors_= None

    def fit(self, X, y=None):
        try:
            label_counts = X.value_counts(normalize=True)
            self.rare_labels = label_counts[label_counts < self.threshold].index
        except Exception as e:
            raise ValueError
        return self

    def transform(self, X):
        try:
            X_encoded = X.copy()
            X_encoded.loc[X.isin(self.rare_labels)] = self.rare_category
            return X_encoded
        except Exception as e:
            raise ValueError

In [871]:
data = {"Category": ["A", "B", "C", "D", "E", "F", "A", "B", "C", "D"]}
df = pd.DataFrame(data)

In [872]:
rare_encoder = RareLabelEncoder(threshold=0.134)

In [873]:
df_encoded = rare_encoder.fit_transform(df["Category"])

print(df_encoded)

0       A
1       B
2       C
3       D
4    Rare
5    Rare
6       A
7       B
8       C
9       D
Name: Category, dtype: object


In [874]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, base=10):
        self.base = base
        self.errors_= None

    def fit(self, X, y=None):
        
        return self

    def transform(self, X):
       
        try:
          
            X_transformed = np.log(X + 1e-10) / np.log(self.base)
        except ValueError:

            raise ValueError
        
        return X_transformed

In [875]:
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
log_transformer = LogTransformer(base=2)
transformed_data = log_transformer.transform(data)

In [876]:
data

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [877]:
transformed_data

array([[1.44269516e-10, 1.00000000e+00, 1.58496250e+00],
       [2.00000000e+00, 2.32192809e+00, 2.58496250e+00],
       [2.80735492e+00, 3.00000000e+00, 3.16992500e+00]])

In [879]:
class KBinsDiscretizer(BaseEstimator, TransformerMixin):
    def __init__(self, n_bins=5, strategy='uniform', encode='ordinal'):
        self.n_bins = n_bins
        self.strategy = strategy
        self.encode = encode
        self.errors_= None

    def fit(self, X, y=None):
        
        if self.strategy == 'uniform':
            self.bin_edges_ = np.linspace(np.min(X), np.max(X), self.n_bins + 1)[1:-1]
        elif self.strategy == 'quantile':
            self.bin_edges_ = np.percentile(X, np.linspace(0, 100, self.n_bins + 1)[1:-1])
        elif self.strategy == 'kmeans':
            from sklearn.cluster import KMeans
            kmeans = KMeans(n_clusters=self.n_bins)
            kmeans.fit(X)
            self.bin_edges_ = np.sort(kmeans.cluster_centers_.flatten())

        return self

    def transform(self, X):
        
        binned_data = np.digitize(X, bins=self.bin_edges_)

        
        if self.encode == 'onehot':
            from sklearn.preprocessing import OneHotEncoder
            encoder = OneHotEncoder(sparse=False)
            binned_data = encoder.fit_transform(binned_data.reshape(-1, 1))
        return binned_data

In [880]:
data = np.array([[1.2], [2.4], [3.6], [4.8], [5.0]])

In [882]:
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit(data)