### k-Fold TargetEncoder

In [35]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

In [36]:
class KFoldTargetEncoderTrain():    
    
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):  
        
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col    
        
    def fit(self, X, y=None):
        
        return self    
    
    def transform(self,X):   
        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)   
        
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True, random_state=2019) # установлено перемешивание, для классической схемы его нет       
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan        
        
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames]\
                                                    .map(X_tr.groupby(self.colnames)[self.targetName]\
                                                         .mean())            
            X[col_mean_name].fillna(mean_of_target, inplace = True)        
            
            if self.verbosity:            
                encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName,                    
                   np.corrcoef(X[self.targetName].values,
                               encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X

In [37]:
class KFoldTargetEncoderTest():
    
    def __init__(self,train,colNames,encodedName):
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
            
    def fit(self, X, y=None):
        return self
        
    def transform(self,X):
            
        mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().reset_index()
        
        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
        
        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})   
        
        return X

##### Пример использования

In [109]:
import random

cat = ['a','b']
random.seed(23)
x = np.array(random.choices(cat, k=14))

random.seed(12)
x_1 = np.array(random.choices(cat, k=14))
y = np.array([1,1,1,1,1,0,0,1,1,1,1,0,0,0])

In [110]:
X_train = pd.DataFrame({'x':x, 
                        'y': y})

X_test = pd.DataFrame({'x':x_1})

In [111]:
X_train.head()

Unnamed: 0,x,y
0,b,1
1,b,1
2,b,1
3,a,1
4,b,1


In [112]:
X_test.head()

Unnamed: 0,x
0,a
1,b
2,b
3,a
4,a


In [113]:
targetc = KFoldTargetEncoderTrain('x','y',n_fold=2)
new_train = targetc.transform(X_train)

Correlation between the new feature, x_Kfold_Target_Enc and, y is -0.36181361349331637.
Correlation between the new feature, x_Kfold_Target_Enc and, y is -0.3499900937926354.


In [114]:
new_train

Unnamed: 0,x,y,x_Kfold_Target_Enc
0,b,1,0.5
1,b,1,0.5
2,b,1,1.0
3,a,1,0.4
4,b,1,0.5
5,a,0,0.75
6,b,0,1.0
7,a,1,0.4
8,a,1,0.75
9,a,1,0.4


In [115]:
test_targetc = KFoldTargetEncoderTest(new_train,
                                      'x','x_Kfold_Target_Enc')
new_test = test_targetc.transform(X_test)

In [116]:
new_test

Unnamed: 0,x,x_Kfold_Target_Enc
0,a,0.594444
1,b,0.7
2,b,0.7
3,a,0.594444
4,a,0.594444
5,a,0.594444
6,a,0.594444
7,b,0.7
8,b,0.7
9,b,0.7


In [52]:
freqs_cat = X.groupby(x)['x'].count()

In [56]:
freqs_cat

a    5
b    9
Name: x, dtype: int64

In [54]:
encoded_col_train = X['x'].map(freqs_cat)

In [55]:
encoded_col_train

0     5
1     5
2     5
3     5
4     5
5     9
6     9
7     9
8     9
9     9
10    9
11    9
12    9
13    9
Name: x, dtype: int64