In [1]:
import pandas as pd
import numpy as np
from sklearn import base
from sklearn.model_selection import KFold

In [2]:
def getRandomDataFrame(data, numCol):
    """Build a small random DataFrame for the target-encoding demo.

    Parameters
    ----------
    data : str
        ``'train'`` produces the columns ``Feature`` and ``Target``;
        ``'test'`` produces the column ``Feature`` only.
    numCol : int
        Number of ROWS to generate. The name is kept for backward
        compatibility even though it does not count columns.

    Returns
    -------
    pandas.DataFrame
        ``Feature`` holds the strings ``'A'``/``'B'``; ``Target`` (train only)
        holds integers 0/1. Both are drawn with ``np.random.randint``, so call
        ``np.random.seed`` beforehand if reproducible data is needed.

    Raises
    ------
    ValueError
        If ``data`` is neither ``'train'`` nor ``'test'``.
    """
    if data not in ('train', 'test'):
        # Fail loudly instead of printing a placeholder and returning None,
        # which only surfaced later as an unrelated error on the caller's side.
        raise ValueError(
            "data must be 'train' or 'test', got {!r}".format(data)
        )

    # The feature column is drawn first and the target second, so seeded runs
    # consume the global random stream in a fixed order.
    columns = {
        'Feature': ["A" if x == 0 else 'B'
                    for x in np.random.randint(2, size=(numCol,))]
    }
    if data == 'train':
        columns['Target'] = np.random.randint(2, size=(numCol,))

    return pd.DataFrame(columns)

In [3]:
# One-off generation of the demo frames (20 training rows, 5 test rows).
# Left commented out: the notebook reads ./train.csv and ./test.csv from disk
# instead, and getRandomDataFrame draws unseeded random values, so re-running
# these lines would produce different data.
# train = getRandomDataFrame('train',20)
# test = getRandomDataFrame('test',5)

In [4]:
# One-off export of the generated frames to the working directory;
# index=False keeps the row index out of the CSV files.
# train.to_csv('./train.csv',index=False)
# test.to_csv('./test.csv',index=False)

In [4]:
# Load the demo training and test frames from CSV files in the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [5]:
# Display the loaded training frame.
train

Unnamed: 0,Feature,Target
0,A,1
1,B,0
2,B,0
3,B,1
4,B,1
5,A,1
6,B,0
7,A,0
8,A,0
9,B,0


In [6]:
# Per-category mean over the full training frame, grouped by Feature.
train.groupby('Feature').mean()

Unnamed: 0_level_0,Target
Feature,Unnamed: 1_level_1
A,0.6
B,0.3


In [8]:
# Display the loaded test frame.
test

Unnamed: 0,Feature
0,B
1,B
2,B
3,A
4,A


In [9]:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold (K-fold) target encoder for one categorical training column.

    For every fold, each validation row receives the mean of the target taken
    over the *other* folds' rows that share its category. A row's own target
    therefore never contributes to its encoded value.

    Parameters
    ----------
    colnames : str
        Name of the single categorical column to encode.
    targetName : str
        Name of the target column; it must be present in the frame passed to
        ``transform``.
    n_fold : int, default 5
        Number of KFold splits (shuffled with a fixed ``random_state=2019``).
    verbosity : bool, default True
        When true, print the correlation between the encoded column and the
        target.
    discardOriginal_col : bool, default False
        When true, drop the original categorical column (``colnames``) from
        the returned frame.
    """

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """No-op; all work happens in ``transform``. Returns ``self``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with ``<colnames>_Kfold_Target_Enc`` added.

        The input frame is not modified. Categories that do not occur in a
        fold's training part are encoded with the global mean of the target.

        Raises
        ------
        AssertionError
            If ``colnames``/``targetName`` are not strings or are missing
            from ``X``.
        """
        assert isinstance(self.targetName, str), "targetName must be a str"
        assert isinstance(self.colnames, str), "colnames must be a str"
        assert self.colnames in X.columns, "colnames not found in X"
        assert self.targetName in X.columns, "targetName not found in X"

        # Work on a copy so the caller's DataFrame is never mutated.
        X = X.copy()

        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold, shuffle = True, random_state=2019)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'

        # Fill a positional buffer instead of assigning through X.loc with
        # index labels: label alignment breaks on a non-unique index.
        encoded_feature = np.full(len(X), np.nan, dtype=float)
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            encoded_feature[val_ind] = (
                X_val[self.colnames]
                .map(fold_means)
                .to_numpy(dtype=float, na_value=np.nan)
            )

        # Categories unseen in a fold's training part fall back to the global
        # target mean. Done on the array rather than via chained
        # `X[col].fillna(..., inplace=True)`, which does not update the frame
        # under pandas Copy-on-Write.
        encoded_feature = np.where(np.isnan(encoded_feature),
                                   mean_of_target,
                                   encoded_feature)
        X[col_mean_name] = encoded_feature

        if self.verbosity:
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                      self.targetName,
                                                                                      np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # Drop the original categorical column, as the flag name says;
            # the target column stays in the frame.
            X = X.drop(columns=self.colnames)

        return X

In [10]:
# Encode 'Feature' against 'Target' with 5 folds. verbosity defaults to True,
# so the call also prints the correlation between the encoded column and the target.
targetc = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
new_train = targetc.fit_transform(train)

Correlation between the new feature, Feature_Kfold_Target_Enc and, Target is 0.18053954978064135.


In [11]:
# Display the encoder's output frame.
new_train

Unnamed: 0,Feature,Target,Feature_Kfold_Target_Enc
0,A,1,0.571429
1,B,0,0.375
2,B,0,0.333333
3,B,1,0.25
4,B,1,0.333333
5,A,1,0.571429
6,B,0,0.333333
7,A,0,0.625
8,A,0,0.571429
9,B,0,0.375


In [12]:
# Per-category Target mean computed on positional rows 4-19 only.
# NOTE(review): presumably a manual check of one fold's training means; the
# encoder builds its folds with KFold(shuffle=True), so a contiguous slice need
# not coincide with any actual fold -- confirm what this cell is meant to show.
train[['Feature','Target']].iloc[4:20,:].groupby('Feature').mean()

Unnamed: 0_level_0,Target
Feature,Unnamed: 1_level_1
A,0.555556
B,0.285714


In [14]:
# Per-category Target mean over all training rows, restricted explicitly to
# the Feature and Target columns.
train[['Feature','Target']].groupby('Feature').mean()

Unnamed: 0_level_0,Target
Feature,Unnamed: 1_level_1
A,0.6
B,0.3


In [15]:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a K-fold target encoding learned on the training set to new data.

    Each category is encoded with the mean of the already-encoded training
    column taken over the training rows of that category.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that contains both ``colNames`` and ``encodedName``
        (for example the output of ``KFoldTargetEncoderTrain.transform``).
    colNames : str
        Name of the categorical column to encode.
    encodedName : str
        Name of the encoded column; it is read from ``train`` and written to
        the transformed frame.
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """No-op; all work happens in ``transform``. Returns ``self``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the ``encodedName`` column added.

        The input frame is not modified. Categories that never occur in the
        training frame are encoded with the overall mean of the training
        frame's encoded column, so the new column is always numeric.
        """
        # Per-category mean of the encoded training column, indexed by category.
        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()
        # Fallback for categories that are absent from the training frame.
        fallback = self.train[self.encodedName].mean()

        # Work on a copy so the caller's DataFrame is never mutated.
        X = X.copy()
        # `map` yields NaN for unknown categories (instead of leaving the raw
        # label in a numeric column, as a dict-based `replace` would).
        X[self.encodedName] = (
            X[self.colNames]
            .map(category_means)
            .astype(float)
            .fillna(fallback)
        )

        return X
        

In [17]:
# Encode the test frame's 'Feature' column using the per-category means of
# new_train's 'Feature_Kfold_Target_Enc' column.
test_targetc = KFoldTargetEncoderTest(new_train,'Feature','Feature_Kfold_Target_Enc')
test_targetc.fit_transform(test)

Unnamed: 0,Feature,Feature_Kfold_Target_Enc
0,B,0.294048
1,B,0.294048
2,B,0.294048
3,A,0.619841
4,A,0.619841
