In [1]:
# Notebook setup: extend the import search path and enable automatic module reloading.
import sys
# Inserted at index 0 so this directory is searched before every other entry on sys.path.
# NOTE(review): absolute, machine-specific path (presumably a local cobra checkout) —
# adjust it, or install the package instead, before running on another machine.
sys.path.insert(0, "/home/patrick/Git/cobra/cobra/")
# autoreload mode 2: re-import modules before running each cell, so edits made to
# the library source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as  pd
import numpy as np
from  cobra.preprocessing import TargetEncoder 


    Hi, welcome to Cobra!
    You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub:
    https://github.com/PythonPredictions/cobra/tree/master/tutorials
        
  from .autonotebook import tqdm as notebook_tqdm


# Defining the data

In [3]:
# Seed NumPy's legacy global generator so the random sample below is reproducible.
np.random.seed(42)

# Each explanatory variable gets 15 observed values followed by 5 missing ones;
# both targets are fully observed. The random draws are evaluated in dict order,
# so the order of the keys determines the generated values.
data = pd.DataFrame(
    {
        "numerical_var": np.random.uniform(0, 50, 15).tolist() + [np.nan] * 5,
        "categorical_var": (
            np.random.choice(["low", "middle", "high"], 15).tolist() + [np.nan] * 5
        ),
        "binary_target": np.random.choice([True, False], 20),
        "numerical_target": np.random.uniform(0, 10, 20),
    }
)

# Bin the numerical variable into 5 equal-width intervals (missing values stay missing).
data["numerical_var_binned"] = pd.cut(data["numerical_var"], bins=5, precision=0)

data.head()

Unnamed: 0,numerical_var,categorical_var,binary_target,numerical_target,numerical_var_binned
0,18.727006,low,False,3.854165,"(11.0, 20.0]"
1,47.535715,low,True,0.159663,"(39.0, 48.0]"
2,36.599697,middle,False,2.308938,"(30.0, 39.0]"
3,29.932924,middle,True,2.410255,"(30.0, 39.0]"
4,7.800932,low,False,6.832635,"(1.0, 11.0]"


In this data we have several missing values in the columns `numerical_var` and `categorical_var`. Because of this, the target encoder cannot assign an incidence value (for a binary target) or a mean value (for a numeric target) to those observations. The imputation strategy then defines how those encoded values should be estimated (possibilities: `"min"`, `"max"`, `"mean"`, `"median"`).  
The missing encoded values are then replaced by the `"min"`, `"max"`, `"mean"` or `"median"` of the encoded variable.   

In [4]:
# Target-encode both predictors against the binary target, using the
# "median" imputation strategy.
t_encoder = TargetEncoder(imputation_strategy="median")
encoded_data = t_encoder.fit_transform(
    data,
    column_names=["numerical_var_binned", "categorical_var"],
    target_column="binary_target",
)
encoded_data

The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.
Fitting target encoding...: 100%|██████████| 2/2 [00:00<00:00, 251.07it/s]
Applying target encoding...: 100%|██████████| 2/2 [00:00<00:00, 169.43it/s]


Unnamed: 0,numerical_var,categorical_var,binary_target,numerical_target,numerical_var_binned,numerical_varned_enc,categorical_var_enc
0,18.727006,low,False,3.854165,"(11.0, 20.0]",0.5,0.4
1,47.535715,low,True,0.159663,"(39.0, 48.0]",1.0,0.4
2,36.599697,middle,False,2.308938,"(30.0, 39.0]",0.75,0.8
3,29.932924,middle,True,2.410255,"(30.0, 39.0]",0.75,0.8
4,7.800932,low,False,6.832635,"(1.0, 11.0]",0.4,0.4
5,7.799726,low,False,6.099967,"(1.0, 11.0]",0.4,0.4
6,2.904181,low,True,8.331949,"(1.0, 11.0]",0.4,0.4
7,43.308807,high,True,1.733647,"(39.0, 48.0]",1.0,0.8
8,30.055751,high,True,3.910606,"(30.0, 39.0]",0.75,0.8
9,35.403629,high,True,1.822361,"(30.0, 39.0]",0.75,0.8


In [5]:
# Median of each encoded column, computed only over rows without any missing value.
(
    encoded_data
    .dropna()
    .loc[:, ["numerical_varned_enc", "categorical_var_enc"]]
    .median()
)

numerical_varned_enc    0.75
categorical_var_enc     0.80
dtype: float64

As shown above, for `numerical_varned_enc` and `categorical_var_enc` the imputed value is the median of the encoded values over the observations where `numerical_var` and `categorical_var`, respectively, are not missing.

**So only the values where we have an encoded value are taken into account**

In [33]:
# Same encoding as before, but with the "mean" imputation strategy.
t_encoder = TargetEncoder(imputation_strategy="mean")
encoded_data2 = t_encoder.fit_transform(
    data,
    column_names=["numerical_var_binned", "categorical_var"],
    target_column="binary_target",
)
encoded_data2

The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.
Fitting target encoding...: 100%|██████████| 2/2 [00:12<00:00,  6.05s/it]
Applying target encoding...: 100%|██████████| 2/2 [00:00<00:00, 119.55it/s]


Unnamed: 0,numerical_var,categorical_var,binary_target,numerical_target,numerical_var_binned,numerical_varned_enc,categorical_var_enc
0,18.727006,low,False,3.854165,"(11.0, 20.0]",0.5,0.4
1,47.535715,low,True,0.159663,"(39.0, 48.0]",1.0,0.4
2,36.599697,middle,False,2.308938,"(30.0, 39.0]",0.75,0.8
3,29.932924,middle,True,2.410255,"(30.0, 39.0]",0.75,0.8
4,7.800932,low,False,6.832635,"(1.0, 11.0]",0.4,0.4
5,7.799726,low,False,6.099967,"(1.0, 11.0]",0.4,0.4
6,2.904181,low,True,8.331949,"(1.0, 11.0]",0.4,0.4
7,43.308807,high,True,1.733647,"(39.0, 48.0]",1.0,0.8
8,30.055751,high,True,3.910606,"(30.0, 39.0]",0.75,0.8
9,35.403629,high,True,1.822361,"(30.0, 39.0]",0.75,0.8


In [49]:
# Overall mean of the binary target across all rows.
encoded_data2["binary_target"].mean()

0.55

**In the mean implementation, the overall global mean of the target variable is taken**

Is this correct, or should we here also take the mean over only the known ones?