# Multiclass Target Encoding

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Pool of category values that the feature column will later be sampled from.
footballers = ["Lionel Messi", "Cristiano Ronaldo", "Xavi"]

# Seed the global NumPy generator so the draw below is repeatable, then draw
# 20 integer class labels from the half-open range [0, 3).
np.random.seed(999)
target = list(np.random.randint(low=0, high=3, size=20))

In [5]:
# Re-seed, then draw 20 random positions into the current `footballers` list
# and replace the list with the sampled values.
# NOTE: this overwrites `footballers`, so running the cell a second time
# samples from the 20-element result rather than from the original pool.
np.random.seed(123)
picks = np.random.randint(0, len(footballers), 20)
footballers = [footballers[k] for k in picks]

In [6]:
# Assemble the toy dataset: one categorical feature column and one class-label column.
toy_columns = dict(footballers=footballers, target=target)
df = pd.DataFrame(toy_columns)
df

Unnamed: 0,footballers,target
0,Xavi,0
1,Cristiano Ronaldo,0
2,Xavi,1
3,Xavi,1
4,Lionel Messi,0
5,Xavi,1
6,Xavi,1
7,Cristiano Ronaldo,0
8,Xavi,1
9,Cristiano Ronaldo,1


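## 1. Calculating by Mean

Baseline: replace each category with the mean of its raw `target` values. Because this averages the class labels 0, 1 and 2 as if they were numbers, the result is only meaningful when the classes are ordered.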

In [7]:
# For every category, count its rows and average its raw `target` values.
# The result is indexed by category, with one column per aggregate.
stats = df.groupby('footballers')['target'].agg(['count', 'mean'])
stats

Unnamed: 0_level_0,count,mean
footballers,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristiano Ronaldo,7,0.714286
Lionel Messi,4,0.5
Xavi,9,1.0


In [8]:
# Attach the per-category mean of `target` to every row as `encoded_mean`.
# The value is looked up through the `footballers` key, so rows whose category
# is missing from `stats` get NaN (same as a left join).
# Using map + assign instead of join + rename keeps the cell idempotent:
# re-running it overwrites `encoded_mean` rather than appending a second
# column with the same label.
df = df.assign(encoded_mean=df['footballers'].map(stats['mean']))
df

Unnamed: 0,footballers,target,encoded_mean
0,Xavi,0,1.0
1,Cristiano Ronaldo,0,0.714286
2,Xavi,1,1.0
3,Xavi,1,1.0
4,Lionel Messi,0,0.5
5,Xavi,1,1.0
6,Xavi,1,1.0
7,Cristiano Ronaldo,0,0.714286
8,Xavi,1,1.0
9,Cristiano Ronaldo,1,0.714286


## 2. Category Encoders `TargetEncoder` (scikit-learn-contrib)
https://contrib.scikit-learn.org/category_encoders/targetencoder.html

In [12]:
# TargetEncoder is provided by the category_encoders package (see the link above).
from category_encoders import TargetEncoder
# No arguments are passed, so the encoder runs with the library's default settings.
encoder = TargetEncoder()

In [13]:
# Fit the encoder on the category column against the raw class label and store
# the encoded values next to the manual `encoded_mean` column for comparison.
player_col = df['footballers']
label_col = df['target']
df['encoded_sklearn'] = encoder.fit_transform(player_col, label_col)

In [14]:
# Display the frame to compare the manual `encoded_mean` column with `encoded_sklearn`.
df

Unnamed: 0,footballers,target,encoded_mean,encoded_sklearn
0,Xavi,0,1.0,0.999933
1,Cristiano Ronaldo,0,0.714286,0.714498
2,Xavi,1,1.0,0.999933
3,Xavi,1,1.0,0.999933
4,Lionel Messi,0,0.5,0.514228
5,Xavi,1,1.0,0.999933
6,Xavi,1,1.0,0.999933
7,Cristiano Ronaldo,0,0.714286,0.714498
8,Xavi,1,1.0,0.999933
9,Cristiano Ronaldo,1,0.714286,0.714498


## 3. Using Probabilities

In [23]:
# Distinct category values and distinct class labels, in order of first appearance.
categories = df['footballers'].unique()
targets = df['target'].unique()

# Build one record per category: its row count plus how often each class label
# occurs among its rows.
cat_list = []
for cat in categories:
    label_freq = df.loc[df['footballers'] == cat, 'target'].value_counts()
    record = {'category': cat, 'count': label_freq.sum()}
    # Labels that never occur for this category get an explicit 0.
    record.update({'target_' + str(t): label_freq.get(t, 0) for t in targets})
    cat_list.append(record)

In [24]:
# Turn the per-category records into a table (one row per category).
# NOTE: `cat_list` is rebound from a list of dicts to a DataFrame here.
cat_list = pd.DataFrame(data=cat_list)

In [25]:
# Convert each per-class count into a proportion of the category's total rows.
for label in targets:
    suffix = str(label)
    cat_list['prob_target_' + suffix] = cat_list['target_' + suffix].div(cat_list['count'])

In [26]:
# Display the per-category counts and class proportions.
cat_list

Unnamed: 0,category,count,target_0,target_1,target_2,prob_target_0,prob_target_1,prob_target_2
0,Xavi,9,2,5,2,0.222222,0.555556,0.222222
1,Cristiano Ronaldo,7,3,3,1,0.428571,0.428571,0.142857
2,Lionel Messi,4,3,0,1,0.75,0.0,0.25


In [28]:
# Attach the per-category class proportions (`prob_target_*`) to every row.
# Only the probability columns are taken from `cat_list`; the raw counts stay behind.
prob_columns = ['prob_target_' + str(t) for t in targets]
class_probs = cat_list.set_index('category')[prob_columns]
# Dropping any probability columns left over from a previous run keeps the cell
# re-runnable: `join` raises on overlapping column names when no suffix is given.
df = df.drop(columns=prob_columns, errors='ignore').join(class_probs, on='footballers', how='left')
df

Unnamed: 0,footballers,target,encoded_mean,encoded_sklearn,prob_target_0,prob_target_1,prob_target_2
0,Xavi,0,1.0,0.999933,0.222222,0.555556,0.222222
1,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857
2,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222
3,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222
4,Lionel Messi,0,0.5,0.514228,0.75,0.0,0.25
5,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222
6,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222
7,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857
8,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222
9,Cristiano Ronaldo,1,0.714286,0.714498,0.428571,0.428571,0.142857


## 4. Using Category Encoders `TargetEncoder`, One Class at a Time

In [31]:
# Fit one fresh encoder per class label against a binary "is this class" target,
# producing one encoded column per class.
targets = df['target'].unique()
for label in targets:
    # 1 where the row's label equals `label`, 0 otherwise.
    one_vs_rest = df['target'].eq(label).astype('int64')
    encoder = TargetEncoder()
    df['sklearn_target_' + str(label)] = encoder.fit_transform(df['footballers'], one_vs_rest)

In [32]:
# Display the frame, now including the per-class `sklearn_target_*` columns.
df

Unnamed: 0,footballers,target,encoded_mean,encoded_sklearn,prob_target_0,prob_target_1,prob_target_2,sklearn_target_0,sklearn_target_1,sklearn_target_2
0,Xavi,0,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
1,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998
2,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
3,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
4,Lionel Messi,0,0.5,0.514228,0.75,0.0,0.25,0.733401,0.01897,0.247629
5,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
6,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
7,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998
8,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215
9,Cristiano Ronaldo,1,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998


## 5. Using Mean Encoding, One Class at a Time

In [34]:
# Manual one-vs-rest mean encoding: for each class label, the encoded value of a
# row is the share of rows in its category that carry that label.
targets = df['target'].unique()
for t in targets:
    # Binary indicator: 1 where the row's label equals `t`, 0 otherwise.
    is_class_t = df['target'].eq(t).astype('int64')
    # Per-category mean of the indicator, broadcast back to row level.
    # transform() avoids the scratch column and join of the earlier version,
    # leaves the `stats` table from section 1 untouched, and makes the cell
    # re-runnable (assign overwrites instead of adding a duplicate column).
    class_share = is_class_t.groupby(df['footballers']).transform('mean')
    df = df.assign(**{'encoded_mean_target_' + str(t): class_share})

In [35]:
# Display the final frame with every encoding variant side by side.
df

Unnamed: 0,footballers,target,encoded_mean,encoded_sklearn,prob_target_0,prob_target_1,prob_target_2,sklearn_target_0,sklearn_target_1,sklearn_target_2,encoded_mean_target_0,encoded_mean_target_1,encoded_mean_target_2
0,Xavi,0,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
1,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
2,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
3,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
4,Lionel Messi,0,0.5,0.514228,0.75,0.0,0.25,0.733401,0.01897,0.247629,0.75,0.0,0.25
5,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
6,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
7,Cristiano Ronaldo,0,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
8,Xavi,1,1.0,0.999933,0.222222,0.555556,0.222222,0.222282,0.555503,0.222215,0.222222,0.555556,0.222222
9,Cristiano Ronaldo,1,0.714286,0.714498,0.428571,0.428571,0.142857,0.428501,0.428501,0.142998,0.428571,0.428571,0.142857
