In [2]:
import pandas as pd 
import numpy as np

In [3]:
# create dataframe
cols = ['TransactionID', 'ClientID', 'Profession', 'Bank_dep', 'Risk', 'Number of credits', 'Revenue']

In [4]:
row_1 = [1,231,'Self-employed', '009', 'High', 2, 30200]
row_2 = [2,765,'students','005','high',3,12700]
row_3 = [3,453,'Horeca','007','medium',5,89400]
row_4 = [4,231,'self-employed','009','high',2,30200]
row_5 = [5,892,'finance','003','low',3,740000]

In [5]:
data = pd.DataFrame([row_1,row_2,row_3,row_4,row_5], columns=cols)

In [6]:
data.head()

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
0,1,231,Self-employed,9,High,2,30200
1,2,765,students,5,high,3,12700
2,3,453,Horeca,7,medium,5,89400
3,4,231,self-employed,9,high,2,30200
4,5,892,finance,3,low,3,740000


### 1. Dummy Encoder

In [7]:
pd.get_dummies(data)

Unnamed: 0,TransactionID,ClientID,Number of credits,Revenue,Profession_Horeca,Profession_Self-employed,Profession_finance,Profession_self-employed,Profession_students,Bank_dep_003,Bank_dep_005,Bank_dep_007,Bank_dep_009,Risk_High,Risk_high,Risk_low,Risk_medium
0,1,231,2,30200,0,1,0,0,0,0,0,0,1,1,0,0,0
1,2,765,3,12700,0,0,0,0,1,0,1,0,0,0,1,0,0
2,3,453,5,89400,1,0,0,0,0,0,0,1,0,0,0,0,1
3,4,231,2,30200,0,0,0,1,0,0,0,0,1,0,1,0,0
4,5,892,3,740000,0,0,1,0,0,1,0,0,0,0,0,1,0


The encoder adds 13 new columns with 0/1 values in it. This method is case sensitive, so data needs to be properly processed not to get problems with lower and upper case letters

### 2. Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder
label_encode = LabelEncoder()
label_encode.fit_transform(data['Profession'])

array([1, 4, 0, 3, 2])

In [9]:
label_encode.fit(data['Profession'])

LabelEncoder()

In [10]:
label_encode.transform(data['Profession'])

array([1, 4, 0, 3, 2])

In [11]:
data['Profession_labels']=label_encode.fit_transform(data['Profession'])
data.head(5)

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,9,High,2,30200,1
1,2,765,students,5,high,3,12700,4
2,3,453,Horeca,7,medium,5,89400,0
3,4,231,self-employed,9,high,2,30200,3
4,5,892,finance,3,low,3,740000,2


### 3. Backward Difference Encoder

In [105]:
#pip install category_encoders

In [20]:
import category_encoders as ce
encoder = ce.BackwardDifferenceEncoder()

In [21]:
encoder.fit(data['Risk'])

BackwardDifferenceEncoder(cols=['Risk'],
                          mapping=[{'col': 'Risk',
                                    'mapping':     Risk_0  Risk_1  Risk_2
 1   -0.75    -0.5   -0.25
 2    0.25    -0.5   -0.25
 3    0.25     0.5   -0.25
 4    0.25     0.5    0.75
-1    0.00     0.0    0.00
-2    0.00     0.0    0.00}])

In [22]:
encoder.fit_transform(data['Risk'])
encoder.get_feature_names()

['intercept', 'Risk_0', 'Risk_1', 'Risk_2']

In [23]:
encoder.transform(data['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1,Risk_2
0,1,-0.75,-0.5,-0.25
1,1,0.25,-0.5,-0.25
2,1,0.25,0.5,-0.25
3,1,0.25,-0.5,-0.25
4,1,0.25,0.5,0.75


In [24]:
data.head(5)

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,9,High,2,30200,1
1,2,765,students,5,high,3,12700,4
2,3,453,Horeca,7,medium,5,89400,0
3,4,231,self-employed,9,high,2,30200,3
4,5,892,finance,3,low,3,740000,2


### 4. BaseN

In [46]:
encoder= ce.BaseNEncoder(cols=['Profession'],return_df=True,base=5)

In [47]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_0,Profession_1
0,0,1
1,0,2
2,0,3
3,0,4
4,1,0


### 5. Binary

In [41]:
encoder= ce.BinaryEncoder(cols=['Profession'],return_df=True)

In [42]:
encoder.fit_transform(data)

Unnamed: 0,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,0,0,1,9,High,2,30200,1
1,2,765,0,1,0,5,high,3,12700,4
2,3,453,0,1,1,7,medium,5,89400,0
3,4,231,1,0,0,9,high,2,30200,3
4,5,892,1,0,1,3,low,3,740000,2


In [43]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_0,Profession_1,Profession_2
0,0,0,1
1,0,1,0
2,0,1,1
3,1,0,0
4,1,0,1


### 6. CatBoost Encoder
It converts an array of categorical data into a mean value of a numerical data which is dependent from that categorical data. Returns and array of numbers, but requires a target value.

In [54]:
encoder = ce.CatBoostEncoder()

In [55]:
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


In [56]:
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,105350.0
4,180500.0


### 7. Count Encoder
Counts how many times feature apears in the column and replace it by the number

In [57]:
encoder = ce.CountEncoder(cols=['Profession'])

In [58]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession
0,1
1,1
2,1
3,1
4,1


In [59]:
encoder.fit_transform(data2['Profession'])

Unnamed: 0,Profession
0,2
1,1
2,1
3,2
4,1


### 8. Generalized Linear Mixed Model Encoder
Similar to Target Encoder but uses more advanced statistical methods, thanks to which it is more precise

In [60]:
encoder = ce.GLMMEncoder()

In [61]:
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [62]:
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,-187875.010232
1,-205375.01045
2,-128675.010445
3,-187875.010232
4,521925.051956


### 9. Hashing
Hashing is the transformation of arbitrary size input in the form of a fixed-size value. It is a one-way process. Hashing has several applications like data retrieval, checking data corruption, and in data encryption also.

In [39]:
encoder=ce.HashingEncoder()

In [40]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0


### 10. Helmert Coding

In [63]:
encoder = ce.HelmertEncoder()
encoder.fit_transform(data['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1,Risk_2
0,1,-1.0,-1.0,-1.0
1,1,1.0,-1.0,-1.0
2,1,0.0,2.0,-1.0
3,1,1.0,-1.0,-1.0
4,1,0.0,0.0,3.0


In [64]:
#In order to check it how it works for processed data I will create new dataframe with clean Risk column. i.e. no diffrence between upercase, lowercase letters.
data3=[risk.lower() for risk in data['Risk']]
data3

['high', 'high', 'medium', 'high', 'low']

In [65]:
data3= pd.DataFrame(data3, columns=['Risk'])
data3

Unnamed: 0,Risk
0,high
1,high
2,medium
3,high
4,low


In [66]:
encoder.fit_transform(data3['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1
0,1,-1.0,-1.0
1,1,-1.0,-1.0
2,1,1.0,-1.0
3,1,-1.0,-1.0
4,1,0.0,2.0


### 11. James-Stein Encoder

In [67]:
encoder = ce.JamesSteinEncoder()

In [68]:
encoder.fit_transform(data2['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,30200.0
1,12700.0
2,89400.0
3,30200.0
4,740000.0


### 12. Leave One Out
Leave one out encoding is just target encoding where the average or expected value is calculated ignoring the value in the current row.

In [70]:
encoder = ce.LeaveOneOutEncoder()
encoder.fit_transform(data2['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,30200.0
1,180500.0
2,180500.0
3,30200.0
4,180500.0


### 13. M-estimate
Simplest version of target encoding

In [72]:
encoder = ce.MEstimateEncoder()

In [73]:
encoder.fit_transform(data['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


### 14. One Hot

In [34]:
encoder=ce.OneHotEncoder(data['Profession'])

In [35]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_1,Profession_2,Profession_3,Profession_4,Profession_5
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1


### 15. Ordinal

In [30]:
#Used for ordinal data. Converts non-numerical data into an integer. Part of Category Encoders Libary
encoder= ce.OrdinalEncoder(data['Risk'])
encoder

OrdinalEncoder(verbose=0      High
1      high
2    medium
3      high
4       low
Name: Risk, dtype: object)

In [31]:
encoder.fit_transform(data['Risk'])

Unnamed: 0,Risk
0,1
1,2
2,3
3,2
4,4


In [32]:
encoder= ce.OrdinalEncoder(data['Risk'],return_df=True, mapping=[{'col':'Risk', 'mapping':{'None':0,'High':3,'high':3,'low':1,'medium':2}}])

In [33]:
#Now I can fit and transform the data 
encoder.fit_transform(data['Risk'])

Unnamed: 0,Risk
0,3
1,3
2,2
3,3
4,1


### 16. Polynomial Coding
Similar to Backward Difference Encoder. This type of coding system should be used only with an ordinal variable in which the levels are equally spaced. Examples of such a variable might be income or education

In [74]:
encoder = ce.PolynomialEncoder(cols=["Risk"])

In [75]:
encoder.fit_transform(data3, verbose=1)

Unnamed: 0,intercept,Risk_0,Risk_1
0,1,-0.7071068,0.408248
1,1,-0.7071068,0.408248
2,1,-4.4337800000000005e-17,-0.816497
3,1,-0.7071068,0.408248
4,1,0.7071068,0.408248


### 17. Sum Coding

In [36]:
encoder = ce.SumEncoder()

In [37]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,intercept,Profession_0,Profession_1,Profession_2,Profession_3
0,1,1.0,0.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0
2,1,0.0,0.0,1.0,0.0
3,1,0.0,0.0,0.0,1.0
4,1,-1.0,-1.0,-1.0,-1.0


In [38]:
encoder.fit_transform(data)

Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Profession_3,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,1,231,1.0,0.0,0.0,0.0,9,High,2,30200,1
1,1,2,765,0.0,1.0,0.0,0.0,5,high,3,12700,4
2,1,3,453,0.0,0.0,1.0,0.0,7,medium,5,89400,0
3,1,4,231,0.0,0.0,0.0,1.0,9,high,2,30200,3
4,1,5,892,-1.0,-1.0,-1.0,-1.0,3,low,3,740000,2


### 18. Target Encoder
It converts an array of categorical data into a mean value of a numerical data which is dependent from that categorical data. Returns and array of numbers, but requires a target value.


In [48]:
encoder=ce.TargetEncoder(cols='Profession') 

In [49]:
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


In [50]:
data2=[profession.lower() for profession in data['Profession']]
data2

['self-employed', 'students', 'horeca', 'self-employed', 'finance']

In [51]:
data2= pd.DataFrame(data2, columns=['Profession'])

In [52]:
data2

Unnamed: 0,Profession
0,self-employed
1,students
2,horeca
3,self-employed
4,finance


In [53]:
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,70621.895632
1,180500.0
2,180500.0
3,70621.895632
4,180500.0


### 19. Weight of Evidence


In [77]:
encoder = ce.WOEEncoder()
#encoder.fit_transform(data2['Profession'], data['Profession_labels'])

### 20. Wrappers

In [80]:
from category_encoders import utils
#encoder = ce.PolynomialWrapper()
#encoder.fit_transform(data['Profession'], data['Revenue'])

### 21. Quantile Encoder

In [96]:
import numpy as np
from sklearn.preprocessing import QuantileTransformer
rng = np.random.RandomState(0)
X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
qt = QuantileTransformer(n_quantiles=10, random_state=0)
qt.fit_transform(X)

array([[0.        ],
       [0.09871873],
       [0.10643612],
       [0.11754671],
       [0.21017437],
       [0.21945445],
       [0.23498666],
       [0.32443642],
       [0.33333333],
       [0.41360794],
       [0.42339464],
       [0.46257841],
       [0.47112236],
       [0.49834237],
       [0.59986536],
       [0.63390302],
       [0.66666667],
       [0.68873101],
       [0.69611125],
       [0.81280699],
       [0.82160354],
       [0.88126439],
       [0.90516028],
       [0.99319435],
       [1.        ]])

### 22. Summary Encoder

In [97]:
from category_encoders import *

In [101]:
#category_encoders.quantile_encoder.SummaryEncoder
#https://contrib.scikit-learn.org/category_encoders/summary.html

In [102]:
#pip install sktools

In [104]:
from sktools import *
import pandas as pd
from sklearn.datasets import load_boston
bunch = load_boston()
y = bunch.target
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
#enc = SummaryEncoder(cols=['CHAS', 'RAD'], quantile=0.5, m=1.0).fit(X, y)
#numeric_dataset = enc.transform(X)
#print(numeric_dataset.info())