In [1]:
import pandas as pd 
import numpy as np

In [1]:
#First I create dataframe as in readme file 
cols = ['TransactionID', 'ClientID', 'Profession', 'Bank_dep', 'Risk', 'Number of credits', 'Revenue']

In [6]:
row_1 = [1,231,'Self-employed', '009', 'High', 2, 30200]
row_2 = [2,765,'students','005','high',3,12700]
row_3 = [3,453,'Horeca','007','medium',5,89400]
row_4 = [4,231,'self-employed','009','high',2,30200]
row_5 = [5,892,'finance','003','low',3,740000]

In [7]:
data = pd.DataFrame([row_1,row_2,row_3,row_4,row_5], columns=cols)

In [8]:
data

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
0,1,231,Self-employed,9,High,2,30200
1,2,765,students,5,high,3,12700
2,3,453,Horeca,7,medium,5,89400
3,4,231,self-employed,9,high,2,30200
4,5,892,finance,3,low,3,740000


### Dummy Encoder

In [9]:
pd.get_dummies(data)

Unnamed: 0,TransactionID,ClientID,Number of credits,Revenue,Profession_Horeca,Profession_Self-employed,Profession_finance,Profession_self-employed,Profession_students,Bank_dep_003,Bank_dep_005,Bank_dep_007,Bank_dep_009,Risk_High,Risk_high,Risk_low,Risk_medium
0,1,231,2,30200,0,1,0,0,0,0,0,0,1,1,0,0,0
1,2,765,3,12700,0,0,0,0,1,0,1,0,0,0,1,0,0
2,3,453,5,89400,1,0,0,0,0,0,0,1,0,0,0,0,1
3,4,231,2,30200,0,0,0,1,0,0,0,0,1,0,1,0,0
4,5,892,3,740000,0,0,1,0,0,1,0,0,0,0,0,1,0


The encoder adds 13 new columns with 0/1 values in it. This method is case sensitive, so data needs to be properly processed not to get problems with lower and upper case letters (for example) 


### Label Encoder

In [17]:
from sklearn.preprocessing import LabelEncoder

In [19]:
lbl_ebcode = LabelEncoder()
lbl_ebcode.fit_transform(data['Profession'])

array([1, 4, 0, 3, 2])

In [34]:
lbl_ebcode.fit(data['Profession'])

LabelEncoder()

In [36]:
lbl_ebcode.transform(data['Profession'])

array([1, 4, 0, 3, 2])

In [21]:
data['Profession_labels']=lbl_ebcode.fit_transform(data['Profession'])
data.head(5)

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,9,High,2,30200,1
1,2,765,students,5,high,3,12700,4
2,3,453,Horeca,7,medium,5,89400,0
3,4,231,self-employed,9,high,2,30200,3
4,5,892,finance,3,low,3,740000,2


# CATEGORY ENCODERS

## Backward Difference Encoder

In [2]:
import category_encoders as ce

In [25]:

encoder = ce.BackwardDifferenceEncoder()

In [26]:
encoder.fit(data['Risk'])

BackwardDifferenceEncoder(cols=['Risk'],
                          mapping=[{'col': 'Risk',
                                    'mapping':     Risk_0  Risk_1  Risk_2
 1   -0.75    -0.5   -0.25
 2    0.25    -0.5   -0.25
 3    0.25     0.5   -0.25
 4    0.25     0.5    0.75
-1    0.00     0.0    0.00
-2    0.00     0.0    0.00}])

In [27]:
encoder.fit_transform(data['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1,Risk_2
0,1,-0.75,-0.5,-0.25
1,1,0.25,-0.5,-0.25
2,1,0.25,0.5,-0.25
3,1,0.25,-0.5,-0.25
4,1,0.25,0.5,0.75


In [30]:
encoder.get_feature_names()

['intercept', 'Risk_0', 'Risk_1', 'Risk_2']

In [31]:
encoder.transform(data['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1,Risk_2
0,1,-0.75,-0.5,-0.25
1,1,0.25,-0.5,-0.25
2,1,0.25,0.5,-0.25
3,1,0.25,-0.5,-0.25
4,1,0.25,0.5,0.75


In [32]:
data.head(5)

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,9,High,2,30200,1
1,2,765,students,5,high,3,12700,4
2,3,453,Horeca,7,medium,5,89400,0
3,4,231,self-employed,9,high,2,30200,3
4,5,892,finance,3,low,3,740000,2


## BaseN Encoder

In [38]:
#BaseN Encoder uses the same libary as Bakward Difference Coding
encoder = ce.BaseNEncoder(cols=[...])

In [41]:
encoder.basen_to_integer(data)

TypeError: basen_to_integer() missing 2 required positional arguments: 'cols' and 'base'

## Ordinal Encoder
Used for ordinal data. Converts non-numerical data into an integer. Part of Category Encoders Libary 

In [48]:
#Since it should be used for ordinal data I will use it for Risk column
encoder= ce.OrdinalEncoder(data['Risk'])
encoder

OrdinalEncoder(verbose=0      High
1      high
2    medium
3      high
4       low
Name: Risk, dtype: object)

In [49]:
#The encoder worked, but the encoded data do not look like I would like to, so I should specify mapping for the encoder 
encoder.fit_transform(data['Risk'])

Unnamed: 0,Risk
0,1
1,2
2,3
3,2
4,4


In [46]:
#I can specify how the order should proceed by difining it in the encoder. Like that I can take care of the values like 'high' and 'High'
encoder= ce.OrdinalEncoder(data['Risk'],return_df=True, mapping=[{'col':'Risk', 'mapping':{'None':0,'High':3,'high':3,'low':1,'medium':2}}])

In [47]:
#Now I can fit and transform the data 
encoder.fit_transform(data['Risk'])

Unnamed: 0,Risk
0,3
1,3
2,2
3,3
4,1


## One Hot encoder
Used for nominal data - data does not to have any order> for each level of categorical feature a new variabel is created containing either 0 or 1


In [50]:
#As an example I will use One Hot encoder only on 'profession' column
# This method is coming from the Category Encoders libary 

In [51]:
encoder=ce.OneHotEncoder(data['Profession'])

In [52]:
encoder

OneHotEncoder(verbose=0    Self-employed
1         students
2           Horeca
3    self-employed
4          finance
Name: Profession, dtype: object)

In [53]:
#As there is a difference in lower and upper case letteres in the positions 0 and 3. The data should be cleaned before encoding, as this emthod is case sensitive
#method returns Dummy variables
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_1,Profession_2,Profession_3,Profession_4,Profession_5
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1


## Sum coding/Sum encoding
It is similar to Dummy encoding, but represents the data using 3 values -1,0 and 1. Value -1 appears where there is a position with only 0. Thanks to that there is one less column comapared to One hot encoder

In [54]:
#I will use the same column as in the One Hot encoder example, so it should be easier to compare
encoder = ce.SumEncoder()

In [55]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,intercept,Profession_0,Profession_1,Profession_2,Profession_3
0,1,1.0,0.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0
2,1,0.0,0.0,1.0,0.0
3,1,0.0,0.0,0.0,1.0
4,1,-1.0,-1.0,-1.0,-1.0


In [56]:
encoder.fit_transform(data)

Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Profession_3,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,1,231,1.0,0.0,0.0,0.0,9,High,2,30200,1
1,1,2,765,0.0,1.0,0.0,0.0,5,high,3,12700,4
2,1,3,453,0.0,0.0,1.0,0.0,7,medium,5,89400,0
3,1,4,231,0.0,0.0,0.0,1.0,9,high,2,30200,3
4,1,5,892,-1.0,-1.0,-1.0,-1.0,3,low,3,740000,2


## Hashing / Hash encoder 
Hashing is the transformation of arbitrary size input in the form of a fixed-size value. It is a one-way process. Hashing has several applications like data retrieval, checking data corruption, and in data encryption also.
  
  As a result of hushing I obtain similary as in case of dummy encoding a 2 dimension table with 0, 1 values. However, we can choose the number of columns in which data will be represenetd. Thus this allows to represent for example 6 values data with N elemnts.

In [57]:
encoder=ce.HashingEncoder()

In [58]:
#I will fit and transform firts the 'Profession' column
encoder.fit_transform(data['Profession'])

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0


## Binary Encoding
This is a combination of hashing and one hot encoding. First, the categoritacl data is transformed into numerical using an ordinary encoder, and then the numbers are transformed into binary numbers.

In [63]:
#I will try fristly to encode 'Profession' column
encoder= ce.BinaryEncoder(cols=['Profession'],return_df=True)

In [64]:
encoder.fit_transform(data)

Unnamed: 0,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,0,0,1,9,High,2,30200,1
1,2,765,0,1,0,5,high,3,12700,4
2,3,453,0,1,1,7,medium,5,89400,0
3,4,231,1,0,0,9,high,2,30200,3
4,5,892,1,0,1,3,low,3,740000,2


In [65]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_0,Profession_1,Profession_2
0,0,0,1
1,0,1,0
2,0,1,1
3,1,0,0
4,1,0,1


In [66]:
#I repeat procedure for Nank department, which was given as a string 
encoder= ce.BinaryEncoder(cols=['Bank_dep'],return_df=True)

In [67]:
encoder.fit_transform(data)

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,0,0,1,High,2,30200,1
1,2,765,students,0,1,0,high,3,12700,4
2,3,453,Horeca,0,1,1,medium,5,89400,0
3,4,231,self-employed,0,0,1,high,2,30200,3
4,5,892,finance,1,0,0,low,3,740000,2


## BaseN encoding 
It is similar to Binary Enocding, but we are able to difine the base. This allows to represent the same features with lower number of columns when compared to Binary encoding.

In [69]:
#For comparison with Binary Encoding I will use 'Profession' column
encoder= ce.BaseNEncoder(cols=['Profession'],return_df=True,base=5)

In [70]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession_0,Profession_1
0,0,1
1,0,2
2,0,3
3,0,4
4,1,0


## Target Encoder
It converts an array of categorical data into a mean value of a numerical data which is dependent from that categorical data. 
Returns and array of numbers, but requires a target value.

In [71]:
#In the Table provided revenue can be conected to the profession, so I will use this 2 columns to do target encoding for profession
encoder=ce.TargetEncoder(cols='Profession') 

In [86]:
#As a result I obtained the same value, as tehy might be not enough, I will try to do the same but with cleaned data for column 'Profession'
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


In [82]:
data2=[profession.lower() for profession in data['Profession']]
data2

['self-employed', 'students', 'horeca', 'self-employed', 'finance']

In [83]:
data2= pd.DataFrame(data2, columns=['Profession'])

In [84]:
data2

Unnamed: 0,Profession
0,self-employed
1,students
2,horeca
3,self-employed
4,finance


In [85]:
#Now this data was encoded again, but I obtained diffrent values, proving that this method is efficient but a larger number of data is needed in order to correctly encode categorical data. 
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,70621.895632
1,180500.0
2,180500.0
3,70621.895632
4,180500.0


## CatBoost Encoder
Similar to target encoder 

In [87]:
encoder = ce.CatBoostEncoder()

In [88]:
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


In [89]:
#Gives similar results as target enocder. 
#In order to see the differenc I run again the same code but for cleaned 'profession' --> data2
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,105350.0
4,180500.0


## Count Encoder
Counts how many timesa feature apears in the column and replace it by the number

In [90]:
encoder = ce.CountEncoder(cols=['Profession'])

In [91]:
encoder.fit_transform(data['Profession'])

Unnamed: 0,Profession
0,1
1,1
2,1
3,1
4,1


In [93]:
#method is case sensitive thus data needs to be processed (as an example I use data2)
encoder.fit_transform(data2['Profession'])

Unnamed: 0,Profession
0,2
1,1
2,1
3,2
4,1


## Generalized Linear Mixed Model Encoder
Similar to Target Encoder but uses more advanced statistical methods, thanks to which it is more precise

In [94]:
encoder = ce.GLMMEncoder()

In [97]:
encoder.fit_transform(data['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [96]:
#Method is case sensitive, data needs to be proceeded
encoder.fit_transform(data2['Profession'],data['Revenue'])

Unnamed: 0,Profession
0,-187875.010232
1,-205375.01045
2,-128675.010445
3,-187875.010232
4,521925.051956


## Helmert coding
With Helmert coding, each level of the variable is compared to "later" levels of the variable.
  Used for ordinal variables

In [98]:
encoder = ce.HelmertEncoder()

In [99]:
encoder.fit_transform(data['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1,Risk_2
0,1,-1.0,-1.0,-1.0
1,1,1.0,-1.0,-1.0
2,1,0.0,2.0,-1.0
3,1,1.0,-1.0,-1.0
4,1,0.0,0.0,3.0


In [106]:
#In order to check it how it works for processed data I will create new dataframe with clean Risk column. i.e. no diffrence between upercase, lowercase letters.
data3=[risk.lower() for risk in data['Risk']]
data3

['high', 'high', 'medium', 'high', 'low']

In [107]:
data3= pd.DataFrame(data3, columns=['Risk'])
data3

Unnamed: 0,Risk
0,high
1,high
2,medium
3,high
4,low


In [108]:
encoder.fit_transform(data3['Risk'])

Unnamed: 0,intercept,Risk_0,Risk_1
0,1,-1.0,-1.0
1,1,-1.0,-1.0
2,1,1.0,-1.0
3,1,-1.0,-1.0
4,1,0.0,2.0


## James-Stein Encoder
Similar to Target encoder

In [111]:
encoder = ce.JamesSteinEncoder()

In [112]:
encoder.fit_transform(data2['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,30200.0
1,12700.0
2,89400.0
3,30200.0
4,740000.0


## Leave One Out Encoder
This is very similar to target encoding but excludes the current row’s target when calculating the mean target for a level to reduce the effect of outliers.



In [113]:
encoder = ce.LeaveOneOutEncoder()

In [114]:
encoder.fit_transform(data2['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,30200.0
1,180500.0
2,180500.0
3,30200.0
4,180500.0


## M-estimate encoder
Similar to target encoder. Simplified version of it.

In [115]:
encoder = ce.MEstimateEncoder()

In [119]:
#data where Profession contains: 'Self-employed' and 'self-employed'
encoder.fit_transform(data['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,180500.0
1,180500.0
2,180500.0
3,180500.0
4,180500.0


In [118]:
#data where Profession contains: 'self-employed' and 'self-employed'
encoder.fit_transform(data2['Profession'], data['Revenue'])

Unnamed: 0,Profession
0,80300.0
1,96600.0
2,134950.0
3,80300.0
4,460250.0


## Polynominal Encodind
Similar to Backward Difference Encoder. 
  This type of coding system should be used only with an ordinal variable in which the levels are equally spaced. Examples of such a variable might be income or education

In [121]:
#I will encode Risk column for the lowercase data
encoder = ce.PolynomialEncoder(cols=["Risk"])

In [122]:
encoder.fit_transform(data3, verbose=1)

Unnamed: 0,intercept,Risk_0,Risk_1
0,1,-0.7071068,0.408248
1,1,-0.7071068,0.408248
2,1,-4.4337800000000005e-17,-0.816497
3,1,-0.7071068,0.408248
4,1,0.7071068,0.408248


## Weight of Evidence
WoE is a commonly used target-based encoder in credit scoring.


In [123]:
encoder = ce.WOEEncoder()

In [129]:
data

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels
0,1,231,Self-employed,9,High,2,30200,1
1,2,765,students,5,high,3,12700,4
2,3,453,Horeca,7,medium,5,89400,0
3,4,231,self-employed,9,high,2,30200,3
4,5,892,finance,3,low,3,740000,2


In [132]:
encoder.fit_transform(data2['Profession'], data['Profession_labels'])

ValueError: The target column y must be binary. But the target contains 5 unique value(s).

In [133]:
#I get an error as a terget variable must be binary. In this example I don't now what will be a binary target but I will add this column to data in order to encode it. 
data['Binary']=[0,1,0,0,1]

In [134]:
data

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue,Profession_labels,Binary
0,1,231,Self-employed,9,High,2,30200,1,0
1,2,765,students,5,high,3,12700,4,1
2,3,453,Horeca,7,medium,5,89400,0,0
3,4,231,self-employed,9,high,2,30200,3,0
4,5,892,finance,3,low,3,740000,2,1


In [135]:
encoder.fit_transform(data3['Risk'], data['Binary'])

Unnamed: 0,Risk
0,-0.182322
1,-0.182322
2,0.0
3,-0.182322
4,0.0


## Wrapers

In [150]:
from category_encoders import utils
encoder = ce.PolynomialWrapper()

AttributeError: module 'category_encoders' has no attribute 'PolynomialWrapper'

## Quantile Encoder
Similar to M-estimate so to target endcoding, but here selected features are replaced by the statistical quantile instead of the mean

In [3]:
encoder = ce.QuantileEncoder()

AttributeError: module 'category_encoders' has no attribute 'QuantileEncoder'

In [None]:
encoder.fit_transform(data['Profession'], data['Revenue'])

## Summary Encoder

In [1]:
from category_encoders import *
import pandas as pd
from sklearn.datasets import load_boston
#from sklearn.preprocessing import SummaryEncoder

In [7]:
SummaryEncoder(cols=['Profession', 'Revenue'])

NameError: name 'SummaryEncoder' is not defined

In [5]:
import category_encoders as ce


In [6]:
#I got the errors for Wrapers, Quantile Encoder and Summary Encoder, because these methods are not present in the category_encoders model. It is contarary to the documentation on the website https://contrib.scikit-learn.org/category_encoders/
dir(ce)

['BackwardDifferenceEncoder',
 'BaseNEncoder',
 'BinaryEncoder',
 'CatBoostEncoder',
 'CountEncoder',
 'GLMMEncoder',
 'HashingEncoder',
 'HelmertEncoder',
 'JamesSteinEncoder',
 'LeaveOneOutEncoder',
 'MEstimateEncoder',
 'OneHotEncoder',
 'OrdinalEncoder',
 'PolynomialEncoder',
 'SumEncoder',
 'TargetEncoder',
 'WOEEncoder',
 '__all__',
 '__author__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'backward_difference',
 'basen',
 'binary',
 'cat_boost',
 'count',
 'glmm',
 'hashing',
 'helmert',
 'james_stein',
 'leave_one_out',
 'm_estimate',
 'one_hot',
 'ordinal',
 'polynomial',
 'sum_coding',
 'target_encoder',
 'utils',
 'woe']