In [26]:
import category_encoders as ce
import pandas as pd

In [27]:
# Toy wardrobe data: position i of the two lists below describes row i
# as a [size, garment type] pair.
size_labels = ['xxs', 'xxs', 'xs', 's', 'm', 'l', 's', 'm', 'xxl', 'l']
garment_types = ['dress', 'skirt', 'dress', 'skirt', 'dress',
                 'shirt', 'coat', 'coat', 'shirt', 'dress']
clothing_list = [[size, garment] for size, garment in zip(size_labels, garment_types)]

# Two string columns: `size` and `type`.
clothing = pd.DataFrame(data=clothing_list, columns=['size', 'type'])
clothing

Unnamed: 0,size,type
0,xxs,dress
1,xxs,skirt
2,xs,dress
3,s,skirt
4,m,dress
5,l,shirt
6,s,coat
7,m,coat
8,xxl,shirt
9,l,dress


# Ordinal encoding

In [28]:
# Ordinal encoding of `size`.
# The real order of the sizes is spelled out explicitly. Without a mapping the
# encoder has no knowledge of that order (the library docs describe the integers
# it picks as arbitrary); the codes 1..6 would line up with xxs < ... < xxl only
# because the sizes happen to first appear in that order in this data.
size_order = ['xxs', 'xs', 's', 'm', 'l', 'xxl']
size_mapping = {label: rank for rank, label in enumerate(size_order, start=1)}
ord_encoder = ce.OrdinalEncoder(mapping=[{'col': 'size', 'mapping': size_mapping}])
data_bin = ord_encoder.fit_transform(clothing['size'])  # one integer column, also named 'size'
# NOTE: the encoded column keeps the label 'size', so after the concat the frame
# carries two columns with that name (shown as `size` and `size.1` in the output).
clothing = pd.concat([clothing, data_bin], axis=1)
clothing

Unnamed: 0,size,type,size.1
0,xxs,dress,1
1,xxs,skirt,1
2,xs,dress,2
3,s,skirt,3
4,m,dress,4
5,l,shirt,5
6,s,coat,3
7,m,coat,4
8,xxl,shirt,6
9,l,dress,5


# One-hot encoding

In [29]:
# One-hot encoding of `type`: one 0/1 indicator column per garment type.
# use_cat_names=True names each indicator after its category
# (type_dress, type_skirt, ...). The column to encode is not configured on the
# encoder; it is chosen by passing only clothing['type'] to fit_transform.
oh_encoder = ce.OneHotEncoder(use_cat_names=True)
type_bin = oh_encoder.fit_transform(clothing['type'])
clothing = pd.concat(objs=[clothing, type_bin], axis='columns')

# Pandas-only alternative, kept for reference:
# pd.get_dummies(clothing, columns=['type'])
clothing

Unnamed: 0,size,type,size.1,type_dress,type_skirt,type_shirt,type_coat
0,xxs,dress,1,1,0,0,0
1,xxs,skirt,1,0,1,0,0
2,xs,dress,2,1,0,0,0
3,s,skirt,3,0,1,0,0
4,m,dress,4,1,0,0,0
5,l,shirt,5,0,0,1,0
6,s,coat,3,0,0,0,1
7,m,coat,4,0,0,0,1
8,xxl,shirt,6,0,0,1,0
9,l,dress,5,1,0,0,0


# Binary encoding

In [34]:
# Binary encoding of `type`: each category is written out as a bit pattern
# spread over the columns type_0, type_1, type_2 (see the output below).
bin_encoder = ce.BinaryEncoder()
type_bin = bin_encoder.fit_transform(clothing['type'])

# Keep only the `size` and `type` labels from the current frame (dropping the
# earlier indicator columns) and attach the bit columns next to them.
kept_columns = clothing[['size', 'type']]
clothing = pd.concat(objs=[kept_columns, type_bin], axis='columns')
clothing

Unnamed: 0,size,size.1,type,type_0,type_1,type_2
0,xxs,1,dress,0,0,1
1,xxs,1,skirt,0,1,0
2,xs,2,dress,0,0,1
3,s,3,skirt,0,1,0
4,m,4,dress,0,0,1
5,l,5,shirt,0,1,1
6,s,3,coat,1,0,0
7,m,4,coat,1,0,0
8,xxl,6,shirt,0,1,1
9,l,5,dress,0,0,1


In [31]:
# 5.9
# Sample purchases: one record per product with its price and payment type.
field_names = ('product', 'price', 'payment_type')
purchase_rows = [
    ('Product1', 1200, 'Mastercard'),
    ('Product2', 3600, 'Visa'),
    ('Product3', 7500, 'Amex'),
]
# Pair every row with the field names to get one dict per purchase.
list_of_dicts = [dict(zip(field_names, row)) for row in purchase_rows]
df = pd.DataFrame(data=list_of_dicts)
df

Unnamed: 0,product,price,payment_type
0,Product1,1200,Mastercard
1,Product2,3600,Visa
2,Product3,7500,Amex


In [32]:
# product - a nominal feature with many categories (the number of products can keep growing), so Binary encoding is the better fit
# price - an integer; it does not need encoding
# payment_type - a nominal feature with few categories, so One-Hot encoding is the better fit

In [36]:
# Encode the purchase columns with dedicated encoder instances.
# Calling fit_transform on the shared `bin_encoder` / `oh_encoder` would refit
# them and discard what they learned from clothing['type'], so those encoders
# could no longer transform the clothing data consistently after this cell runs.
product_encoder = ce.BinaryEncoder()
payment_encoder = ce.OneHotEncoder(use_cat_names=True)

product_encoded = product_encoder.fit_transform(df['product'])          # bit columns product_0, product_1
payment_encoded = payment_encoder.fit_transform(df['payment_type'])     # one indicator column per payment type
display(product_encoded, payment_encoded)

Unnamed: 0,product_0,product_1
0,0,1
1,1,0
2,1,1


Unnamed: 0,payment_type_Mastercard,payment_type_Visa,payment_type_Amex
0,1,0,0
1,0,1,0
2,0,0,1
