### Работа с категориальными признаками

In [1]:
import pandas as pd
import numpy as nm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Создаем датафрейм
city_list = ['Москва', 'Санкт-Петербург', 'Новосибирск',
             'Екатеринбург', 'Казань', 'Нижний Новгород']
popul_list = [12600000, 5400000, 1625000, 1500000, 1300000, 1250000]
index_list = [1,2,3,4,5,6]

In [3]:
df = pd.DataFrame(columns = ['city', 'population'], index = index_list)

In [4]:
df['city'] = city_list
df['population'] = popul_list
df.head()

Unnamed: 0,city,population
1,Москва,12600000
2,Санкт-Петербург,5400000
3,Новосибирск,1625000
4,Екатеринбург,1500000
5,Казань,1300000


### LabelEncoder

In [5]:
labelencoder = LabelEncoder()
df['city_label'] = labelencoder.fit_transform(df['city'])
df.head() # У каждого города появилс свой индекс

Unnamed: 0,city,population,city_label
1,Москва,12600000,2
2,Санкт-Петербург,5400000,5
3,Новосибирск,1625000,4
4,Екатеринбург,1500000,0
5,Казань,1300000,1


In [6]:
# Обратное преобразование
labelencoder.inverse_transform([2,4,1])

array(['Москва', 'Новосибирск', 'Казань'], dtype=object)

### One Hot Encoder

In [7]:
onehotencoder = OneHotEncoder()
df_ohe = pd.DataFrame(onehotencoder.fit_transform(df[['city_label']]).toarray())

In [8]:
df = df.join(df_ohe)
df.head() # Дополняются столбцы с дополнительными признаками

Unnamed: 0,city,population,city_label,0,1,2,3,4,5
1,Москва,12600000,2,0.0,0.0,0.0,0.0,0.0,1.0
2,Санкт-Петербург,5400000,5,0.0,0.0,0.0,0.0,1.0,0.0
3,Новосибирск,1625000,4,1.0,0.0,0.0,0.0,0.0,0.0
4,Екатеринбург,1500000,0,0.0,1.0,0.0,0.0,0.0,0.0
5,Казань,1300000,1,0.0,0.0,0.0,1.0,0.0,0.0


### Pandas get_dummies

In [9]:
# Передаем колонки для преобразования
dum_df = pd.get_dummies(df, columns = ['city_label'], prefix = ['get_D'])
dum_df.head() # Появились новые столбцы с признаками (см. get_D)

Unnamed: 0,city,population,0,1,2,3,4,5,get_D_0,get_D_1,get_D_2,get_D_3,get_D_4,get_D_5
1,Москва,12600000,0.0,0.0,0.0,0.0,0.0,1.0,0,0,1,0,0,0
2,Санкт-Петербург,5400000,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,1
3,Новосибирск,1625000,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0
4,Екатеринбург,1500000,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0
5,Казань,1300000,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0
