In [1]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [2]:
drink = pd.read_csv('./drinks.csv')

In [3]:
drink

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA
189,Vietnam,111,2,1,2.0,AS
190,Yemen,6,0,0,0.1,AS
191,Zambia,32,19,4,2.5,AF


In [4]:
# 1. beer_servings를 3개의 group으로 만들기
# 2. 1번의 결과를 beer_cat 변수로 추가
# 3. LabelEncoding 2가지 방식으로
# 4. 원핫인코딩 df 전체
# 5. 원핫인코딩 특정 컬럼(1개 or 그 이상) 선택해서

In [5]:
drink['beer_servings'].describe()

count    193.000000
mean     106.160622
std      101.143103
min        0.000000
25%       20.000000
50%       76.000000
75%      188.000000
max      376.000000
Name: beer_servings, dtype: float64

In [6]:
mean = drink['beer_servings'].mean()
mean

106.16062176165804

In [7]:
drink['beer_cat'] = pd.cut(drink['beer_servings'],
                                bins=[0,20,188,376],
                                include_lowest=True,
                                labels=['low', 'middle', 'high']
                               )

In [8]:
drink

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,beer_cat
0,Afghanistan,0,0,0,0.0,AS,low
1,Albania,89,132,54,4.9,EU,middle
2,Algeria,25,0,14,0.7,AF,middle
3,Andorra,245,138,312,12.4,EU,high
4,Angola,217,57,45,5.9,AF,high
...,...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA,high
189,Vietnam,111,2,1,2.0,AS,middle
190,Yemen,6,0,0,0.1,AS,low
191,Zambia,32,19,4,2.5,AF,middle


In [9]:
# 3. LabelEncoding 2가지 방식으로

In [10]:
# 3-1. fit --> transform

In [11]:
data = pd.Series(drink['beer_cat'])
data

0         low
1      middle
2      middle
3        high
4        high
        ...  
188      high
189    middle
190       low
191    middle
192    middle
Name: beer_cat, Length: 193, dtype: category
Categories (3, object): ['low' < 'middle' < 'high']

In [12]:
encoder = LabelEncoder()

In [13]:
encoder.fit(data)

LabelEncoder()

In [14]:
encoder.classes_

array(['high', 'low', 'middle'], dtype=object)

In [15]:
target = encoder.transform(data)
target[:5]

array([1, 2, 2, 0, 0])

In [16]:
# 3-2. fit_transform

In [17]:
df = pd.DataFrame({
    'beer_cat' : drink['beer_cat']
})
df

Unnamed: 0,beer_cat
0,low
1,middle
2,middle
3,high
4,high
...,...
188,high
189,middle
190,low
191,middle


In [18]:
encoder2 = LabelEncoder()

In [19]:
col = df.columns
col

Index(['beer_cat'], dtype='object')

In [20]:
for c in col:
    df[c] = encoder2.fit_transform(df[c])
df

Unnamed: 0,beer_cat
0,1
1,2
2,2
3,0
4,0
...,...
188,0
189,2
190,1
191,2


In [21]:
# 4. 원핫인코딩 df 전체

In [22]:
pd.get_dummies(drink.loc[:,'beer_servings':'beer_cat'])

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent_AF,continent_AS,continent_EU,continent_OC,continent_SA,beer_cat_low,beer_cat_middle,beer_cat_high
0,0,0,0,0.0,0,1,0,0,0,1,0,0
1,89,132,54,4.9,0,0,1,0,0,0,1,0
2,25,0,14,0.7,1,0,0,0,0,0,1,0
3,245,138,312,12.4,0,0,1,0,0,0,0,1
4,217,57,45,5.9,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
188,333,100,3,7.7,0,0,0,0,1,0,0,1
189,111,2,1,2.0,0,1,0,0,0,0,1,0
190,6,0,0,0.1,0,1,0,0,0,1,0,0
191,32,19,4,2.5,1,0,0,0,0,0,1,0


In [23]:
# 5. 원핫인코딩 특정 컬럼(1개 or 그 이상) 선택해서

In [24]:
pd.get_dummies(drink['continent'])

Unnamed: 0,AF,AS,EU,OC,SA
0,0,1,0,0,0
1,0,0,1,0,0
2,1,0,0,0,0
3,0,0,1,0,0
4,1,0,0,0,0
...,...,...,...,...,...
188,0,0,0,0,1
189,0,1,0,0,0
190,0,1,0,0,0
191,1,0,0,0,0
