# Categorization (이산형화)

![데이터변환3_범주화](imgs/데이터변환3_범주화.png)

<hr>

## ■ 이산형화란 (Discretization)
- 이항변수화(binarization)가 '0'과 '1'의 값만을 가지는 가변수(dummy variable)를 만드는 것을 의미한다면, **이산형화(discretization)는 연속형 변수를 2개 이상의 범주(category)를 가지는 변수로 변환해주는 것을 말함**

- ex) 이산형화한 변수는 범주별 요약통계량 집계, 범주 간 평균 차이 비교, 독립성 검정, 분류형 목표변수(타겟변수), 인덱싱 등에 사용 가능

![이산형화](imgs/이산형화.jfif)


In [13]:
import numpy as np
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one (this is why each cell below prints several results).
InteractiveShell.ast_node_interactivity = "all"

# Fix the RNG seed so the sampled values are reproducible.
np.random.seed(10)

# Sample frame: 12 standard-normal draws in C1 and a two-level label
# column C2 (six 'a' rows followed by six 'b' rows).
df = pd.DataFrame(
    {
        'C1': np.random.randn(12),
        'C2': ['a'] * 6 + ['b'] * 6,
    }
)

df

Unnamed: 0,C1,C2
0,1.331587,a
1,0.715279,a
2,-1.5454,a
3,-0.008384,a
4,0.621336,a
5,-0.720086,a
6,0.265512,b
7,0.108549,b
8,0.004291,b
9,-0.1746,b


In [20]:
# 1. Discretize a continuous variable with np.digitize(data, bins)

# Four evenly spaced bin edges running from the minimum to the maximum of C1.
bins = np.linspace(df['C1'].min(), df['C1'].max(), num=4)
bins

# Label every value with the index of the bin it falls into.
# NOTE: the maximum of C1 equals the last edge, so it is labelled 4 and
# ends up as the only member of that bin.
df['C1_bin'] = np.digitize(df['C1'], bins=bins)
df

array([-1.54540029, -0.58640469,  0.37259091,  1.3315865 ])

Unnamed: 0,C1,C2,C1_bin
0,1.331587,a,4
1,0.715279,a,3
2,-1.5454,a,1
3,-0.008384,a,2
4,0.621336,a,3
5,-0.720086,a,1
6,0.265512,b,2
7,0.108549,b,2
8,0.004291,b,2
9,-0.1746,b,2


In [19]:
# Summary statistics per bin

# One grouping object is shared by all of the aggregations below.
by_bin = df.groupby('C1_bin')

# number of rows in each bin
by_bin['C1'].size()

# mean of C1 within each bin
by_bin['C1'].mean()

# standard deviation of C1 within each bin
by_bin['C1'].std()

# frequency of each C2 label within each bin
by_bin['C2'].value_counts()

# indexing: keep only the rows that fall in bin 2
df_bin2 = df.loc[df['C1_bin'] == 2]
df_bin2

C1_bin
1    2
2    5
3    4
4    1
Name: C1, dtype: int64

C1_bin
1   -1.132743
2    0.039073
3    0.743170
4    1.331587
Name: C1, dtype: float64

C1_bin
1    0.583586
2    0.162188
3    0.328273
4         NaN
Name: C1, dtype: float64

C1_bin  C2
1       a     2
2       b     4
        a     1
3       a     2
        b     2
4       a     1
Name: C2, dtype: int64

Unnamed: 0,C1,C2,C1_bin
3,-0.008384,a,2
6,0.265512,b,2
7,0.108549,b,2
8,0.004291,b,2
9,-0.1746,b,2


In [21]:
# (2) Build dummy (indicator) variables with pd.get_dummies()
df

# One indicator column per bin label, named C1_<label>.
# dtype=int keeps the indicators as 0/1; pandas 2.0 and later default to
# bool (True/False) when no dtype is given.
pd.get_dummies(df['C1_bin'], prefix='C1', dtype=int)

Unnamed: 0,C1,C2,C1_bin
0,1.331587,a,4
1,0.715279,a,3
2,-1.5454,a,1
3,-0.008384,a,2
4,0.621336,a,3
5,-0.720086,a,1
6,0.265512,b,2
7,0.108549,b,2
8,0.004291,b,2
9,-0.1746,b,2


Unnamed: 0,C1_1,C1_2,C1_3,C1_4
0,0,0,0,1
1,0,0,1,0
2,1,0,0,0
3,0,1,0,0
4,0,0,1,0
5,1,0,0,0
6,0,1,0,0
7,0,1,0,0
8,0,1,0,0
9,0,1,0,0


In [22]:
# (3) Discretize a continuous variable with np.where(condition, x, y)
# Rows at or above the mean of C1 are labelled 'high', all others 'low'.
df['high_low'] = np.where(df['C1'].ge(df['C1'].mean()), 'high', 'low')
df

Unnamed: 0,C1,C2,C1_bin,high_low
0,1.331587,a,4,high
1,0.715279,a,3,high
2,-1.5454,a,1,low
3,-0.008384,a,2,low
4,0.621336,a,3,high
5,-0.720086,a,1,low
6,0.265512,b,2,high
7,0.108549,b,2,low
8,0.004291,b,2,low
9,-0.1746,b,2,low


In [25]:
# A slightly more involved example: three levels split at the quartiles.
Q1, Q3 = np.percentile(df['C1'], [25, 75])

# Conditions are tested in order and the first match wins:
#   C1 >= Q3 -> '01_high', else C1 >= Q1 -> '02_medium', else '03_low'.
df['h_m_l'] = np.select(
    [df['C1'] >= Q3, df['C1'] >= Q1],
    ['01_high', '02_medium'],
    default='03_low',
)

df

Unnamed: 0,C1,C2,C1_bin,high_low,h_m_l
0,1.331587,a,4,high,01_high
1,0.715279,a,3,high,01_high
2,-1.5454,a,1,low,03_low
3,-0.008384,a,2,low,02_medium
4,0.621336,a,3,high,02_medium
5,-0.720086,a,1,low,03_low
6,0.265512,b,2,high,02_medium
7,0.108549,b,2,low,02_medium
8,0.004291,b,2,low,02_medium
9,-0.1746,b,2,low,03_low


### 출처
- https://rfriend.tistory.com/273