# 범주형 데이터 처리

범주형 데이터 : 분류를 나타내는 값들

In [15]:
import numpy as np
import pandas as pd

## 1. 데이터

In [16]:
# Toy edge list: one row per edge (source, target, weight, color).
# "color" is the only string-valued (categorical) column.
edges = pd.DataFrame(
    [
        (0, 2, 3, "red"),
        (1, 2, 4, "blue"),
        (2, 3, 5, "blue"),
    ],
    columns=["source", "target", "weight", "color"],
)

# Render the frame as a table. `display` is not imported in this file; it is
# expected to come from the IPython/Jupyter session running the notebook.
display(edges)

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


## 2. 데이터 형식 확인

In [17]:
# Print the dtype of every column in the frame.
print(edges.dtypes)
# The color column holds strings, so its dtype is object.

source     int64
target     int64
weight     int64
color     object
dtype: object


## 3. One-Hot Encoding

### 3-1. get_dummies()

DataFrame을 통째로 넘기면 object / string / category 타입인 컬럼을 모두 One-Hot Encoding하고, 나머지 (숫자형) 컬럼은 그대로 둠.  
(DataFrame 반환)

In [18]:
# Whole DataFrame as input: the integer columns pass through unchanged and the
# string column "color" is expanded into color_blue / color_red.
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,False,True
1,1,2,4,True,False
2,2,3,5,True,False


In [19]:
# Single Series as input: the dummy columns are named after the category
# values alone (blue, red), with no "color_" prefix.
pd.get_dummies(edges["color"])

Unnamed: 0,blue,red
0,False,True
1,True,False
2,True,False


In [20]:
# One-column DataFrame as input (note the double brackets): the dummy columns
# keep the source column name as a prefix (color_blue, color_red).
pd.get_dummies(edges[["color"]])

Unnamed: 0,color_blue,color_red
0,False,True
1,True,False
2,True,False


## 4. weight 컬럼 One-Hot Encoding

In [21]:
# Lookup table that turns each numeric weight into a size label.
weight_dict = {
    3: "M",
    4: "L",
    5: "XL",
}

# Store the label for every row in a new "weight_sign" column of `edges`.
# Per the pandas docs, a weight missing from weight_dict would map to NaN.
edges["weight_sign"] = edges["weight"].map(weight_dict)

# One-hot encode the labels: one indicator column per distinct label.
weight_sign = pd.get_dummies(edges["weight_sign"])
display(weight_sign)

Unnamed: 0,L,M,XL
0,False,True,False
1,True,False,False
2,False,False,True


In [22]:
# Place the dummy columns to the right of the existing columns (axis=1).
# The result is only shown as the cell output; it is not assigned, so `edges`
# is left as it was.
pd.concat([edges, weight_sign], axis=1)

Unnamed: 0,source,target,weight,color,weight_sign,L,M,XL
0,0,2,3,red,M,False,True,False
1,1,2,4,blue,L,True,False,False
2,2,3,5,blue,XL,False,False,True
