# 데이터 인코딩

* 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

# Sample product categories; the list deliberately contains repeated entries.
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Create a LabelEncoder, then learn the category-to-integer mapping and encode
# the list in one call (same result as fit() followed by transform()).
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print(f"인코딩 변환값: {labels}")

인코딩 변환값: [0 1 4 5 3 3 2 2]


* 원-핫 인코딩(One-hot Encoding)

In [4]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Convert the list to a single-column 2-D ndarray before encoding.
# reshape(-1, 1) lets NumPy infer the row count, so this keeps working when
# the number of items changes (the row count used to be hard-coded as 8).
items = np.array(items).reshape(-1, 1)

# Fit the encoder on the categories and apply one-hot encoding.
oh_encoder = OneHotEncoder()
oh_encoder.fit(items)
oh_labels = oh_encoder.transform(items)

# The OneHotEncoder result is a sparse matrix, so the first print shows
# (row, column) value entries; toarray() converts it to a dense matrix.
print(oh_labels)
print(oh_labels.toarray())
print(oh_labels.shape)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 4)	1.0
  (3, 5)	1.0
  (4, 3)	1.0
  (5, 3)	1.0
  (6, 2)	1.0
  (7, 2)	1.0
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
(8, 6)


In [5]:
import pandas as pd 

# Single-column DataFrame ('item') holding the same product categories.
df = pd.Series(
    ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'],
    name='item',
).to_frame()
df

Unnamed: 0,item
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [6]:
# One-hot encode df with pandas; the result has one indicator column per
# distinct value of 'item', named item_<value>.
pd.get_dummies(data=df)

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자렌지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


### 피처 스케일링과 정규화

* StandardScaler

In [6]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are the dataset's feature names.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Report the per-feature mean and variance of the unscaled data.
print("feature 들의 평균 값", iris_df.mean(), sep="\n")
print("\nfeature 들의 분산 값", iris_df.var(), sep="\n")

feature 들의 평균 값
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

feature 들의 분산 값
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [13]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler.
scaler = StandardScaler()

# Scale the data set with StandardScaler. fit_transform() does the work of
# fit() followed by transform() in one call.
# NOTE: the result must be read back under the same name it was assigned to.
# This cell previously assigned `iris_scaled` but built the DataFrame from the
# misspelled `iris_scalled`, which raises NameError on a fresh kernel or
# silently reuses a stale array produced by another cell.
iris_scaled = scaler.fit_transform(iris_df)
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
print("feature 들의 평균 값")
print(iris_df_scaled.mean())
print("\nfeature 들의 분산 값")
print(iris_df_scaled.var())

# Visual separator before the scaled DataFrame is displayed.
print()
print("===================================")
print("===================================")
print()
iris_df_scaled

feature 들의 평균 값
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

feature 들의 분산 값
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64




Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


* MinMaxScaler

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Scale the data set with MinMaxScaler. A single fit_transform() call replaces
# the previous fit() + transform() pair followed by a redundant second
# fit_transform(); the data was being scaled twice and the result stored under
# two different spellings (`iris_scalled` / `iris_scaled`).
iris_scaled = scaler.fit_transform(iris_df)

# The scaled data comes back as a NumPy ndarray, so convert it to a DataFrame.
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
print("feature 들의 평균 값")
print(iris_df_scaled.mean())
print("\nfeature 들의 분산 값")
print(iris_df_scaled.var())
iris_df_scaled

feature 들의 평균 값
sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
dtype: float64

feature 들의 분산 값
sepal length (cm)    0.052908
sepal width (cm)     0.032983
petal length (cm)    0.089522
petal width (cm)     0.100869
dtype: float64


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


* Scaler를 이용하여 학습 데이터와 테스트 데이터에 fit(), transform(), fit_transform() 적용 시 유의사항.

In [24]:
import numpy as np

# Training data runs from 0 to 10, test data from 0 to 5.
# Each range is turned into a single-column 2-D array because the scaler's
# fit()/transform() need 2-D input.
train_array = np.arange(11)[:, np.newaxis]
test_array = np.arange(6)[:, np.newaxis]

In [18]:
# Fitting on train_array makes the scaler use min 0 and max 10 as its range;
# fit() returns the scaler itself, so transform() can be chained onto it.
train_scaled = scaler.fit(train_array).transform(train_array)

# Show the original and scaled training values as flat arrays.
print(f"원본 train_array 데이터: {np.round(train_array.reshape(-1), 2)}")
print(f"Scale된 train_array 데이터: {np.round(train_scaled.reshape(-1), 2)}")

원본 train_array 데이터: [ 0  1  2  3  4  5  6  7  8  9 10]
Scale된 train_array 데이터: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [26]:
# Demonstration of what NOT to do: re-fitting the scaler on test_array resets
# its range to min 0 / max 5, replacing the range learned from train_array.
scaler.fit(test_array)
test_scaled = scaler.transform(test_array)

# These lines print test_array, so they are labelled test_array (the labels
# used to say train_array, which mislabelled the output).
print(f"원본 test_array 데이터: {np.round(test_array.reshape(-1), 2)}")
print(f"Scale된 test_array 데이터: {np.round(test_scaled.reshape(-1), 2)}")

# Because of the extra fit(), the train and test data are no longer scaled
# on the same basis, so their scaled values do not line up.

원본 train_array 데이터: [0 1 2 3 4 5]
Scale된 train_array 데이터: [0.  0.2 0.4 0.6 0.8 1. ]


In [28]:
# Correct pattern: fit the scaler on the training data only, then reuse that
# same fitted scaler (transform() only, no second fit) for the test data.
train_scaled = scaler.fit_transform(train_array)
test_scaled = scaler.transform(test_array)

# Training data: original values and their scaled counterparts.
print(f"원본 train_array 데이터: {np.round(train_array.reshape(-1), 2)}")
print(f"Scale된 train_array 데이터: {np.round(train_scaled.reshape(-1), 2)}")

# Test data: scaled with the range learned from the training data.
print(f"원본 test_array 데이터: {np.round(test_array.reshape(-1), 2)}")
print(f"Scale된 test_array 데이터: {np.round(test_scaled.reshape(-1), 2)}")

원본 train_array 데이터: [ 0  1  2  3  4  5  6  7  8  9 10]
Scale된 train_array 데이터: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
원본 test_array 데이터: [0 1 2 3 4 5]
Scale된 test_array 데이터: [0.  0.1 0.2 0.3 0.4 0.5]
