# 데이터 인코딩

## 레이블 인코딩 (Label encoding)
- sklearn
- pandas

In [2]:
## sklearn 
from sklearn.preprocessing import LabelEncoder

In [6]:
# Sample categorical values (some categories repeat)
items = ['사자', '호랑이', '판다', '강아지', '펭귄', '펭귄', '펭귄', '판다']

# Create the LabelEncoder and learn the category -> integer mapping;
# fit() returns the fitted encoder, so construction and fitting are chained.
encoder = LabelEncoder().fit(items)

# Convert every string into its integer code
labels = encoder.transform(items)

labels

array([1, 4, 2, 0, 3, 3, 3, 2])

In [7]:
# Classes learned by the encoder; each entry's position is its integer code
encoder.classes_

array(['강아지', '사자', '판다', '펭귄', '호랑이'], dtype='<U3')

In [12]:
## Decode: integer codes -> original strings
encoder.inverse_transform(labels)

array(['사자', '호랑이', '판다', '강아지', '펭귄', '펭귄', '펭귄', '판다'], dtype='<U3')

# 원핫 인코딩 (one-hot encoding)

- sklearn
- pandas

## sklearn

In [21]:
# Show the current contents of items (still the plain Python list here)
items

['사자', '호랑이', '판다', '강아지', '펭귄', '펭귄', '펭귄', '판다']

In [28]:
import numpy as np

# Raw category values as a flat Python list
items = ['사자', '호랑이', '판다', '강아지', '펭귄', '펭귄', '펭귄', '판다']

# Convert the 1-D sequence into a single-column 2-D array, shape (8, 1)
items = np.asarray(items)[:, np.newaxis]
items

array([['사자'],
       ['호랑이'],
       ['판다'],
       ['강아지'],
       ['펭귄'],
       ['펭귄'],
       ['펭귄'],
       ['판다']], dtype='<U3')

In [32]:
# Create the one-hot encoder object (default constructor arguments)
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder()

In [38]:
## fit: learn the categories from the 2-D (n_samples, 1) items array
oh_encoder.fit(items)

In [37]:
## transform
# The result is a sparse matrix:
# a matrix whose values are mostly 0.
# The opposite is a dense matrix.
oh_labels = oh_encoder.transform(items)
oh_labels

<8x5 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [39]:
# One-hot encoding result: dimensions (rows = samples, columns = categories)
oh_labels.shape

(8, 5)

In [41]:
# One-hot encoding result: Python type of the returned object
type(oh_labels)

scipy.sparse._csr.csr_matrix

In [42]:
import pandas as pd

In [46]:
# Single-column DataFrame ('items') holding the same animal names,
# used as input for the pandas one-hot encoding below.
df = pd.DataFrame(
    ['사자', '호랑이', '판다', '강아지', '펭귄', '펭귄', '펭귄', '판다'],
    columns=['items'],
)


Unnamed: 0,0,1,2,3,4,5,6,7
items,사자,호랑이,판다,강아지,펭귄,펭귄,펭귄,판다


In [48]:
# One-hot encode with pandas: every category becomes its own items_<category> column
pd.get_dummies(df)

Unnamed: 0,items_강아지,items_사자,items_판다,items_펭귄,items_호랑이
0,False,True,False,False,False
1,False,False,False,False,True
2,False,False,True,False,False
3,True,False,False,False,False
4,False,False,False,True,False
5,False,False,False,True,False
6,False,False,False,True,False
7,False,False,True,False,False


# 피처 스케일링과 정규화
- StandardScaler: 평균이 0이고 분산이 1인 정규 분포 형태로 변환
- MinMaxScaler: 데이터 값을 0~1 범위 값으로 변환

In [59]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset bundled with scikit-learn
iris = load_iris()

# Keep the feature matrix under its own name
iris_data = iris.data

# Wrap the features in a DataFrame labelled with the dataset's feature names
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Preview the first three rows
iris_df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [55]:
# Per-feature (column-wise) mean of the unscaled data
iris_df.mean()

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

In [56]:
# Per-feature variance of the unscaled data (pandas defaults to ddof=1)
iris_df.var()

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64

In [64]:
from sklearn.preprocessing import StandardScaler

# Build the scaler and learn each column's mean and standard deviation.
# fit() returns the fitted scaler, so both steps are chained.
scaler = StandardScaler().fit(iris_df)

# Standardize the data with the learned statistics
iris_scaler = scaler.transform(iris_df)


In [65]:
# Put the standardized values back into a DataFrame with the original feature names
iris_scaler_df = pd.DataFrame(data= iris_scaler, columns=iris.feature_names)

In [67]:
# Means after standardization: effectively 0 (the ~1e-15 values are floating-point round-off)
iris_scaler_df.mean()

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

In [68]:
# Variances after standardization print as 1.006711, not exactly 1:
# pandas .var() uses the sample variance (ddof=1) by default, while
# StandardScaler scales by the population standard deviation (ddof=0),
# so the result is n/(n-1), consistent with 150/149 for the iris data.
iris_scaler_df.var()

sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64

## MinMaxScaler
- default : 최솟값 0 , 최대값 1

In [69]:
from sklearn.preprocessing import MinMaxScaler

## Create the MinMaxScaler object (default range: minimum 0, maximum 1)
scaler = MinMaxScaler()

# [Method 1] fit and transform as two separate steps
## fit: learn each column's minimum and maximum
scaler.fit(iris_df)

## transform: rescale every column with the learned minimum/maximum
iris_scaler_min_max = scaler.transform(iris_df)

# [Method 2] fit and transform in a single call.
# fit_transform(X) fits on X and then transforms X, so on the same data it
# produces the same array as Method 1. Only call it on the data the scaler
# is supposed to learn from.
iris_scaler_min_max = scaler.fit_transform(iris_df)

iris_scaler_min_max

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [71]:
# Put the min-max scaled values into a DataFrame with the original feature names and display it
iris_scaler_min_max_df = pd.DataFrame(data= iris_scaler_min_max, columns=iris.feature_names)
iris_scaler_min_max_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [72]:
# Per-feature mean after min-max scaling
iris_scaler_min_max_df.mean()

sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
dtype: float64

In [73]:
# Caveats when applying fit(), transform() and fit_transform() with a scaler

In [75]:
# Training data: the integers 0..10 as a single column, shape (11, 1)
train_data = np.arange(11)[:, np.newaxis]
# Test data: the integers 0..5 as a single column, shape (6, 1)
test_data = np.arange(6)[:, np.newaxis]

In [87]:
# MinMaxScaler
# default: minimum 0, maximum 1

# Create the scaler and fit it on the training data only, so the learned
# minimum/maximum come from train_data (0 and 10).
scaler = MinMaxScaler().fit(train_data)

# Scale the training data with the learned range
train_scale = scaler.transform(train_data)

# Scale the test data with the SAME learned range (no re-fitting)
test_scale = scaler.transform(test_data)

test_scale.reshape(-1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5])

In [77]:
# Scaled training values flattened to 1-D for display
train_scale.reshape(-1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [83]:
## Scaling the test data
# The test data end up on a different scale. When scaling test data, fit and transform must not be done in one go.
# NOTE(review): this cell shows what goes wrong when the scaler is fitted on
# test_data itself: the values 0..5 are stretched over the whole 0~1 range.


#[Method 1]
scaler.fit(test_data)  # re-fits the scaler on test_data
test_scale = scaler.transform(test_data)

#[Method 2]
# test_scale = scaler.fit_transform(train_data) # this is the wrong form.
# NOTE(review): the commented-out call passes train_data; presumably test_data was meant - confirm.

test_scale.reshape(-1)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [81]:
# Original (unscaled) test values flattened to 1-D for comparison
test_data.reshape(-1)

array([0, 1, 2, 3, 4, 5])

In [84]:
## fit 학습 데이터
scaler.fit(train_data) #데이터 파악해용

## transform 테스트 데이터
test_scale = scaler.fit_transform(test_data)

In [86]:
test_scale.reshape(-1)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])