## 데이터 전처리
1. 레이블 인코딩: 카테고리 피처를 코드형 숫자로 변환하는 것
2. 원-핫 인코딩: 해당하는 컬럼에만 1을 표시하고 나머지는 0으로 표기

### 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [3]:
# Sample product names used for the label-encoding demo (duplicates are intentional)
items = [
    'TV',
    '냉장고',
    '전자레인지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [4]:
# Create the LabelEncoder instance first
encoder=LabelEncoder()

In [5]:
# Perform label encoding with fit() and transform(); this cell runs the fit() half
encoder.fit(items)

LabelEncoder()

In [6]:
# Map each item name to its integer label using the fitted encoder
labels=encoder.transform(items)

In [7]:
# Print the encoded integer labels
print("인코딩 변환값",labels)

인코딩 변환값 [0 1 4 5 3 3 2 2]


### One-Hot Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
import numpy as np

In [10]:
# Same sample product names, redefined for the one-hot encoding demo
items = [
    'TV',
    '냉장고',
    '전자레인지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [11]:
# Convert the strings to numeric codes with LabelEncoder before one-hot encoding
# (fit and transform are done in a single call)
encoder = LabelEncoder()
labels = encoder.fit_transform(items)

In [12]:
# The data has to be converted to two dimensions, so reshape() it into a single column
labels=labels.reshape(-1,1)

In [13]:
# Apply one-hot encoding: create the encoder
oh=OneHotEncoder()

In [14]:
# Fit the one-hot encoder on the column of integer labels
oh.fit(labels)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [15]:
# One-hot encode the integer labels with the fitted encoder
oh_labels=oh.transform(labels)

In [16]:
# Convert the encoded data to an array with toarray() and print it
print(oh_labels.toarray())

[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [17]:
# Print the shape of the one-hot encoded result
print(oh_labels.shape)

(8, 6)


### get_dummies()

In [18]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [20]:
# Single-column DataFrame holding the sample product names for get_dummies()
df = pd.DataFrame(
    data={
        'items': [
            'TV',
            '냉장고',
            '전자레인지',
            '컴퓨터',
            '선풍기',
            '선풍기',
            '믹서',
            '믹서',
        ],
    },
)

In [21]:
# Display the DataFrame
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자레인지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [22]:
# One-hot encode the string column directly with pandas (no integer-encoding step beforehand)
pd.get_dummies(df)

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자레인지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


### 피처 스케일링과 정규화

#### 1. StandardScaler: 개별 피처를 평균이 0이고 분산이 1인 값으로 변환

In [23]:
from sklearn.datasets import load_iris
import pandas as pd

In [24]:
# Load the iris dataset via sklearn.datasets
iris=load_iris()

In [25]:
# Wrap the feature data in a DataFrame, using the dataset's feature names as column labels
iris_df=pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [28]:
# Preview the first rows of the unscaled features
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [29]:
# Per-feature mean before scaling
print(iris_df.mean())

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64


In [30]:
# Per-feature variance before scaling
print(iris_df.var())

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
# Create the StandardScaler instance
scaler=StandardScaler()

In [33]:
# Fit the scaler on the iris features and transform them in a single call;
# the scaled values are wrapped back into a DataFrame in the following cell
iris_scaled = scaler.fit_transform(iris_df)

In [34]:
# Rebuild a DataFrame from the scaled values, reusing the original feature names as columns
iris_scaled=pd.DataFrame(data=iris_scaled,columns=iris.feature_names)

In [36]:
# Preview the first rows of the standardized features
iris_scaled.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [37]:
# Per-feature mean after scaling; the printed values are on the order of 1e-15, i.e. effectively zero
print(iris_scaled.mean())

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64


In [38]:
# Per-feature variance after scaling.
# NOTE(review): pandas var() defaults to the sample variance (ddof=1), which is presumably
# why the printed values are 1.006711 rather than exactly 1 — confirm with var(ddof=0).
print(iris_scaled.var())

sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


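#### 2. MinMaxScaler: 데이터 값을 0과 1 사이의 값으로 변환 (기본 feature_range=(0, 1); 음수 값이 있어도 결과는 0~1 범위이며, -1에서 1 사이의 값이 필요하면 feature_range=(-1, 1)을 지정하거나 MaxAbsScaler를 사용)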

In [39]:
from sklearn.preprocessing import MinMaxScaler

In [40]:
# Create a MinMaxScaler (this rebinds the name `scaler` previously used for the StandardScaler)
scaler=MinMaxScaler()

In [41]:
# Fit the MinMaxScaler on the unscaled iris features
scaler.fit(iris_df)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [42]:
# Rescale the features with the fitted MinMaxScaler
iris_scaled=scaler.transform(iris_df)

In [43]:
# Put the rescaled values into a DataFrame with the original feature names as columns
iris_df_scaled=pd.DataFrame(data=iris_scaled,columns=iris.feature_names)

In [44]:
# Preview the first rows of the min-max scaled features
iris_df_scaled.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [45]:
# Per-feature maximum after scaling (printed as 1.0 for every column)
print(iris_df_scaled.max())

sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


In [46]:
# Per-feature minimum after scaling (printed as 0.0 for every column)
print(iris_df_scaled.min())

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
