## OneHotEncoder

In [1]:
from sklearn.preprocessing import OneHotEncoder
# Encoder created with default settings; it is fitted on 'Feature1' in a later cell.
oneHotEncoder = OneHotEncoder()

In [2]:
# Toy dataset: two categorical columns with four rows each.
data = {
    'Feature1': ['A', 'B', 'A', 'C'],
    'Feature2': ['High', 'Low', 'Medium', 'Low'],
}

In [3]:
import pandas as pd

In [4]:
# Build a DataFrame from the dict of columns; each key becomes a column name.
df_data = pd.DataFrame(data)

In [5]:
# Show the three parts of the DataFrame: row index, column labels, and the values array.
df_data.index, df_data.columns, df_data.values

(RangeIndex(start=0, stop=4, step=1),
 Index(['Feature1', 'Feature2'], dtype='object'),
 array([['A', 'High'],
        ['B', 'Low'],
        ['A', 'Medium'],
        ['C', 'Low']], dtype=object))

In [6]:
# Check what type the .values attribute returns.
type(df_data.values)

numpy.ndarray

In [7]:
# Learn the distinct categories of 'Feature1'.
# The double brackets select a one-column DataFrame (2-D) rather than a Series (1-D).
oneHotEncoder.fit(df_data[['Feature1']])

In [8]:
# categories_ is a list holding one array of learned categories per fitted column.
# (It is a plain Python list, so `.index` on it is just the built-in list method
# and carries no information about the encoder.)
oneHotEncoder.categories_

[array(['A', 'B', 'C'], dtype=object)]

In [9]:
# Encode 'Feature1' and convert the result to a dense array via .toarray().
# Each row has a 1.0 in the column of its category and 0.0 elsewhere.
encoder_array = oneHotEncoder.transform(df_data[['Feature1']]).toarray()
encoder_array

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [10]:
# Wrap the encoded array in a DataFrame. Column names such as 'Feature1_A'
# come from the fitted encoder instead of being typed by hand.
# Reusing df_data's index keeps the rows aligned when the two frames are
# concatenated column-wise, even if df_data does not have a default RangeIndex.
df_encoder = pd.DataFrame(
    encoder_array,
    columns=oneHotEncoder.get_feature_names_out(['Feature1']),
    index=df_data.index,
)
df_encoder

Unnamed: 0,Feature1_A,Feature1_B,Feature1_C
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0


In [11]:
# Put the one-hot columns to the right of the original columns.
pd.concat(objs=[df_data, df_encoder], axis="columns")

Unnamed: 0,Feature1,Feature2,Feature1_A,Feature1_B,Feature1_C
0,A,High,1.0,0.0,0.0
1,B,Low,0.0,1.0,0.0
2,A,Medium,1.0,0.0,0.0
3,C,Low,0.0,0.0,1.0


## Imbalanced Data Sampling

### under sampling : Tomek Links
- scikit-learn과 연계된 패키지인 imbalanced-learn(imblearn)에 포함된 기능
- imblearn이 설치되어 있지 않은 경우: %conda install imbalanced-learn
- TomekLinks의 결과값은 NumPy 배열(numpy.ndarray)로 반환됨

In [12]:
from imblearn.under_sampling import TomekLinks

In [13]:
from sklearn.datasets import make_classification # generates random data for this exercise

In [14]:
# Synthetic two-class dataset with a 30% / 70% class split.
# random_state is fixed so the same data is generated on every run.
features, target = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.3, 0.7],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

In [15]:
# Shapes of the feature matrix and the label vector.
features.shape, target.shape

((1000, 20), (1000,))

In [16]:
## Counter tallies how often each value occurs (used below on the label array).
## The same kind of counting is common in NLP, e.g. drawing frequent words larger and rare words smaller.
from collections import Counter

In [17]:
Counter(target) # check the class imbalance of target

Counter({0: 300, 1: 700})

Counter(target) = Counter({0: 300, 1: 700}) → target 범주의 0은 300개, 1은 700개 (불균형)
- undersampling 진행 = TomekLinks

In [18]:
tomekLinks = TomekLinks() # instantiate TomekLinks; it still has to be fitted (fit_resample in a later cell)

In [19]:
features_resample, target_resample = tomekLinks.fit_resample(features, target) ## features and target must be resampled together so rows and labels stay matched

In [20]:
# Shapes after under-sampling, for comparison with the original (1000, 20) / (1000,).
features_resample.shape, target_resample.shape

((996, 20), (996,))

In [21]:
# Class counts after under-sampling.
Counter(target_resample)

Counter({0: 300, 1: 696})

### over sampling : SMOTE

In [22]:
from imblearn.over_sampling import SMOTE

In [23]:
# SMOTE creates synthetic minority-class samples using random draws.
# Fixing the seed makes the generated samples identical on every
# Restart & Run All (same seed as make_classification above).
smote = SMOTE(random_state=10)

In [24]:
# Over-sample the original (not the Tomek-resampled) data; features and labels are resampled together.
feature_over_sample, target_over_sample = smote.fit_resample(features, target)

In [25]:
# Shapes after over-sampling, for comparison with the original (1000, 20) / (1000,).
feature_over_sample.shape, target_over_sample.shape

((1400, 20), (1400,))

In [26]:
# Class counts after over-sampling.
Counter(target_over_sample)

Counter({0: 700, 1: 700})