# Mengenal Categorical Encoding: Label Encoding & One Hot Encoding

## Label Encoding

### Dataset

In [4]:
import pandas as pd
# Toy dataset: one categorical column (country) and two numeric columns.
df = pd.DataFrame(
    [
        ('India', 44, 72000),
        ('US', 34, 65000),
        ('Japan', 46, 98000),
        ('US', 35, 45000),
        ('Japan', 23, 34000),
    ],
    columns=['country', 'age', 'salary'],
)
df

Unnamed: 0,country,age,salary
0,India,44,72000
1,US,34,65000
2,Japan,46,98000
3,US,35,45000
4,Japan,23,34000


### Label Encoding pada Scikit Learn

In [7]:
from sklearn.preprocessing import LabelEncoder
# Replace every country name with an integer code.
# Note: this overwrites the 'country' column of df in place.
label_encoder = LabelEncoder()
df['country'] = label_encoder.fit(df['country']).transform(df['country'])
df

Unnamed: 0,country,age,salary
0,0,44,72000
1,2,34,65000
2,1,46,98000
3,2,35,45000
4,1,23,34000


In [9]:
# classes_ lists the original labels; a label's position in this array is
# the integer code it was given in the encoded 'country' column.
label_encoder.classes_

array(['India', 'Japan', 'US'], dtype=object)

## One Hot Encoding

### Dataset

In [13]:
# Rebuild the dataset from scratch: the label-encoding section overwrote
# the 'country' column with integer codes, so start again from the raw names.
df = pd.DataFrame(
    dict(
        country=['India', 'US', 'Japan', 'US', 'Japan'],
        age=[44, 34, 46, 35, 23],
        salary=[72000, 65000, 98000, 45000, 34000],
    )
)
df

Unnamed: 0,country,age,salary
0,India,44,72000
1,US,34,65000
2,Japan,46,98000
3,US,35,45000
4,Japan,23,34000


### One Hot Encoding pada Scikit Learn

In [16]:
# Turn the 'country' column into a single-feature matrix of shape
# (n_rows, 1); this is the array handed to the encoder in the next cell.
X = df.loc[:, 'country'].values.reshape(len(df), 1)
X

array([['India'],
       ['US'],
       ['Japan'],
       ['US'],
       ['Japan']], dtype=object)

In [18]:
from sklearn.preprocessing import OneHotEncoder
# Fit a one-hot encoder on the country names and materialise the result as a
# dense array (fit_transform's result is converted with .toarray()).
onehot_encoder = OneHotEncoder()

# Always encode from the DataFrame column instead of from X itself.
# The previous form, X = f(X), was self-referential: re-running this cell
# alone fed the already-encoded float matrix back into the encoder and
# silently produced a wrong matrix and wrong categories_.
X = onehot_encoder.fit_transform(
    df['country'].values.reshape(-1, 1)
).toarray()
X

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [20]:
# categories_ is a list with one array per encoded feature (here only
# 'country'); the label order matches the column order of the one-hot matrix.
onehot_encoder.categories_

[array(['India', 'Japan', 'US'], dtype=object)]

In [22]:
# Wrap the encoded matrix in a DataFrame; columns are labelled by position
# as the strings "0", "1", ... (one per one-hot column).
df_onehot = pd.DataFrame(
    data=X,
    columns=list(map(str, range(X.shape[1]))),
)
df_onehot

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [24]:
# Put the one-hot columns side by side with the original columns
# (one-hot columns first), aligning rows on the index.
df = pd.concat(objs=[df_onehot, df], axis='columns')
df

Unnamed: 0,0,1,2,country,age,salary
0,1.0,0.0,0.0,India,44,72000
1,0.0,0.0,1.0,US,34,65000
2,0.0,1.0,0.0,Japan,46,98000
3,0.0,0.0,1.0,US,35,45000
4,0.0,1.0,0.0,Japan,23,34000


In [26]:
# The raw 'country' text column is now redundant with the one-hot columns,
# so remove it.
df = df.drop(columns=['country'])
df

Unnamed: 0,0,1,2,age,salary
0,1.0,0.0,0.0,44,72000
1,0.0,0.0,1.0,34,65000
2,0.0,1.0,0.0,46,98000
3,0.0,0.0,1.0,35,45000
4,0.0,1.0,0.0,23,34000
