In [None]:
# Label Encoding

# In label encoding, the categories of a feature are sorted alphabetically and each category is represented by an integer value.

In [1]:
# Dataset

import pandas as pd

# Toy dataset: one categorical column (Country) and two numeric columns
# (Age, Salary), written out row by row.
df = pd.DataFrame(
    data=[
        ('UK', 42, 78000),
        ('Argentina', 36, 63000),
        ('China', 48, 100000),
        ('Argentina', 37, 47000),
        ('China', 25, 36000),
    ],
    columns=['Country', 'Age', 'Salary'],
)

df

Unnamed: 0,Country,Age,Salary
0,UK,42,78000
1,Argentina,36,63000
2,China,48,100000
3,Argentina,37,47000
4,China,25,36000


In [2]:
# Label Encoding with scikit-learn

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Encode into a new frame instead of overwriting `df`. If `df['Country']` were
# replaced in place, re-running this cell would re-fit the encoder on the
# already-encoded integers and `label_encoder.classes_` would no longer hold
# the country names.
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values
# (y); OrdinalEncoder is the documented counterpart for input features.
df_label_encoded = df.assign(Country=label_encoder.fit_transform(df['Country']))
df_label_encoded

Unnamed: 0,Country,Age,Salary
0,2,42,78000
1,0,36,63000
2,1,48,100000
3,0,37,47000
4,1,25,36000


In [3]:
# The fitted categories, sorted alphabetically; they are the reference for the
# integer indices produced by the encoder.
label_encoder.classes_

array(['Argentina', 'China', 'UK'], dtype=object)

In [None]:
# One Hot Encoding

# The categories of a feature are sorted alphabetically and each category is represented as a group of bits (one bit per category).

In [4]:
# Dataset

# Fresh copy of the raw dataset for the one-hot encoding example, declared
# column by column.
df = pd.DataFrame(dict(
    Country=['UK', 'Argentina', 'China', 'Argentina', 'China'],
    Age=[42, 36, 48, 37, 25],
    Salary=[78000, 63000, 100000, 47000, 36000],
))

df

Unnamed: 0,Country,Age,Salary
0,UK,42,78000
1,Argentina,36,63000
2,China,48,100000
3,Argentina,37,47000
4,China,25,36000


In [5]:
# One Hot Encoding with scikit-learn

# reshape(-1, 1) is needed because this set of values is treated as a single
# feature: one column, one row per sample.
# `to_numpy()` is used rather than `.values` because it always returns a NumPy
# ndarray; `.values` may return a pandas extension array depending on the
# column dtype, and the reshape below needs an ndarray.
X = df['Country'].to_numpy().reshape(-1, 1)
X

array([['UK'],
       ['Argentina'],
       ['China'],
       ['Argentina'],
       ['China']], dtype=object)

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the Country column, then transform that same column and
# convert the result to a dense array with `.toarray()`.
# NOTE: `X` is overwritten with the encoded array, so re-run the cell that
# builds `X` from `df` before re-running this one.
onehot_encoder = OneHotEncoder().fit(X)
X = onehot_encoder.transform(X).toarray()
X

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [7]:
# Each country is represented by 3 digits, one per category listed here.
onehot_encoder.categories_

[array(['Argentina', 'China', 'UK'], dtype=object)]

In [8]:
# Wrap the encoded array in a DataFrame whose columns are labelled with their
# position as a string: '0', '1', ...
df_onehot = pd.DataFrame(data=X, columns=list(map(str, range(X.shape[1]))))
df_onehot

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0


In [9]:
# Put the one-hot columns in front of the original columns.
# Any one-hot columns already present in `df` are dropped first, so re-running
# this cell does not append duplicate columns.
df = pd.concat(
    [df_onehot, df.drop(columns=df_onehot.columns, errors='ignore')],
    axis=1,
)
df

Unnamed: 0,0,1,2,Country,Age,Salary
0,0.0,0.0,1.0,UK,42,78000
1,1.0,0.0,0.0,Argentina,36,63000
2,0.0,1.0,0.0,China,48,100000
3,1.0,0.0,0.0,Argentina,37,47000
4,0.0,1.0,0.0,China,25,36000


In [10]:
# Remove the original Country column now that its one-hot columns are in the
# frame. errors='ignore' keeps the cell re-runnable: a second run finds no
# 'Country' column and leaves `df` unchanged instead of raising KeyError.
df = df.drop(columns=['Country'], errors='ignore')
df

Unnamed: 0,0,1,2,Age,Salary
0,0.0,0.0,1.0,42,78000
1,1.0,0.0,0.0,36,63000
2,0.0,1.0,0.0,48,100000
3,1.0,0.0,0.0,37,47000
4,0.0,1.0,0.0,25,36000


In [None]:
# Apply One Hot Encoding when:

# The categorical values are nominal (they have no inherent order)
# The number of categories is not too large

# Apply Label Encoding when:

# The categorical values are ordinal (note: the integers follow alphabetical order, so check that this matches the intended ranking)
# The number of categories is relatively large