## Set up the DataFrame

In [106]:
import pandas as pd
import numpy as np

# Raw observations, one list per column; position i in every list belongs to the same row.
# NOTE(review): the 0 in `age` and the 0 in `salary` look like placeholders for
# missing values — TODO confirm before modelling.
country = ['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', 'Spain', 'France', 'Germany', 'France']
age = [44, 27, 30, 38, 40, 35, 0, 48, 50, 37]
salary = [72000, 48000, 54000, 61000, 0, 58000, 52000, 79000, 83000, 67000]
purchased = ['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']

list_labels = ['Country', 'Age', 'Salary', 'Purchased']
list_cols = [country, age, salary, purchased]

# Pair each column label with its values and build the frame from that mapping.
zipped = list(zip(list_labels, list_cols))
df = pd.DataFrame(dict(zipped))

# 'Country' deliberately stays an ordinary column at position 0. The previous
# `df.set_index('Country')` call returned a new frame that was thrown away, so it
# never changed `df`; the encoding cells further down address Country as column 0.
print(df)


   Country  Age  Salary Purchased
0   France   44   72000        No
1    Spain   27   48000       Yes
2  Germany   30   54000        No
3    Spain   38   61000        No
4  Germany   40       0       Yes
5   France   35   58000       Yes
6    Spain    0   52000        No
7   France   48   79000       Yes
8  Germany   50   83000        No
9   France   37   67000       Yes


## Import Label Encoder class

In [107]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single LabelEncoder on 'Purchased'
# would overwrite the classes learned for 'Country', making it impossible to
# map the country codes back to their names with `inverse_transform`.
country_encoder = LabelEncoder()
purchased_encoder = LabelEncoder()

# Column 0 is 'Country', column 3 is 'Purchased'; each distinct value gets an
# integer code assigned in sorted order of the labels.
labelencode_index = country_encoder.fit_transform(df.iloc[:, 0])
labelencode_purchased = purchased_encoder.fit_transform(df.iloc[:, 3])

# Backward compatibility: `labelencoder` used to end this cell fitted on
# 'Purchased', so keep that name bound to the equivalent encoder.
labelencoder = purchased_encoder

print(labelencode_index)
print(labelencode_purchased)

[0 2 1 2 1 0 2 0 1 0]
[0 1 0 0 1 1 0 1 0 1]


`df` is a pandas DataFrame, so it cannot be sliced with NumPy-style syntax such as `df[:, 3]`. Select a column by position with `df.iloc[:, 3]`, or slice the underlying array with `df.values[:, 3]`.

## Check the DataFrame after label encoding

In [108]:
# NOTE: plain assignment binds a second name to the same DataFrame object — no
# copy is made — so the column overwrites below are also visible through `df`.
df_labelencoder = df

# Swap the text categories for their integer codes.
df_labelencoder['Country'] = labelencode_index
df_labelencoder['Purchased'] = labelencode_purchased

print(df_labelencoder)

   Country  Age  Salary  Purchased
0        0   44   72000          0
1        2   27   48000          1
2        1   30   54000          0
3        2   38   61000          0
4        1   40       0          1
5        0   35   58000          1
6        2    0   52000          0
7        0   48   79000          1
8        1   50   83000          0
9        0   37   67000          1


The problem is that the `Country` column now contains the numbers 0, 1 and 2, so a model may <b>misinterpret the categories as being ordered, 0 < 1 < 2.</b> The countries have no such order. To overcome this problem, we use `One Hot Encoder`, which replaces the single column with one 0/1 indicator column per category.

## One Hot Encoder

In [109]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20
# and removed in 0.22. Choosing which columns to encode is now the job of
# ColumnTransformer, which applies the encoder to column 0 ('Country') only.
onehotencoder = OneHotEncoder()
country_transformer = ColumnTransformer(
    transformers=[('country', onehotencoder, [0])],
    remainder='passthrough',  # keep Age, Salary and Purchased unchanged, after the indicator columns
    sparse_threshold=0,       # always return a dense NumPy array (replaces the old `.toarray()`)
)

# Make sure 'Purchased' (column 3) holds its integer codes so the result is fully numeric.
df.iloc[:, 3] = labelencode_purchased

# NOTE: as before, this rebinds `df` from a DataFrame to a 2-D NumPy array laid
# out as [one indicator column per country..., Age, Salary, Purchased].
# ColumnTransformer fits a clone of `onehotencoder`; the fitted encoder is
# available as `country_transformer.named_transformers_['country']`.
df = country_transformer.fit_transform(df)


In [110]:
# At this point `df` is the NumPy array returned by the one-hot encoding cell
# above (no longer a DataFrame), so this prints the raw numeric matrix.
print(df)

[[1.0e+00 0.0e+00 0.0e+00 4.4e+01 7.2e+04 0.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 2.7e+01 4.8e+04 1.0e+00]
 [0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04 0.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 3.8e+01 6.1e+04 0.0e+00]
 [0.0e+00 1.0e+00 0.0e+00 4.0e+01 0.0e+00 1.0e+00]
 [1.0e+00 0.0e+00 0.0e+00 3.5e+01 5.8e+04 1.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 0.0e+00 5.2e+04 0.0e+00]
 [1.0e+00 0.0e+00 0.0e+00 4.8e+01 7.9e+04 1.0e+00]
 [0.0e+00 1.0e+00 0.0e+00 5.0e+01 8.3e+04 0.0e+00]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04 1.0e+00]]
