## Handling Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw customer data from the CSV file next to this notebook
df = pd.read_csv(filepath_or_buffer='customer.csv')

In [10]:
# Peek at five randomly chosen rows to get a feel for the columns
df.sample(n=5)

Unnamed: 0,age,gender,review,education,purchased
45,61,Male,Poor,PG,Yes
30,73,Male,Average,UG,No
9,74,Male,Good,UG,Yes
29,83,Female,Average,UG,Yes
16,59,Male,Poor,UG,Yes


In [12]:
# For this demo we keep only three columns: review, education and purchased.
# Selecting them by name (instead of by position with iloc[:, 2:]) keeps this
# cell idempotent: re-running it yields the same three columns rather than
# slicing more columns off the already-reduced frame.
df = df.loc[:, ['review', 'education', 'purchased']]

Each column needs the encoding that matches its type:

- Gender (Nominal) - One Hot Encoding (not applied in this notebook, since the column is dropped above)
- Review (Ordinal) - Ordinal Encoding
- Education (Ordinal) - Ordinal Encoding
- Purchased (Nominal)(output) - Label Encoding

In [31]:
from sklearn.model_selection import train_test_split

# Features are the first two columns (review, education); the target is the
# last column (purchased). 20% of the rows are held out as the test set.
# random_state pins the shuffle so the split, and every output that depends
# on it, is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:2],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)
x_train

Unnamed: 0,review,education
31,Poor,School
3,Good,PG
24,Average,PG
33,Good,PG
9,Good,UG
10,Good,UG
0,Average,School
27,Poor,PG
19,Poor,PG
8,Average,UG


### Ordinal Encoding

In [32]:
from sklearn.preprocessing import OrdinalEncoder

In [33]:
# Spell out each ordinal scale from lowest to highest rank; the position of a
# label in its list is the number it gets encoded as.
review_order = ['Poor', 'Average', 'Good']
education_order = ['School', 'UG', 'PG']

# One category list per feature column, in the same order as the columns of x_train
oe = OrdinalEncoder(categories=[review_order, education_order])

# Fit the encoder on the training features
oe.fit(x_train)

In [34]:
# Encode both feature splits with the encoder fitted on the training data.
# The test features must go through the same transform; otherwise x_test
# would still hold raw strings while x_train is numeric.
# Note: transform returns NumPy arrays, so x_train / x_test stop being DataFrames here.
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)

In [35]:
x_train
# Note that the ordinal columns now hold numerical data: each label has been
# replaced by its position in the category list given to the encoder.

array([[0., 0.],
       [2., 2.],
       [1., 2.],
       [2., 2.],
       [2., 1.],
       [2., 1.],
       [1., 0.],
       [0., 2.],
       [0., 2.],
       [1., 1.],
       [0., 0.],
       [0., 1.],
       [2., 0.],
       [1., 2.],
       [0., 2.],
       [1., 0.],
       [0., 1.],
       [0., 2.],
       [1., 1.],
       [1., 1.],
       [0., 1.],
       [2., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [2., 0.],
       [1., 0.],
       [0., 0.],
       [0., 2.],
       [2., 2.],
       [2., 0.],
       [1., 0.],
       [2., 0.],
       [1., 2.],
       [0., 2.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [1., 1.],
       [2., 1.]])

In [36]:
# List the ordinal categories the encoder holds for each feature column
oe.categories_
# Returns one array per column, with the labels in the order they were passed
# to OrdinalEncoder (lowest rank first)

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

### Label Encoding

In [45]:
from sklearn.preprocessing import LabelEncoder

# Build the encoder used for the target labels
le = LabelEncoder()

# Learn the label classes from the training target (the output column)
le.fit(y=y_train)

In [46]:
# View the classes the label encoder learned from y_train.
# NOTE(review): on a fresh kernel y_train holds the raw labels of the
# purchased column (Yes / No in the sample above), so this should list those
# strings. An output of [0, 1] means the encoder was re-fitted on labels that
# had already been encoded, i.e. cells were run out of order. Confirm with
# Restart & Run All.
le.classes_

array([0, 1])

In [47]:
# Replace the labels in both target splits with their integer codes,
# using the encoder fitted on the training target
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [51]:
y_train
# Note that the training labels have been encoded into integer values

array([1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [50]:
# The held-out labels, encoded with the same LabelEncoder as y_train
y_test

array([1, 1, 1, 0, 0, 1, 1, 1, 0, 0], dtype=int64)