### Import the libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

### Load the dataset

In [2]:
# Load the customer dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r'data/customer.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


* gender - no natural ordering - Nominal Categorical Data - One-Hot Encoding is needed
* review - natural ordering (Poor, Average, Good) - Ordinal Categorical Data - Ordinal Encoding is needed
* education - natural ordering (School, UG, PG) - Ordinal Categorical Data - Ordinal Encoding is needed
* purchased - no natural ordering - Nominal Categorical Data - Label Encoding is needed

**Scikit-Learn provides ColumnTransformer and Pipelines, which can do this task easily for us. I'll be learning them in future videos.**

In [4]:
# Keep only the three categorical columns used in this demo; `age` and `gender`
# are dropped. Selecting by name (rather than `iloc[:, 2:]`) keeps this cell
# idempotent: re-running it yields the same frame instead of slicing away two
# more columns on every execution.
df = df[['review', 'education', 'purchased']]

In [5]:
# Confirm that only review, education and purchased remain.
df.head(n=5)

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


### Train Test Split

In [6]:
# Features are the two ordinal columns; the target is `purchased`.
# Columns are picked by name so the feature order (review, then education) is
# explicit and does not depend on the positional layout of `df`.
X = df[['review', 'education']]
y = df['purchased']

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity-check the sizes of the four splits (features/target x train/test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((40, 2), (10, 2), (40,), (10,))

### Ordinal Encoder

In [9]:
# One category list per feature column, in column order (review, education).
# Each list runs from lowest to highest, so a label is encoded as its position.
ordered_categories = [
    ['Poor', 'Average', 'Good'],
    ['School', 'UG', 'PG'],
]
oe = OrdinalEncoder(categories=ordered_categories)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train_oe = oe.fit_transform(X_train)
X_test_oe = oe.transform(X_test)

In [10]:
# Wrap the encoded values back into DataFrames that carry the original column names.
# No index is passed, so the rows get a fresh 0..n-1 index instead of the shuffled
# row labels that X_train / X_test carry.
X_train_oe, X_test_oe = (
    pd.DataFrame(encoded, columns=source.columns)
    for encoded, source in ((X_train_oe, X_train), (X_test_oe, X_test))
)

In [11]:
# Original (string-valued) training features, for comparison with the encoded frame.
X_train.head(n=5)

Unnamed: 0,review,education
12,Poor,School
4,Average,UG
37,Average,PG
8,Average,UG
3,Good,PG


In [12]:
# Encoded training features: each label is replaced by its position in the category list.
X_train_oe.head(n=5)

Unnamed: 0,review,education
0,0.0,0.0
1,1.0,1.0
2,1.0,2.0
3,1.0,1.0
4,2.0,2.0


* The ordering starts from 0.

In [13]:
# Show the per-column category order held by the fitted encoder.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

### Label Encoder

In [14]:
# Learn the class labels from the training target only, then encode both splits
# with that same fitted mapping.
le = LabelEncoder().fit(y_train)
y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)

In [15]:
# Wrap each encoded target in a one-column DataFrame named 'y'.
y_train_le, y_test_le = (
    pd.DataFrame(encoded, columns=['y'])
    for encoded in (y_train_le, y_test_le)
)

In [16]:
# Original (string-valued) training target, for comparison with the encoded version.
y_train.head(n=5)

12     No
4      No
37    Yes
8      No
3      No
Name: purchased, dtype: object

In [17]:
# Encoded training target (integer class codes).
y_train_le.head(n=5)

Unnamed: 0,y
0,0
1,0
2,1
3,0
4,0


* Yes -> 1
* No -> 0

**LabelEncoder should be used only on the OUTPUT (target) variable, not on input features.**