In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Categorical data**

Categorical variables are usually represented as ‘strings’ or ‘categories’ and are finite in number. 
we can see there are two kinds of categorical data-

* **Ordinal Data**: The categories have an inherent order
* **Nominal Data**: The categories do not have an inherent order

>In Ordinal data, while encoding, one should retain the information regarding the order in which the category is provided. Like in the above example the highest degree a person possesses, gives vital information about his qualification. The degree is an important feature to decide whether a person is suitable for a post or not.

> While encoding Nominal data, we have to consider the presence or absence of a feature. In such a case, no notion of order is present. For example, the city a person lives in. For the data, it is important to retain where a person lives. Here, We do not have any order or sequence.

### **Dataset**

In [97]:
df=pd.DataFrame({'Degree':['High school','Masters','Diploma','Bachelors','Bachelors','Masters','Phd','High school','High school']*100,'gender':['M','F','M','O','F','M','O','F','M']*100,'target':['B','N','S','B','N','S','B','N','B']*100})


In [98]:
df.head()

Unnamed: 0,Degree,gender,target
0,High school,M,B
1,Masters,F,N
2,Diploma,M,S
3,Bachelors,O,B
4,Bachelors,F,N


In [99]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df.drop('target',axis=1),df['target'],test_size=0.3,random_state=42)
X_train,X_test,Y_train,Y_test = pd.DataFrame(np.array(X_train),columns=df.columns[:-1]),pd.DataFrame(np.array(X_test),columns=df.columns[:-1]),np.array(Y_train),np.array(Y_test)

In [100]:
X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

((630, 2), (630,), (270, 2), (270,))

## **Ordinal encoding**

In [101]:
from sklearn.preprocessing import OrdinalEncoder

In [102]:
oe = OrdinalEncoder(categories=[['High school','Masters','Diploma','Bachelors','Phd'],['M','F','O']])

In [103]:
oe.fit(X_train)

OrdinalEncoder(categories=[['High school', 'Masters', 'Diploma', 'Bachelors',
                            'Phd'],
                           ['M', 'F', 'O']])

In [104]:
oe.transform(X_train)
oe.transform(X_test)

array([[0., 1.],
       [0., 0.],
       [4., 2.],
       [3., 2.],
       [3., 2.],
       [2., 0.],
       [2., 0.],
       [2., 0.],
       [0., 0.],
       [1., 0.],
       [3., 1.],
       [2., 0.],
       [0., 0.],
       [3., 2.],
       [0., 0.],
       [1., 0.],
       [4., 2.],
       [0., 0.],
       [4., 2.],
       [3., 1.],
       [3., 1.],
       [3., 2.],
       [3., 1.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [3., 2.],
       [0., 0.],
       [1., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 1.],
       [0., 0.],
       [3., 2.],
       [0., 0.],
       [1., 0.],
       [3., 1.],
       [0., 0.],
       [2., 0.],
       [0., 0.],
       [1., 0.],
       [1., 1.],
       [4., 2.],
       [3., 1.],
       [3., 1.],
       [0., 0.],
       [4., 2.],
       [0., 0.],
       [3., 1.],
       [3., 2.],
       [1., 1.],
       [0., 0.],
       [1., 1.],
       [3., 1.],
       [4., 2.],
       [2., 0.],
       [0., 0.],
       [1., 0.

### **Label encoding (for target variable only)**

In [105]:
from sklearn.preprocessing import LabelEncoder

In [106]:
le = LabelEncoder()

In [107]:
le.fit(Y_train)
le.transform(Y_train)
le.transform(Y_test)

array([1, 0, 0, 0, 0, 2, 2, 2, 0, 2, 1, 2, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0,
       1, 0, 2, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 2, 0, 2, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 0, 0, 1, 1, 1, 0,
       2, 0, 1, 0, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
       2, 0, 1, 2, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0, 0, 1, 0,
       1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 2,
       0, 2, 0, 0, 0, 2, 0, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 0, 1, 1, 0, 2,
       1, 1, 2, 1, 0, 1, 0, 1, 0, 0, 1, 2, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0,
       2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0, 0,
       1, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1,
       2, 0, 0, 2, 1, 1, 0, 2, 2, 0, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 2, 0, 0, 1, 2, 2, 0, 1, 1, 1, 0, 2, 1, 0, 1, 0,
       0, 2, 2, 0, 2, 2])

# **One hot encoding**

### **Dummy variable trap**

> Generally, after one hot encoding, a single column is removed in order to remove multicollinearity between the variables. Multicollinearity arises due to the fact that after one-hot encoding, x1+x2+x3+..=1, which is an equation connecting the encoded columns.

In [108]:
from sklearn.preprocessing import OneHotEncoder

In [109]:
ohe = OneHotEncoder(drop='first') # To avoid multicollinearity, use drop = 'first'

In [110]:
encoded_X_train = ohe.fit_transform(X_train[['Degree','gender']]).toarray()
encoded_X_test = ohe.transform(X_test[['Degree','gender']]).toarray()

In [111]:
np.hstack((X_train[['Degree','gender']].values,encoded_X_train)).shape

(630, 8)

In [112]:
encoded_X_train

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]])

# **Column transformer**
> Apply different type of encoding to each class

In [113]:
from sklearn.compose import ColumnTransformer

In [114]:
transformer = ColumnTransformer(transformers=[
                                              ('tnf1',OrdinalEncoder(categories=[['High school','Masters','Diploma','Bachelors','Phd']]),['Degree']),
                                              ('tnf2',OneHotEncoder(sparse=True,drop='first'),['gender'])
                                              ],remainder='passthrough')

In [119]:
X_train = pd.DataFrame(transformer.fit_transform(X_train),columns=['Degree','g1','g2'])

In [120]:
X_train

Unnamed: 0,Degree,g1,g2
0,3.0,0.0,1.0
1,3.0,0.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
625,0.0,0.0,0.0
626,0.0,1.0,0.0
627,1.0,1.0,0.0
628,3.0,0.0,1.0
