# ***`Feature Encoding`***

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy 
import category_encoders as ce

from scipy.io.arff import loadarff
from sklearn.datasets import load_breast_cancer, load_diabetes

%matplotlib inline

pd.set_option('display.max_columns',50)

In [2]:
def apply_decode(df_name):
    """
    Description: Decode the byte-string cells of a DataFrame into `utf-8` text
    (for example the b'...' values produced when an ARFF file is loaded).
    Input: It accepts one parameter:
    df_name : `Pandas DataFrame`
    Return: a new `utf-8` decoded DataFrame; `df_name` itself is left untouched
    """
    # Work on a copy so the caller's frame is not mutated and the calling
    # cell can be re-run without failing on already-decoded values.
    decoded = df_name.copy()
    for col in decoded.columns:
        # Only object columns can hold `bytes`; numeric columns are skipped.
        if decoded[col].dtype == object:
            # Decode `bytes` only: values that are already text (or missing)
            # pass through unchanged instead of raising AttributeError.
            decoded[col] = decoded[col].map(
                lambda val: val.decode('utf-8') if isinstance(val, bytes) else val
            )
    return decoded

In [3]:
# Build the dataset path with os.path.join so the notebook is not tied to the
# Windows-only '\\' separator.
path = os.getcwd()
file = os.path.join(path, 'Autism-Child-Data.arff')

# Only the first element of loadarff's result (the records) is used here.
autism_dataset = pd.DataFrame(loadarff(file)[0])
autism_dataset.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',6.0,b'm',b'Others',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
1,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',6.0,b'm',b'Middle Eastern ',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
2,b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'1',b'0',b'0',6.0,b'm',b'?',b'no',b'no',b'Jordan',b'yes',5.0,b'4-11 years',b'?',b'NO'
3,b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',5.0,b'f',b'?',b'yes',b'no',b'Jordan',b'no',4.0,b'4-11 years',b'?',b'NO'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',5.0,b'm',b'Others',b'yes',b'no',b'United States',b'no',10.0,b'4-11 years',b'Parent',b'YES'


In [4]:
# Decode the byte strings, then turn the '?' placeholder into a real missing
# value so that isna()/fill methods can see it.
autism_df = apply_decode(autism_dataset)
# Vectorised replace instead of an element-wise applymap + lambda
# (DataFrame.applymap is deprecated in recent pandas releases).
autism_df = autism_df.replace('?', np.nan)
autism_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,,no,no,Jordan,yes,5.0,4-11 years,,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,,yes,no,Jordan,no,4.0,4-11 years,,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


In [5]:
autism_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,,no,no,Jordan,yes,5.0,4-11 years,,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,,yes,no,Jordan,no,4.0,4-11 years,,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


In [6]:
autism_df.dtypes

A1_Score            object
A2_Score            object
A3_Score            object
A4_Score            object
A5_Score            object
A6_Score            object
A7_Score            object
A8_Score            object
A9_Score            object
A10_Score           object
age                float64
gender              object
ethnicity           object
jundice             object
austim              object
contry_of_res       object
used_app_before     object
result             float64
age_desc            object
relation            object
Class/ASD           object
dtype: object

In [7]:
autism_df.isna().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 4
gender              0
ethnicity          43
jundice             0
austim              0
contry_of_res       0
used_app_before     0
result              0
age_desc            0
relation           43
Class/ASD           0
dtype: int64

In [8]:
autism_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [9]:
# The ten A*_Score columns are still object-typed after decoding; cast them
# all to integers in a single astype call.
score_columns = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
                 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
autism_df = autism_df.astype(dict.fromkeys(score_columns, int))

In [10]:
autism_df.dtypes

A1_Score             int32
A2_Score             int32
A3_Score             int32
A4_Score             int32
A5_Score             int32
A6_Score             int32
A7_Score             int32
A8_Score             int32
A9_Score             int32
A10_Score            int32
age                float64
gender              object
ethnicity           object
jundice             object
austim              object
contry_of_res       object
used_app_before     object
result             float64
age_desc            object
relation            object
Class/ASD           object
dtype: object

In [11]:
# Fill each missing value with the next valid observation in its column.
# `fillna(method=...)` is deprecated in recent pandas, and rebinding the name
# (rather than `inplace=True`) keeps the cell safe to re-run.
# Note: a backward fill cannot fill gaps in the final rows of a column, so the
# isna() check that follows is what confirms nothing is left missing.
autism_df = autism_df.bfill()

In [12]:
autism_df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jundice            0
austim             0
contry_of_res      0
used_app_before    0
result             0
age_desc           0
relation           0
Class/ASD          0
dtype: int64

In [13]:
autism_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,Others,no,no,Jordan,yes,5.0,4-11 years,Parent,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,Others,yes,no,Jordan,no,4.0,4-11 years,Parent,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


In [14]:
# Independent copies of the demographic features and of the target column, so
# the encoding experiments below never write back into autism_df.
feature_columns = [
    'age', 'gender', 'ethnicity', 'jundice', 'austim',
    'contry_of_res', 'used_app_before', 'result', 'age_desc', 'relation',
]
target_column = 'Class/ASD'

X = autism_df.loc[:, feature_columns].copy()
y = autism_df.loc[:, target_column].copy()

In [15]:
X.head()

Unnamed: 0,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation
0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent
1,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent
2,6.0,m,Others,no,no,Jordan,yes,5.0,4-11 years,Parent
3,5.0,f,Others,yes,no,Jordan,no,4.0,4-11 years,Parent
4,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent


In [16]:
y.head()

0     NO
1     NO
2     NO
3     NO
4    YES
Name: Class/ASD, dtype: object

## **Ordinal_Encoding**

### **CASE-I**
#### **Using all the values of RELATION variable**

In [17]:
# Normalise the casing of the relation labels (first letter upper-case, rest
# lower-case) so spellings that differ only by case collapse into one category.
X['relation'] = X['relation'].astype(str).str.capitalize()

In [18]:
X['relation'].value_counts()

Parent                      251
Relative                     18
Health care professional     15
Self                          8
Name: relation, dtype: int64

In [19]:
# Explicit category -> integer codes for `relation`; only this column gets a
# user-defined mapping.
relation_codes = {'Self': 1, 'Parent': 2, 'Relative': 3, 'Health care professional': 4}
relation_mapping = [{'col': 'relation', 'mapping': relation_codes}]

oe = ce.OrdinalEncoder(verbose=1, mapping=relation_mapping)

In [20]:
oe.fit(X=X)

OrdinalEncoder(cols=['gender', 'ethnicity', 'jundice', 'austim',
                     'contry_of_res', 'used_app_before', 'age_desc',
                     'relation'],
               mapping=[{'col': 'relation',
                         'mapping': {'Health care professional': 4, 'Parent': 2,
                                     'Relative': 3, 'Self': 1}}],
               verbose=1)

In [21]:
X['oe_relation'] = oe.transform(X)['relation']

In [22]:
X.head()

Unnamed: 0,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,oe_relation
0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,2
1,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,2
2,6.0,m,Others,no,no,Jordan,yes,5.0,4-11 years,Parent,2
3,5.0,f,Others,yes,no,Jordan,no,4.0,4-11 years,Parent,2
4,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,2


In [24]:
X['oe_relation'].value_counts()

2    251
3     18
4     15
1      8
Name: oe_relation, dtype: int64

### **CASE-II**
#### **Manually ordinal-encoding the classes, with one class labelled as -1**

In [25]:
relation_dict = {'Self':1,'Parent':2,'Relative':3,'Health care professional':-1}
relation_dict

{'Self': 1, 'Parent': 2, 'Relative': 3, 'Health care professional': -1}

In [26]:
X['relation'].apply(lambda val: relation_dict[val]).value_counts()

 2    251
 3     18
-1     15
 1      8
Name: relation, dtype: int64

## **Count_Encoding**

### **CASE-I**
#### **Using all the values of RELATION variable**

In [27]:
cnt_end = ce.CountEncoder(cols='relation')

In [28]:
cnt_end.fit(X=X)

CountEncoder(cols=['relation'], combine_min_nan_groups=True)

In [29]:
cnt_end.transform(X)['relation'].value_counts()

251    251
18      18
15      15
8        8
Name: relation, dtype: int64

### **CASE-II**
#### **Manually Count encoding the classes**

In [30]:
# Frequency of every relation label, kept as a plain {label: count} dict ...
relation_counts = X['relation'].value_counts()
relation_cnt_dict = relation_counts.to_dict()
# ... then each label is swapped for its own frequency.
X['relation'].map(relation_cnt_dict).value_counts()

251    251
18      18
15      15
8        8
Name: relation, dtype: int64

## **One-Hot_Encoding**

### **CASE-I**
#### **Using all the values of RELATION variable**

In [31]:
ohe = ce.OneHotEncoder(cols='relation')

In [32]:
ohe.fit(X=X)

  elif pd.api.types.is_categorical(cols):


OneHotEncoder(cols=['relation'])

In [33]:
ohe.transform(X)

Unnamed: 0,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation_1,relation_2,relation_3,relation_4,oe_relation
0,6.0,m,Others,no,no,Jordan,no,5.0,4-11 years,1,0,0,0,2
1,6.0,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,1,0,0,0,2
2,6.0,m,Others,no,no,Jordan,yes,5.0,4-11 years,1,0,0,0,2
3,5.0,f,Others,yes,no,Jordan,no,4.0,4-11 years,1,0,0,0,2
4,5.0,m,Others,yes,no,United States,no,10.0,4-11 years,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,7.0,f,White-European,yes,yes,United Kingdom,no,10.0,4-11 years,1,0,0,0,2
288,7.0,f,White-European,yes,yes,Australia,no,4.0,4-11 years,1,0,0,0,2
289,4.0,m,Latino,no,no,Brazil,no,7.0,4-11 years,1,0,0,0,2
290,4.0,m,South Asian,no,no,India,no,9.0,4-11 years,1,0,0,0,2


### **CASE-II**
#### **Manually one-hot encoding the classes**

##### **Step-1 : Ordinal Encoded variable**

In [34]:
X['oe_relation'].values

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 1, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2,
       2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 4,
       2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4,
       2, 2, 2, 2, 2, 2])

##### **Step-2 : Creating the Identity Matrix based on the unique categories**

In [35]:
# One row/column per distinct ordinal code: row i is the one-hot vector of
# the (i + 1)-th category.
n_categories = X['oe_relation'].nunique()
eye = np.eye(n_categories, dtype=int)
eye

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

##### **Step-3 : Selecting rows of the identity matrix created above, using each ordinal-encoded value (1, 2, 3 or 4) minus one as the row index**

In [36]:
eye[1-1], eye[2-1], eye[3-1], eye[4-1]

(array([1, 0, 0, 0]),
 array([0, 1, 0, 0]),
 array([0, 0, 1, 0]),
 array([0, 0, 0, 1]))

##### **Step-4 : Converting the row into Series**

In [37]:
pd.Series(eye[1-1])

0    1
1    0
2    0
3    0
dtype: int32

#### **Now, applying the above steps to the DataFrame column**

In [38]:
# Reuse the identity matrix `eye` built in Step-2 and index it once with the
# whole column (code - 1 -> row), instead of rebuilding the category set and
# the matrix for every single row inside apply().
# Like the step-by-step version, this relies on the codes being 1..k.
pd.DataFrame(eye[X['oe_relation'].values - 1], index=X.index).head(25)

Unnamed: 0,0,1,2,3
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0
5,0,1,0,0
6,0,1,0,0
7,0,1,0,0
8,0,1,0,0
9,1,0,0,0


https://www.kaggle.com/subinium/11-categorical-encoders-and-benchmark

https://towardsdatascience.com/beyond-one-hot-17-ways-of-transforming-categorical-features-into-numeric-features-57f54f199ea4

In [83]:
se1 = ce.SumEncoder(cols='relation')

In [84]:
se1.fit(X)

  elif pd.api.types.is_categorical(cols):


SumEncoder(cols=['relation'],
           mapping=[{'col': 'relation',
                     'mapping':     relation_0  relation_1  relation_2
 1         1.0         0.0         0.0
 2         0.0         1.0         0.0
 3         0.0         0.0         1.0
 4        -1.0        -1.0        -1.0
-1         0.0         0.0         0.0
-2         0.0         0.0         0.0}])

In [88]:
se1.transform(X,y).tail(50)

Unnamed: 0,intercept,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation_0,relation_1,relation_2,oe_relation
242,1,6.0,m,Middle Eastern,yes,no,Saudi Arabia,no,3.0,4-11 years,1.0,0.0,0.0,2
243,1,9.0,f,Middle Eastern,yes,no,Saudi Arabia,no,3.0,4-11 years,1.0,0.0,0.0,2
244,1,7.0,m,Middle Eastern,yes,no,Jordan,no,6.0,4-11 years,1.0,0.0,0.0,2
245,1,5.0,m,Middle Eastern,no,no,Jordan,no,6.0,4-11 years,1.0,0.0,0.0,2
246,1,5.0,m,Middle Eastern,yes,no,United Arab Emirates,no,4.0,4-11 years,1.0,0.0,0.0,2
247,1,10.0,m,Middle Eastern,no,yes,United Arab Emirates,no,1.0,4-11 years,1.0,0.0,0.0,2
248,1,7.0,m,Middle Eastern,no,no,Jordan,no,3.0,4-11 years,1.0,0.0,0.0,2
249,1,9.0,m,Middle Eastern,yes,no,Egypt,no,6.0,4-11 years,1.0,0.0,0.0,2
250,1,7.0,m,Middle Eastern,yes,no,Egypt,no,7.0,4-11 years,1.0,0.0,0.0,2
251,1,7.0,m,Middle Eastern,yes,no,Egypt,no,8.0,4-11 years,1.0,0.0,0.0,2


In [46]:
# NOTE(review): this import sits mid-notebook while the other imports live in
# the first cell. Also, `load_boston` appears to have been removed from recent
# scikit-learn releases -- confirm against the installed version before
# relying on a fresh Restart & Run All.
from sklearn.datasets import load_boston

In [47]:
boston = load_boston()

In [48]:
XX = pd.DataFrame(boston.data,columns=boston.feature_names)
yy = boston.target

In [60]:
XX.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [82]:
XX['CHAS'].value_counts()

0.0    471
1.0     35
Name: CHAS, dtype: int64

In [74]:
# Column labels: the dataset's feature names followed by the target label.
feats = [*boston.feature_names, 'Label']
# Features and target side by side in one (still unlabelled) frame.
xxx = pd.DataFrame(np.column_stack((XX.values, yy)))

In [76]:
xxx.columns = feats

In [77]:
xxx.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Label
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [78]:
se = ce.SumEncoder(cols='CHAS')

In [79]:
se.fit(X=XX,y=yy)

  elif pd.api.types.is_categorical(cols):


SumEncoder(cols=['CHAS'],
           mapping=[{'col': 'CHAS',
                     'mapping':     CHAS_0
 1     1.0
 2    -1.0
-1     0.0
-2     0.0}])

In [80]:
se.transform(XX).head(30)

Unnamed: 0,intercept,CRIM,ZN,INDUS,CHAS_0,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1,0.00632,18.0,2.31,1.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,1,0.02731,0.0,7.07,1.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,1,0.02729,0.0,7.07,1.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,1,0.03237,0.0,2.18,1.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,1,0.06905,0.0,2.18,1.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,1,0.02985,0.0,2.18,1.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,1,0.08829,12.5,7.87,1.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,1,0.14455,12.5,7.87,1.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,1,0.21124,12.5,7.87,1.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,1,0.17004,12.5,7.87,1.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1
