<img src="./images/shouke_logo.png"
     style="float: right; padding-bottom: 100px;"
     width="100"/>
<br>
<br>

<table style="margin-left: auto; margin-right: auto;">
    <tr>
        <td>
            <img src='./images/python-logo.png' width="120">
        </td>
        <td>
            <img src='./images/pandas-logo.png' width="150">
        </td>
        <td>
            <img src='./images/scikit_learn_logo.png' width="150">
        </td>
    </tr>
</table>

<h1 style='text-align: center;'>Encoding Categorical Data</h1>
<h3 style='text-align: center;'>Shouke Wei, Ph.D. Professor</h3>
<h4 style='text-align: center;'>Email: shouke.wei@gmail.com</h4>

## Objective
- Learn how to encode categorical data, i.e. convert categorical values into numerical values

In [5]:
# import required packages
import pandas as pd

# Load the dataset from CSV into a DataFrame.
data_path = './data/gdp_china_variables_seleted.csv'
df = pd.read_csv(data_path)
# Preview the first five rows to check that the file was read as expected.
df.head(5)

Unnamed: 0,prov,year,gdp,pop,finv,trade,fexpen,uinc
0,Guangdong,2000,1.074125,8.65,0.314513,1.408147,0.108032,0.976157
1,Guangdong,2001,1.203925,8.733,0.348443,1.501391,0.132133,1.041519
2,Guangdong,2002,1.350242,8.842,0.385078,1.830169,0.152108,1.11372
3,Guangdong,2003,1.584464,8.963,0.48132,2.346735,0.169563,1.238043
4,Guangdong,2004,1.886462,9.052298,0.587002,2.955899,0.185295,1.362765


## 1. Factorize method

In [6]:
# Integer-encode `prov` on a copy so the original `df` keeps its text labels.
# pd.factorize returns a (codes, uniques) pair; only the codes are stored.
prov_codes, _prov_labels = pd.factorize(df['prov'])
df_copy = df.copy()
df_copy['prov'] = prov_codes
df_copy

Unnamed: 0,prov,year,gdp,pop,finv,trade,fexpen,uinc
0,0,2000,1.074125,8.650000,0.314513,1.408147,0.108032,0.976157
1,0,2001,1.203925,8.733000,0.348443,1.501391,0.132133,1.041519
2,0,2002,1.350242,8.842000,0.385078,1.830169,0.152108,1.113720
3,0,2003,1.584464,8.963000,0.481320,2.346735,0.169563,1.238043
4,0,2004,1.886462,9.052298,0.587002,2.955899,0.185295,1.362765
...,...,...,...,...,...,...,...,...
90,4,2014,3.493824,9.436000,3.078217,0.399111,0.602869,2.367206
91,4,2015,3.700216,9.480000,3.566035,0.459535,0.679935,2.557561
92,4,2016,4.047179,9.532000,4.041509,0.471385,0.745374,2.723292
93,4,2017,4.455283,9.392000,4.449690,0.474870,0.821552,2.955790


In [8]:
# Show the integer codes produced for the original text labels.
# Factorize `df`, not `df_copy`: `df_copy['prov']` already holds codes, so
# factorizing it again only re-encodes integers and would renumber the -1
# sentinel that pd.factorize assigns to missing values.
pd.factorize(df['prov'])[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4], dtype=int64)

## 2. get_dummies method

In [13]:
# One-hot encode `prov` into indicator columns named `prov_<label>`.
# drop_first=True omits the first category, so K labels yield K-1 columns.
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0
# returns boolean True/False columns instead.
dums = pd.get_dummies(df['prov'], prefix='prov', drop_first=True, dtype=int)
dums

Unnamed: 0,prov_Henan,prov_Jiangsu,prov_Shandong,prov_Zhejiang
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
90,1,0,0,0
91,1,0,0,0
92,1,0,0,0
93,1,0,0,0


A categorical variable with K categories (levels) usually enters a regression as a set of K-1 dummy variables. Including all K dummies would cause the **dummy variable trap**: the dummy variables created by one-hot encoding sum to one in every row, so they are perfectly multicollinear with the intercept (and highly correlated with each other), and the regression coefficients cannot be uniquely estimated.

In [14]:
# Attach the indicator columns to the original frame, matching rows by index.
df_coded = df.join(other=dums, how='left')
df_coded

Unnamed: 0,prov,year,gdp,pop,finv,trade,fexpen,uinc,prov_Henan,prov_Jiangsu,prov_Shandong,prov_Zhejiang
0,Guangdong,2000,1.074125,8.650000,0.314513,1.408147,0.108032,0.976157,0,0,0,0
1,Guangdong,2001,1.203925,8.733000,0.348443,1.501391,0.132133,1.041519,0,0,0,0
2,Guangdong,2002,1.350242,8.842000,0.385078,1.830169,0.152108,1.113720,0,0,0,0
3,Guangdong,2003,1.584464,8.963000,0.481320,2.346735,0.169563,1.238043,0,0,0,0
4,Guangdong,2004,1.886462,9.052298,0.587002,2.955899,0.185295,1.362765,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
90,Henan,2014,3.493824,9.436000,3.078217,0.399111,0.602869,2.367206,1,0,0,0
91,Henan,2015,3.700216,9.480000,3.566035,0.459535,0.679935,2.557561,1,0,0,0
92,Henan,2016,4.047179,9.532000,4.041509,0.471385,0.745374,2.723292,1,0,0,0
93,Henan,2017,4.455283,9.392000,4.449690,0.474870,0.821552,2.955790,1,0,0,0


## 3. OneHotEncoder

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array (rather than a SciPy sparse matrix) and drop the first
# category. The keyword for dense output is `sparse_output` from
# scikit-learn 1.2 on; the older `sparse` spelling was removed in 1.4. Try the
# new name first and fall back so the cell runs on both old and new versions.
try:
    enc = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    enc = OneHotEncoder(sparse=False, drop='first')
# The encoder expects 2-D input, hence the double brackets around 'prov'.
trans = enc.fit_transform(df[['prov']])
trans

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],


In [23]:
# Short, hand-written column labels for the encoded province indicators.
# They must be listed in the same order as the columns of `trans`.
# NOTE(review): presumably abbreviations of Henan, Jiangsu, Shandong and
# Zhejiang -- confirm against the encoder's own names, e.g.
# enc.get_feature_names_out(['prov']).
feature_names = [
    'prov_hn',
    'prov_js',
    'prov_sd',
    'prov_zj',
]
feature_names

('prov_hn', 'prov_js', 'prov_sd', 'prov_zj')

In [24]:
# Wrap the encoded array in a DataFrame labelled with `feature_names`.
# Reuse df's index so the rows stay aligned with `df` when the two frames are
# combined column-wise, even if `df` does not carry the default 0..n-1 index.
feature_coded = pd.DataFrame(trans, columns=feature_names, index=df.index)
feature_coded

Unnamed: 0,prov_hn,prov_js,prov_sd,prov_zj
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
90,1.0,0.0,0.0,0.0
91,1.0,0.0,0.0,0.0
92,1.0,0.0,0.0,0.0
93,1.0,0.0,0.0,0.0


In [25]:
# Place the encoded columns beside the original data, then discard the text
# `prov` column that they replace.
df_coded = pd.concat([df, feature_coded], axis=1).drop(columns=['prov'])
df_coded

Unnamed: 0,year,gdp,pop,finv,trade,fexpen,uinc,prov_hn,prov_js,prov_sd,prov_zj
0,2000,1.074125,8.650000,0.314513,1.408147,0.108032,0.976157,0.0,0.0,0.0,0.0
1,2001,1.203925,8.733000,0.348443,1.501391,0.132133,1.041519,0.0,0.0,0.0,0.0
2,2002,1.350242,8.842000,0.385078,1.830169,0.152108,1.113720,0.0,0.0,0.0,0.0
3,2003,1.584464,8.963000,0.481320,2.346735,0.169563,1.238043,0.0,0.0,0.0,0.0
4,2004,1.886462,9.052298,0.587002,2.955899,0.185295,1.362765,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
90,2014,3.493824,9.436000,3.078217,0.399111,0.602869,2.367206,1.0,0.0,0.0,0.0
91,2015,3.700216,9.480000,3.566035,0.459535,0.679935,2.557561,1.0,0.0,0.0,0.0
92,2016,4.047179,9.532000,4.041509,0.471385,0.745374,2.723292,1.0,0.0,0.0,0.0
93,2017,4.455283,9.392000,4.449690,0.474870,0.821552,2.955790,1.0,0.0,0.0,0.0


## 4. Save the data

In [26]:
# Write the encoded table to CSV; index=False keeps the row index out of the file.
output_path = './data/gdp_china_encoded.csv'
df_coded.to_csv(output_path, index=False)