In [3]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os
import time

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# import seaborn as sns
# sns.set()


# Ignore useless warnings (see SciPy issue #5998)
# import warnings
# warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [9]:
# Paths ...
# Directory the kernel is currently running in.
PROJECT_ROOT_DIR = os.getcwd()
# Data folder two directory levels above the working directory. Built with
# os.path.join / os.pardir so the separator is correct on every OS; the former
# '..\..\data' literal only resolved on Windows and relied on the invalid
# escape sequences '\.' and '\d'.
PATH_TO_DATA = os.path.join(os.pardir, os.pardir, 'data')

# Encoding Categorical Variables


## Table of contents
1. [TL;DR](#tl-dr)
2. 
3. 


## Label Encoding

![label-encoding.png](./images/label-encoding.png)

## One Hot Encoding

![one-hot-encoding.png](./images/one-hot-encoding.png)

## Data

Credit Approval Data Set [sources](https://archive.ics.uci.edu/ml/datasets/credit+approval)

### Data Set Information:

This file concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect confidentiality of the data.

This dataset is interesting because there is a good mix of attributes -- continuous, nominal with small numbers of values, and nominal with larger numbers of values. `There are also a few missing values.`


#### Attribute Information:

A1: b, a.

A2: continuous.

A3: continuous.

A4: u, y, l, t.

A5: g, p, gg.

A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.

A7: v, h, bb, j, n, z, dd, ff, o.

A8: continuous.

A9: t, f.

A10: t, f.

A11: continuous.

A12: t, f.

A13: g, p, s.

A14: continuous.

A15: continuous.

A16: +,- (class attribute)

In [18]:
# Read the raw file with header=None, so no row is consumed as column names.
crx_file = os.path.join(PATH_TO_DATA, 'crx', 'crx.data')
data = pd.read_csv(crx_file, header=None)

# Label the 16 columns A1 ... A16.
varnames = ['A{}'.format(idx) for idx in range(1, 17)]
data.columns = varnames

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [19]:
# Question marks stand for missing entries: turn them into NaN, then cast
# A2 and A14 to float now that no '?' strings remain in them.
data = (
    data
    .replace('?', np.nan)
    .astype({'A2': 'float', 'A14': 'float'})
)

# Binarise the target: '+' becomes 1 and '-' becomes 0.
data['A16'] = data['A16'].map({'+': 1, '-': 0})

# Preview the result
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [26]:
# Categorical variables: every column whose dtype is object ('O').
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']
data[cat_cols].head(2)

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,b,u,g,w,v,t,t,f,g
1,a,u,g,q,h,t,t,f,g


In [23]:
# Numerical variables: every column whose dtype is anything but object ('O').
num_cols = [name for name, dtype in data.dtypes.items() if dtype != 'O']
data[num_cols].head(2)

Unnamed: 0,A2,A3,A8,A11,A14,A15,A16
0,30.83,0.0,1.25,1,202.0,0,1
1,58.67,4.46,3.04,6,43.0,560,1


In [22]:
# Count the missing entries in each column.
data.isna().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

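### Alert !!!

The next cell fills the missing values with placeholders: `0` for the numerical columns and the label `'Missing'` for the categorical ones.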

In [27]:
# Impute missing values: 0 for the numerical columns, the label 'Missing'
# for the categorical ones.
for cols, filler in ((num_cols, 0), (cat_cols, 'Missing')):
    data[cols] = data[cols].fillna(filler)

# Confirm that no nulls are left.
data.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

## Summary

## What Next ?



## Resources


### Books
1. [[1.1.] Python Feature Engineering Cookbook](https://www.packtpub.com/product/python-feature-engineering-cookbook/9781789806311)
    1. [GitHub](https://github.com/PacktPublishing/Python-Feature-Engineering-Cookbook)


### Papers
0. [[2.0.] ...](#)


### Web
0. [[3.0.] ...](#)


### Images
0. [[4.0.] ...](#)

pip install feature-engine

pip install category_encoders

<!--
https://github.com/scikit-learn-contrib/scikit-learn-contrib/blob/master/README.md

https://www.trainindata.com/feature-engine

https://feature-engine.readthedocs.io/en/latest/encoders/index.html
-->