# Categorical Variables
### One-Hot-Encoding (Dummy Variables)

In [1]:
# importing the dataset

import os
import mglearn
import pandas as pd

# adult.data is read with header=None, so the column labels are supplied
# by hand here, in file order.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")

# The 7 columns kept for the encoding demo, in the order they are displayed.
kept_columns = [
    'age', 'workclass', 'education', 'gender',
    'hours-per-week', 'occupation', 'income',
]

# Read the full file and narrow it to the kept columns in one expression,
# so `data` is only ever bound to the filtered frame.
data = pd.read_csv(
    adult_path,
    names=names,
    header=None,
    index_col=False,
)[kept_columns]
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [2]:
# Apply one-hot encoding with pd.get_dummies: every string-valued column is
# expanded into one indicator column per distinct value, while the numeric
# columns (age, hours-per-week) pass through unchanged.
#
# dtype is pinned to uint8 because the default indicator dtype differs
# between pandas versions (uint8 before 2.0, bool from 2.0 on). Pinning it
# keeps the 0/1 output identical regardless of the installed pandas.
data_dummies = pd.get_dummies(data, dtype='uint8')
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [4]:
# (rows, columns) of the one-hot-encoded frame produced by pd.get_dummies
data_dummies.shape

(32561, 46)

### One-Hot-Encoding (Integer Features)

In [2]:
# creating the dataset

import pandas as pd
# Toy frame with one integer-coded column and one string column, built
# row by row: (integer feature, categorical feature).
demo_rows = [
    (0, 'socks'),
    (1, 'fox'),
    (2, 'socks'),
    (1, 'box'),
]
demo_df = pd.DataFrame(demo_rows, columns=['Integer Feature', 'Categorical Feature'])
demo_df

Unnamed: 0,Integer Feature,Categorical Feature
0,0,socks
1,1,fox
2,2,socks
3,1,box


In [3]:
# One-hot encode both columns by naming them explicitly. By default
# pd.get_dummies leaves numeric columns untouched, so the integer-coded
# feature must be listed in `columns=` to be treated as categorical.
# Listing it there (instead of casting the column to str first) leaves
# demo_df itself unmodified, so the frame shown by the previous cell stays
# accurate and this cell can be re-run safely.
#
# dtype is pinned to uint8 so the indicators render as 0/1 on every pandas
# version (the default changed from uint8 to bool in pandas 2.0).
pd.get_dummies(
    demo_df,
    columns=['Integer Feature', 'Categorical Feature'],
    dtype='uint8',
)

Unnamed: 0,Integer Feature_0,Integer Feature_1,Integer Feature_2,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,1,0,0,0,0,1
1,0,1,0,0,1,0
2,0,0,1,0,0,1
3,0,1,0,1,0,0


# Binning and Discretization

In [1]:
# importing the dataset

import mglearn

# Generate 100 samples with mglearn's make_wave helper, unpacked as (X, y).
# NOTE(review): X appears to be 2-D with a single feature column (the
# digitize output below has shape (n, 1)) — confirm against mglearn.
X, y = mglearn.datasets.make_wave(n_samples=100)

In [2]:
# binning

import numpy as np

# 11 evenly spaced edges over [-3, 3] delimit 10 equal-width bins.
bins = np.linspace(start=-3, stop=3, num=11)

# For every value in X, record the index of the bin it falls into.
which_bin = np.digitize(X, bins)

# Peek at the bin membership of the first five samples.
which_bin[:5]

array([[ 4],
       [10],
       [ 8],
       [ 6],
       [ 2]])

In [3]:
# discretization (one-hot-encoding)

from sklearn.preprocessing import OneHotEncoder

# Build a dense-output encoder in a way that works across scikit-learn
# releases: the `sparse` argument was renamed to `sparse_output` in 1.2 and
# the old spelling was removed in 1.4, where it raises a TypeError.
try:
    encoder = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # older scikit-learn releases

# Learn the distinct bin indices present in which_bin and expand each sample
# into a one-hot row with one column per observed bin.
X_binned = encoder.fit_transform(which_bin)

X_binned[:5]

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])