# Chapter 5: Handling Categorical Data

In [2]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

## 5.1 Encoding Nominal Categorical Features

#### Create feature

In [3]:
# One nominal feature with five observations, stored as a single column.
feature = np.array([[state] for state in
                    ("Texas", "California", "Texas", "Delaware", "Texas")])

#### Create one-hot encoder

In [4]:
oneHot = LabelBinarizer()

#### One-hot encode feature

In [5]:
oneHot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [7]:
featOH = oneHot.fit_transform(feature)

#### View feature classes:

In [6]:
oneHot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

#### To reverse the one-hot encoding

In [9]:
oneHot.inverse_transform(featOH)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

#### Create dummy values from the feature:

In [10]:
# pandas >= 2.0 returns boolean dummy columns by default; dtype=int keeps the
# 0/1 indicator values regardless of the installed pandas version.
pd.get_dummies(feature[:,0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


#### Some features have multiple classes

In [11]:
# Create multiclass feature: each observation carries two class labels.
# NOTE: the fourth row's label is spelled "Delaware" (it previously read
# "Delware"); any recorded classes_ output shows the old spelling until the
# notebook is re-run.
multiclassFeature = [("Texas", "Florida"),
                     ("California", "Alabama"),
                     ("Texas", "Florida"),
                     ("Delaware", "Florida"),
                     ("Texas", "Alabama")]

#### Create multiclass one-hot encoder

In [12]:
oneHotMulticlass = MultiLabelBinarizer()

#### One-hot encode multiclass feature

In [13]:
oneHotMulticlass.fit_transform(multiclassFeature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

#### View the classes

In [14]:
oneHotMulticlass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

## 5.2 Encoding Ordinal Categorical Features

#### Create features

In [17]:
df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})
df

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High


#### Create mapper

In [16]:
scaleMapper = {"Low":1,
              "Medium":2,
              "High":3}

#### Replace feature values with scale

In [18]:
# Map each label to its ordinal value. Series.map is used instead of
# Series.replace because replace's implicit object-to-numeric downcast is
# deprecated in recent pandas releases; every label in df has a mapper entry.
df["Score"].map(scaleMapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [19]:
df = pd.DataFrame({"Score": ["Low", 
                             "Low", 
                             "Medium", 
                             "Medium", 
                             "High", 
                             "Barely More Than Medium"]})
df

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High
5,Barely More Than Medium


In [20]:
scaleMapper = {"Low":1, 
               "Medium":2, 
               "Barely More Than Medium":3, 
               "High":4}

In [21]:
# Map each label (including "Barely More Than Medium") to its ordinal value.
# Series.map avoids the deprecated implicit downcast that Series.replace
# relies on to return integers; every label in df has a mapper entry.
df["Score"].map(scaleMapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

#### Be conscious of the values mapped to classes

In [22]:
scaleMapper = {"Low":1, 
               "Medium":2, 
               "Barely More Than Medium":2.1, 
               "High":3}

In [23]:
# Map each label to its (now partly fractional) ordinal value. Series.map
# avoids the deprecated implicit downcast that Series.replace relies on;
# every label in df has a mapper entry, so the result is a float column.
df["Score"].map(scaleMapper)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

## 5.3 Encoding Dictionaries of Features

In [24]:
from sklearn.feature_extraction import DictVectorizer

#### Create dictionary

In [25]:
dataDict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

#### Create dictionary vectorizer

In [26]:
dictVectorizer = DictVectorizer(sparse=False)

Note: by default `DictVectorizer` outputs a sparse matrix, which stores only the non-zero elements; pass `sparse=False` to get a dense array instead.

#### Convert dictionary to feature matrix

In [27]:
features = dictVectorizer.fit_transform(dataDict)

#### View feature matrix

In [28]:
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

#### Get the feature names

In [29]:
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use get_feature_names_out() when it exists and fall back to
# the old method on older releases; tolist() keeps featureNames a plain list.
if hasattr(dictVectorizer, "get_feature_names_out"):
    featureNames = dictVectorizer.get_feature_names_out().tolist()
else:
    featureNames = dictVectorizer.get_feature_names()

In [30]:
featureNames

['Blue', 'Red', 'Yellow']

#### Can use pandas DF

In [31]:
pd.DataFrame(features, columns=featureNames)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


#### Create word counts dicts for four documents

In [32]:
doc1WordCt = {"Red": 2, "Blue": 4}
doc2WordCt = {"Red": 4, "Blue": 3}
doc3WordCt = {"Red": 1, "Yellow": 2}
doc4WordCt = {"Red": 2, "Yellow": 2}

#### Create list

In [33]:
docWordCounts = [doc1WordCt, 
                 doc2WordCt, 
                 doc3WordCt, 
                 doc4WordCt]

In [34]:
docWordCounts

[{'Red': 2, 'Blue': 4},
 {'Red': 4, 'Blue': 3},
 {'Red': 1, 'Yellow': 2},
 {'Red': 2, 'Yellow': 2}]

#### Convert list of word count dictionaries into feature matrix

In [35]:
dictVectorizer.fit_transform(docWordCounts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

## 5.4 Imputing Missing Class Values

In [36]:
from sklearn.neighbors import KNeighborsClassifier

#### Create feature matrix with categorical feature

In [37]:
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

#### Create feature matrix with missing values in the categorical feature

In [38]:
X_withNan = np.array([[np.nan, 0.87, 1.31], 
                      [np.nan, -.67, -.22]])

#### Train KNN learner

In [39]:
# Fit a distance-weighted 3-nearest-neighbours classifier that predicts the
# first column of X (the categorical feature) from the remaining columns.
clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
trainedModel = clf.fit(X=X[:, 1:], y=X[:, 0])

#### Predict missing values' class

In [40]:
imputedValues = trainedModel.predict(X_withNan[:,1:])

In [41]:
imputedValues

array([0., 1.])

#### Join column of predicted class with their other features

In [42]:
X_withImputed = np.hstack((imputedValues.reshape(-1,1), X_withNan[:,1:]))

In [43]:
X_withImputed

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22]])

#### Join the two feature matrices

In [44]:
np.vstack((X_withImputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [46]:
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

#### Join the two feature matrices

In [47]:
X_complete = np.vstack((X_withNan, X))
X_complete

array([[  nan,  0.87,  1.31],
       [  nan, -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [50]:
# Impute missing entries with the most frequent value (strategy='most_frequent').
imputer = SimpleImputer(strategy='most_frequent') # no `axis` argument is passed to SimpleImputer

In [51]:
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 Handling Imbalanced Classes

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

#### Load iris data

In [63]:
iris = load_iris()

In [64]:
type(iris)

sklearn.utils.Bunch

#### Create feature matrix

In [65]:
features = iris.data

In [66]:
type(features)

numpy.ndarray

#### Create target vector

In [67]:
target = iris.target

In [68]:
type(target)

numpy.ndarray

#### Remove first 40 observations

In [69]:
# Slice from the untouched iris arrays rather than from `features`/`target`
# themselves, so re-running this cell does not drop another 40 rows.
features = iris.data[40:, :]
target = iris.target[40:]

#### Create binary target vector indicating if class 0

In [71]:
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [72]:
target = np.where((target == 0), 0, 1)

#### Look at the imbalanced target vector

In [73]:
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### Create weights for classes

In [74]:
weights = {0: .9, 1: .1}

#### Create random forest classifier with weights

In [75]:
RandomForestClassifier(class_weight=weights)

RandomForestClassifier(class_weight={0: 0.9, 1: 0.1})

Note: the text shows many other parameter options for `RandomForestClassifier`; only `class_weight` is set here.

In [76]:
RandomForestClassifier(class_weight='balanced')

RandomForestClassifier(class_weight='balanced')

### Downsampling

#### Indices of each class' observations

In [78]:
# Row positions of the class-0 and class-1 observations, in that order.
iClass0, iClass1 = (np.where(target == label)[0] for label in (0, 1))

#### Number of observations in each class

In [79]:
nClass0 = len(iClass0)
nClass1 = len(iClass1)

#### For every observation of class 0, randomly sample from class 1 without replacement

In [81]:
# Seed NumPy's global generator so the sampled indices are the same on every
# run of this cell; then draw one class-1 index per class-0 observation,
# without replacement.
np.random.seed(0)
iClass1_downsampled = np.random.choice(iClass1, size=nClass0, replace=False)

In [82]:
iClass1_downsampled

array([93, 28, 16, 61, 58, 59, 89, 86, 66, 62], dtype=int64)

#### Join together class 0's target vector with the downsampled class1's target vector

In [83]:
np.hstack((target[iClass0], target[iClass1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### Join together class 0's feature matrix with the downsampled class 1's feature matrix

In [84]:
np.vstack((features[iClass0,:], features[iClass1_downsampled,:]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

### Upsampling

#### For every observation in class 1, randomly sample from class 0 with replacement 

In [85]:
# Seed NumPy's global generator so the sampled indices are the same on every
# run of this cell; then draw one class-0 index per class-1 observation,
# with replacement (class 0 has fewer observations than class 1).
np.random.seed(0)
iClass0_upsampled = np.random.choice(iClass0, size=nClass1, replace=True)

#### Join together class 0's upsampled target vector with class 1's target vector

In [86]:
np.concatenate((target[iClass0_upsampled], target[iClass1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

#### Join together class 0's upsampled feature matrix with class 1's feature matrix

In [87]:
np.vstack((features[iClass0_upsampled,:], features[iClass1,:]))[0:5]

array([[4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.6, 0.2]])