# 5.1 Encoding Nomial Categories Features

In [58]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
# Create feature
feature = np.array([['Texas'], ['California'], ['Texas'], 
                    ['Delaware'], ['Texas']])
# Create one-hot encoder
one_hot = LabelBinarizer()
# One-hot encode feature
one_hot.fit_transform(feature)
print(pd.DataFrame(one_hot.fit_transform(feature)))
# View feature classes
one_hot.classes_
# Reverse one-hot encoding
one_hot.inverse_transform(one_hot.transform(feature))

   0  1  2
0  0  0  1
1  1  0  0
2  0  0  1
3  0  1  0
4  0  0  1


array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

Using pandas

In [59]:
import pandas as pd
# Create dummy variables from feature
pd.get_dummies(feature[:,0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


Multiple classes

In [60]:
# Create multiclass feature
multiclass_feature = [('Texas', 'Florida'),
                      ('California', 'Alabama'),
                      ('Texas', 'Florida'),
                      ('Delware', 'Florida'),
                      ('Texas', 'Alabama')]
# Create multiclass one-hot encoder
one_hot_multiclass = MultiLabelBinarizer()
# One-hot encode multiclass feature
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

# 5-2 Encoding Ordinal Categorical Features

In [61]:
import pandas as pd
# Create features
dataframe = pd.DataFrame({'Score': ['Low', 'Low', 'Medium', 'High']})
# Create mapper
scaler_mapper = {'Low':1, 'Medium':2, 'High':3}
# Replace feature values with scale
dataframe['Score'].replace(scaler_mapper)

0    1
1    1
2    2
3    3
Name: Score, dtype: int64

In [62]:
# Create features
dataframe = pd.DataFrame({'Score': ['Low', 'Low', 
                                    'Medium', 'Medium',
                                    'High', 'Barely More than Medium']})
# Create mapper
scaler_mapper = {'Low':1, 'Medium':2, 
                 'Barely More than Medium':3, 'High':4}
# Replace feature values with scale
dataframe['Score'].replace(scaler_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

# 5.3 Encoding Dictionaries of Features

Convert a dictionary into a feature matrix.

In [63]:
from sklearn.feature_extraction import DictVectorizer
# Create dictionary
data_dict = [{'Red': 2, 'Blue': 4},
             {'Red': 4, 'Blue': 3},
             {'Red': 1, 'Yellow': 2},
             {'Red': 2, 'Yellow': 2}]
# Create dictionary vectorizer
dictvectorizer = DictVectorizer(sparse=False)
# Convert dictionary to feature matrix
features = dictvectorizer.fit_transform(data_dict)
# Get feature names
feature_names = dictvectorizer.get_feature_names()
feature_names

['Blue', 'Red', 'Yellow']

In [64]:
import pandas as pd
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [65]:
# Create word counts dictionaries for four documents
doc_1_word_count = {'Red': 2, 'Blue': 4}
doc_2_word_count = {'Red': 4, 'Blue': 3}
doc_3_word_count = {'Red': 1, 'Yellow': 2}
doc_4_word_count = {'Red': 2, 'Yellow': 2}
# Create list
doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]
# Convert list of word count dictionaries into feature matrix
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

# 5-4 Imputing Missing Class Values

A categorical feature containing missing values that you want to replace with predicted values.
- By training a ML classifier algorithm to predict the missing values.
- KNN

In [66]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Create feature matrix with categorical feature
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])
# Create feature matrix with missing values in the categorical feature
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])
# Train KNN learner
clf = KNeighborsClassifier(3, weights='distance')
trained_model = clf.fit(X[:,1:], X[:,0])
# Predict missing values' class
imputed_values = trained_model.predict(X_with_nan[:,1:])
# Join column of predicted class with their other features
X_with_imputed = np.hstack((imputed_values.reshape(-1,1), 
                            X_with_nan[:,1:]))
# Join two feature matrices
np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

Alternative solution - fill in missing values with feature's most frequent value

In [67]:
from sklearn.preprocessing import Imputer
# Join two feature matrices
X_complete = np.vstack((X_with_nan, X))
imputer = Imputer(strategy='most_frequent', axis=0)
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

# 5.5 Handling Imbalanced Classes

In [68]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load iris data
iris = load_iris()
# Create feature matrix
features = iris.data
# Create target vector
target = iris.target
# Remove first 40 observation
features = features[40:,:]
target = target[40:]
# Create binary target vector indicating if class 0
target = np.where((target == 0), 0, 1)
# Load at the imbalanced target vector
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [69]:
# Create weights
weights = {0: .9, 1: 0.1}
# Create random forest classifier with weights
RandomForestClassifier(class_weight=weights)
RandomForestClassifier(bootstrap=True, class_weight={0: .9, 1: 0.1},
                      criterion='gini', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0,
                      min_impurity_split=None, 
                      min_samples_leaf=1,
                      min_samples_split=2, 
                      min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=1,
                      oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [70]:
# Train a random forest with balanced class weights
RandomForestClassifier(class_weight='balanced')
RandomForestClassifier(bootstrap=True, class_weight={0: .9, 1: 0.1},
                      criterion='gini', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0,
                      min_impurity_split=None, 
                      min_samples_leaf=1,
                      min_samples_split=2, 
                      min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=1,
                      oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [71]:
# Indices of each class' observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]
# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# For every observation of class 0, 
# randomly sample from class1 without replacement
i_class1_downsampled = np.random.choice(i_class1, 
                                        size=n_class0, replace=False)

# Join together class 0's target vector 
# with the downsampled class1' target vector
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [72]:
# Join together class 0's unsampled feature matrix with class 1's
np.vstack((features[i_class0_unsampled,:], features[i_class1,:]))[0:5]

array([[4.5, 2.3, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6],
       [5. , 3.5, 1.6, 0.6],
       [4.4, 3.2, 1.3, 0.2]])

In [73]:
# For every observation in class1, 
# randomly sample from class0 with replacement
i_class0_unsampled = np.random.choice(i_class0, size=n_class1, 
                                      replace=True)

# Join together class 0's unsampled target vector with class 1's
np.concatenate((target[i_class0_unsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [74]:
# Join together class 0's unsampled feature matrix with class 1's
np.vstack((features[i_class0_unsampled,:], features[i_class1,:]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [5. , 3.5, 1.6, 0.6],
       [4.8, 3. , 1.4, 0.3],
       [4.8, 3. , 1.4, 0.3],
       [5. , 3.3, 1.4, 0.2]])