In [43]:
import pandas as pd

# Toy dataset with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a nominal target column (classlabel).
records = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df = pd.DataFrame(records, columns=['color', 'size', 'price', 'classlabel'])

# One call emits the header, the frame and the trailing blank lines.
print('Original Dataset: ', df, '\n', sep='\n')

Original Dataset: 
   color size  price classlabel
0  green    M   10.1     class1
1    red    L   13.5     class2
2   blue   XL   15.3     class1




In [44]:
# Encode the ordinal `size` feature as integers that keep its natural
# ordering (M < L < XL).
size_mappings = dict(XL=3, L=2, M=1)

encoded_sizes = df['size'].map(size_mappings)
df['size'] = encoded_sizes

# One call emits the header, the frame and the trailing blank lines.
print('Dataset after converting ordinal feature size: ', df, '\n', sep='\n')

Dataset after converting ordinal feature size: 
   color  size  price classlabel
0  green     1   10.1     class1
1    red     2   13.5     class2
2   blue     3   15.3     class1




In [45]:
# Many machine learning libraries expect class labels to be integers.
# Most scikit-learn classifiers convert labels internally, but supplying
# integer arrays explicitly is considered good practice and avoids glitches.
# Class labels are nominal rather than ordinal, so it does not matter which
# integer a given string label receives; the labels are simply enumerated
# starting at 0.

from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
raw_class_labels = df['classlabel'].values
df['classlabel'] = class_le.fit_transform(raw_class_labels)

# One call emits the header, the frame and the trailing blank lines.
print('Dataset after converting nominal feature [ class label ]: ', df, '\n', sep='\n')

Dataset after converting nominal feature [ class label ]: 
   color  size  price  classlabel
0  green     1   10.1           0
1    red     2   13.5           1
2   blue     3   15.3           0




In [46]:
# It may appear that we could use a similar approach to transform the nominal
# color column of our dataset, as follows.
#
# A dedicated encoder is used for the colors: refitting `class_le` here would
# replace the class-label mapping it learned when encoding `classlabel`, so the
# integer class labels could no longer be decoded with it.

color_le = LabelEncoder()
df['color'] = color_le.fit_transform(df['color'].values)
print('Dataset after converting nominal feature [ color ]: ')
print(df)
print('\n')

Dataset after converting nominal feature [ color ]: 
   color  size  price  classlabel
0      1     1   10.1           0
1      2     2   13.5           1
2      0     3   15.3           0




In [47]:
# If we stop at this point and feed the array to our classifier, we will make one of the most common mistakes
# in dealing with categorical data. Can you spot the problem?
# Although the color values don't come in any particular order, a learning algorithm will now assume that
# green is larger than blue, and red is larger than green. Although this assumption is incorrect,
# the algorithm could still produce useful results. However, those results would not be optimal.
# A common workaround for this problem is to use a technique called one-hot encoding.
#
# Column selection is delegated to ColumnTransformer: OneHotEncoder's
# `categorical_features` argument was removed in scikit-learn 0.22 and its
# `sparse` argument was renamed to `sparse_output` in 1.2, so the former call
# OneHotEncoder(categorical_features=[0], sparse=False) fails on current releases.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `ohe` is now the fitted ColumnTransformer wrapping the encoder; it still
# exposes fit / transform / fit_transform.
ohe = ColumnTransformer(
    [('color_onehot', OneHotEncoder(), [0])],  # positional column 0 is `color`
    remainder='passthrough',  # size, price and classlabel are appended unchanged
    sparse_threshold=0,       # always return a dense array
)
ohe.fit_transform(df)

array([[  0. ,   1. ,   0. ,   1. ,  10.1,   0. ],
       [  0. ,   0. ,   1. ,   2. ,  13.5,   1. ],
       [  1. ,   0. ,   0. ,   3. ,  15.3,   0. ]])

In [53]:
# An even more convenient way to create those dummy features via one-hot encoding is to use
# the get_dummies function implemented in pandas. Applied on a DataFrame, get_dummies will only
# convert string columns and leave all other columns unchanged.
#
# `df` is rebuilt from the raw values because the preceding cells replaced its
# string columns with integer codes.
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']
])

df.columns = ['color', 'size', 'price', 'classlabel']

# dtype=int keeps the indicator columns as 0/1; pandas >= 2.0 would otherwise
# return boolean (True/False) columns.
color_size_dummies = pd.get_dummies(df[['price', 'color', 'size']], dtype=int)
color_size_dummies

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [60]:
# DictVectorizer from sklearn: rebuild the raw example frame, create a
# dense-output vectorizer and a small list of feature dicts to experiment with.
from sklearn.feature_extraction import DictVectorizer

raw_rows = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df = pd.DataFrame(raw_rows, columns=['color', 'size', 'price', 'classlabel'])

dv = DictVectorizer(sparse=False)
D = [
    {'foo': 1, 'bar': 2},
    {'foo': 3, 'baz': 1},
]