# Encoding Categorical Data

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [8]:
# load the dataset
dataset = pd.read_csv('../datasets/breast-cancer.csv', header=None)
data = dataset.values

# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

print("Input: ", X.shape)
print("Output: ", y.shape)

Input:  (286, 9)
Output:  (286,)


## Ordinal encoder

In [18]:
# load the dataset
dataset = pd.read_csv('../datasets/breast-cancer.csv', header=None)
data = dataset.values

# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

ordinal_encoder = OrdinalEncoder()
X = ordinal_encoder.fit_transform(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print(X)
print(y)

[[0. 1. 2. ... 2. 0. 2.]
 [0. 2. 2. ... 1. 1. 5.]
 [0. 2. 2. ... 1. 0. 2.]
 ...
 [1. 4. 0. ... 0. 1. 3.]
 [1. 2. 0. ... 2. 0. 2.]
 [1. 3. 0. ... 2. 0. 2.]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1
 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1
 1 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 1
 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0]


## Evaluate Logistic regression on the breast dataset with an ordinal encoding

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# load the dataset
dataset = pd.read_csv('../datasets/breast-cancer.csv', header=None)
data = dataset.values

# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1)

# ordinal encode input variables
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=100)
ordinal_encoder.fit(X_train)
X_train = ordinal_encoder.transform(X_train)
X_test  = ordinal_encoder.transform(X_test)

# ordinal encode output variables
label_encoder = LabelEncoder()
label_encoder.fit_transform(y)

print('Input', X.shape)
print(X[:5, :])
print('Output', y.shape)
print(y[:5])

Input (286, 9)
[['no-recurrence-events' '30-39' 'premeno' '30-34' '0-2' 'no' '3' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'right_up']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'right'
  'left_up']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '2' 'right'
  'right_low']]
Output (286,)
['no' 'no' 'no' 'no' 'no']
