# import preprocessing functions from sklearn

In [11]:
# preprocessing transformers (binarize, normalize, min-max scale, standardize), CSV loader and print formatting
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from pandas import read_csv
from numpy import set_printoptions

# read data

In [12]:
# CSV file to load and the column labels assigned to it on load.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]

# Column labels are supplied explicitly through `names`.
dataframe = read_csv(filename, names=names)

# NumPy array holding the table's values; the cells below slice it into
# input columns (0-7) and the output column (8).
array = dataframe.values

# display data

In [13]:
# show the loaded table; the rendered output elides the middle rows
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# binarization

In [14]:
# Separate the raw values into input and output components.
# The values are read from `dataframe` directly instead of the notebook-level
# name `array`: a later cell runs `from numpy import array`, which rebinds
# that name to the NumPy constructor, so indexing `array[...]` on a re-run
# of this cell would raise a TypeError.
raw_values = dataframe.values
X = raw_values[:, 0:8]   # input columns 'preg' .. 'age'
Y = raw_values[:, 8]     # output column 'class'

# Binarizer maps every value greater than the threshold to 1 and all other
# values to 0; the same threshold (50.0) is applied to every column.
binarizer = Binarizer(threshold=50.0).fit(X)
binaryX = binarizer.transform(X)

# summarize transformed data (first 10 rows)
set_printoptions(precision=3)
print(binaryX[0:10,:])

[[0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 1.]]


# normalization

In [15]:
# Separate the raw values into input and output components.
# The values are read from `dataframe` directly instead of the notebook-level
# name `array`: a later cell runs `from numpy import array`, which rebinds
# that name to the NumPy constructor, so indexing `array[...]` on a re-run
# of this cell would raise a TypeError.
raw_values = dataframe.values
X = raw_values[:, 0:8]   # input columns 'preg' .. 'age'
Y = raw_values[:, 8]     # output column 'class'

# Normalizer rescales each row (sample) independently to unit norm.
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)

# summarize transformed data (first 7 rows)
set_printoptions(precision=3)
print(normalizedX[0:7,:])

[[0.034 0.828 0.403 0.196 0.    0.188 0.004 0.28 ]
 [0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]
 [0.035 0.81  0.517 0.    0.    0.179 0.001 0.209]
 [0.022 0.566 0.363 0.232 0.638 0.225 0.002 0.189]]


# standardization

In [16]:
# Separate the raw values into input and output components.
# The values are read from `dataframe` directly instead of the notebook-level
# name `array`: a later cell runs `from numpy import array`, which rebinds
# that name to the NumPy constructor, so indexing `array[...]` on a re-run
# of this cell would raise a TypeError.
raw_values = dataframe.values
X = raw_values[:, 0:8]   # input columns 'preg' .. 'age'
Y = raw_values[:, 8]     # output column 'class'

# StandardScaler centers each column on its mean and scales it to unit variance.
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

# summarize transformed data (rows 1 through 5; row 0 is not printed)
set_printoptions(precision=3)
print(rescaledX[1:6,:])

[[-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.998 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.504 -1.505  0.907  0.766  1.41   5.485 -0.02 ]
 [ 0.343 -0.153  0.253 -1.288 -0.693 -0.811 -0.818 -0.276]]


# rescaling

In [17]:
# Separate the raw values into input and output components.
# The values are read from `dataframe` directly instead of the notebook-level
# name `array`: a later cell runs `from numpy import array`, which rebinds
# that name to the NumPy constructor, so indexing `array[...]` on a re-run
# of this cell would raise a TypeError.
raw_values = dataframe.values
X = raw_values[:, 0:8]   # input columns 'preg' .. 'age'
Y = raw_values[:, 8]     # output column 'class'

# MinMaxScaler maps each column linearly so that its minimum becomes 100
# and its maximum becomes 200.
scaler = MinMaxScaler(feature_range=(100, 200))
rescaledX = scaler.fit_transform(X)

# summarize transformed data (first 5 rows)
set_printoptions(precision=4)
print(rescaledX[0:5,:])

[[135.2941 174.3719 159.0164 135.3535 100.     150.0745 123.4415 148.3333]
 [105.8824 142.7136 154.0984 129.2929 100.     139.6423 111.6567 116.6667]
 [147.0588 191.9598 152.459  100.     100.     134.7243 125.3629 118.3333]
 [105.8824 144.7236 154.0984 123.2323 111.1111 141.8778 103.8002 100.    ]
 [100.     168.8442 132.7869 135.3535 119.8582 164.2325 194.3638 120.    ]]


# label encoding

In [18]:
# NOTE: importing `array` from numpy rebinds the notebook-level name `array`
# (assigned to `dataframe.values` in the data-loading cell) to the NumPy
# array constructor from this cell onward.
from numpy import argmax, array
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# example categorical data; some labels occur more than once
data = [
    'Tiger', 'Cheetah', 'Cougar', 'Leopard', 'Cheetah',
    'Cougar', 'Panther', 'Jaguar', 'Eurasian lynx', 'Tiger',
]
values = array(data)
print(values)

# learn the label -> integer mapping, then apply it to the same labels
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

['Tiger' 'Cheetah' 'Cougar' 'Leopard' 'Cheetah' 'Cougar' 'Panther'
 'Jaguar' 'Eurasian lynx' 'Tiger']
[6 0 1 4 0 1 5 3 2 6]


In [19]:
# fit a fresh encoder on the same labels and show the labels next to their codes
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit(values).transform(values)
print(values)
print(integer_encoded)

['Tiger' 'Cheetah' 'Cougar' 'Leopard' 'Cheetah' 'Cougar' 'Panther'
 'Jaguar' 'Eurasian lynx' 'Tiger']
[6 0 1 4 0 1 5 3 2 6]


# one hot encoding

In [20]:
# One-hot encode the integer codes into a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the
# old one on releases that predate the rename.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')

# the encoder takes 2-D input: one row per sample, a single column of codes
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(values)
print(onehot_encoded)

['Tiger' 'Cheetah' 'Cougar' 'Leopard' 'Cheetah' 'Cougar' 'Panther'
 'Jaguar' 'Eurasian lynx' 'Tiger']
[[0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]]


# inverse label encoding

In [6]:
# invert the fourth example (row index 3): the position of the 1 in its
# one-hot row is the integer code, which the label encoder maps back to a label
hot_index = argmax(onehot_encoded[3, :])
inverted = label_encoder.inverse_transform([hot_index])
print(values)
print(inverted)

['Tiger' 'Cheetah' 'Cougar' 'Leopard' 'Cheetah' 'Cougar' 'Panther'
 'Jaguar' 'Eurasian lynx' 'Tiger']
['Leopard']


In [7]:
# invert the first example (row index 0): the position of the 1 in its
# one-hot row is the integer code, which the label encoder maps back to a label
hot_index = argmax(onehot_encoded[0, :])
inverted = label_encoder.inverse_transform([hot_index])
print(values)
print(inverted)

['Tiger' 'Cheetah' 'Cougar' 'Leopard' 'Cheetah' 'Cougar' 'Panther'
 'Jaguar' 'Eurasian lynx' 'Tiger']
['Tiger']


In [8]:
# second example: label-encode a temperature category
# (the one-hot step for this data happens in a later cell)
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
values = array(data)
print(values)

# learn the label -> integer mapping, then apply it to the same labels
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[0 0 2 0 1 1 2 0 2 1]


# inverse one hot encoding

In [9]:
# One-hot encode the integer codes into a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the
# old one on releases that predate the rename.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')

# the encoder takes 2-D input: one row per sample, a single column of codes
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(values)
print(onehot_encoded)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [10]:
# invert the sixth example (row index 5): the position of the 1 in its
# one-hot row is the integer code, which the label encoder maps back to a label
hot_index = argmax(onehot_encoded[5, :])
inverted = label_encoder.inverse_transform([hot_index])
print(values)
print(inverted)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
['hot']
