In [1]:
!pip install numpy
!pip install scikit-learn



In [2]:
import numpy as np
from sklearn import preprocessing

In [4]:
# Sample matrix used by every preprocessing demo below.
# The scikit-learn calls in this notebook treat each row as one sample
# and each column as one feature.
sampleRows = [
    [2.1, -1.9, 5.5],
    [-1.5, 2.4, 3.5],
    [0.5, -7.9, 5.6],
    [5.9, 2.3, -5.8],
]
inputData = np.array(sampleRows)

print(inputData)


[[ 2.1 -1.9  5.5]
 [-1.5  2.4  3.5]
 [ 0.5 -7.9  5.6]
 [ 5.9  2.3 -5.8]]


In [5]:
# Binarize the data against a threshold of 0.5:
# entries strictly greater than the threshold become 1, everything else
# (including a value equal to the threshold) becomes 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
dataBinarized = binarizer.transform(inputData)
print("\nBinarized Data:\n", dataBinarized)


Binarized Data:
 [[1. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [1. 1. 0.]]


In [6]:
# axis=0 aggregates down the rows, yielding one statistic per column (feature);
# axis=1 would instead yield one statistic per row (sample).
# The standard deviation uses NumPy's default ddof=0 (population form).
columnMeans = np.mean(inputData, axis=0)
columnStdDevs = np.std(inputData, axis=0)
print("Mean = ", columnMeans)
print("Std. Dev. = ", columnStdDevs)

Mean =  [ 1.75  -1.275  2.2  ]
Std. Dev. =  [2.71431391 4.20022321 4.69414529]


In [8]:
# Features can live on very different numeric ranges, which lets a feature
# dominate (or vanish) purely because of its units.
# Min-max scaling rescales each column linearly so that its minimum maps to 0.5
# and its maximum maps to 2 (the chosen feature_range).
dataScalerMinMax = preprocessing.MinMaxScaler(feature_range=(0.5, 2))
dataScaledMinMax = dataScalerMinMax.fit(inputData).transform(inputData)
print("\nMin max scaled data:\n", dataScaledMinMax)


Min max scaled data:
 [[1.22972973 1.37378641 1.98684211]
 [0.5        2.         1.72368421]
 [0.90540541 0.5        2.        ]
 [2.         1.98543689 0.5       ]]


In [9]:
# L1 normalization puts every sample vector on a common scale.
# It works per ROW (axis=1, the default): each value is divided by the sum of the
# absolute values in its own row, so the absolute values of every row add up to 1.
# Each result is that value's signed share of its row's total magnitude.

# Normalize data
dataNormalized = preprocessing.normalize(inputData, norm='l1', axis=1)
print("\nL1 Normalized data:\n", dataNormalized)


L1 Normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


In [11]:
# L2 normalization: each row is divided by its Euclidean norm
# (the square root of the sum of its squared values), so every row ends up
# with unit length. Note: this reuses the name `dataNormalized`, replacing
# the L1 result computed earlier.

dataNormalized = preprocessing.normalize(inputData, norm='l2', axis=1)
print("\nL2 normalized data:\n", dataNormalized)


L2 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


In [15]:
# Label encoding (NOT one-hot encoding): every distinct label of a qualitative
# variable is mapped to a single integer code. The codes follow the sorted
# order of the labels, so they carry no meaning beyond identifying the class.
encoder = preprocessing.LabelEncoder()
inputLabels = ['red', 'black', 'red', 'green', 'black', 'yellow', 'white']
encoder.fit(inputLabels)  # learns the set of distinct labels; nothing is transformed yet

In [16]:
# Encode a set of labels.
# Check that the fitted LabelEncoder maps known labels to their integer codes.
testLabels = ['green', 'red', 'black']
encodedValues = encoder.transform(testLabels)
print("\nLabels = ", testLabels)
# .tolist() yields plain Python ints; list(ndarray) keeps NumPy scalars, which
# print as np.int64(1) under NumPy >= 2 instead of the plain 1 recorded below.
print("Encoded values = ", encodedValues.tolist())


Labels =  ['green', 'red', 'black']
Encoded values =  [1, 2, 0]


In [17]:
# Checking the performance by decoding a randomly ordered list of integers
# We will map the integers to the labels we created in block 16
encodedValues = [1,2,0]
decodedList = encoder.inverse_transform(encodedValues)
print("\nEncoded values =", decodedList)


Encoded values = ['green' 'red' 'black']


In [19]:
# Second example: fit a fresh encoder on quality-rating labels (not colors).
# This rebinds `inputLabels` and `encoder`, replacing the color encoder above.
# NOTE: LabelEncoder assigns codes in sorted (alphabetical) label order, so the
# integers do NOT follow the ranking Worst < Worse < Bad < Good < VeryGood < Excellent.
encoder = preprocessing.LabelEncoder()
inputLabels = ['Bad', 'Worse', 'Worst', 'Good', 'VeryGood', 'Excellent']
encoder.fit(inputLabels)

In [20]:
# Encode a subset of the rating labels with the encoder fitted on the ratings.
testLabels = ['Bad', 'Worse', 'Good', 'VeryGood']
encodedValues = encoder.transform(testLabels)
print("\nLabels = ", testLabels)
# .tolist() yields plain Python ints; list(ndarray) keeps NumPy scalars, which
# print as np.int64(0) under NumPy >= 2 instead of the plain 0 recorded below.
print("Encoded values = ", encodedValues.tolist())


Labels =  ['Bad', 'Worse', 'Good', 'VeryGood']
Encoded values =  [0, 4, 2, 3]
