# scikit-learn provides many preprocessing utilities such as,

1.Standardization (mean removal)<br>
2.Scaling<br>
3.Normalization<br>
4.Binarization<br>
5.One Hot Encoding<br>
6.Label Encoding<br>
7.Imputation<br>

<font color="red"><b>Standardization:</b></font><br>
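Standardization, or mean removal, is the process of rescaling each feature so that it has mean 0 and variance 1.
It only shifts and scales the values; it does not change the shape of the distribution, so the result is not necessarily normally distributed.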


In [2]:
import sklearn.preprocessing as preprocessing
from sklearn.datasets import load_breast_cancer
import pandas as pd
# Load the breast-cancer dataset, then fit a StandardScaler on its feature
# matrix and use the fitted scaler to transform that same matrix.
breast_cancer = load_breast_cancer()
standardizer = preprocessing.StandardScaler().fit(breast_cancer.data)
breast_cancer_standardized = standardizer.transform(breast_cancer.data)

# Report the per-column mean and standard deviation of the transformed data.
print('Mean of each feature after Standardization :\n\n',
      breast_cancer_standardized.mean(axis=0), sep='\n')
print('\nStd. of each feature after Standardization :\n\n',
      breast_cancer_standardized.std(axis=0), sep='\n')

Mean of each feature after Standardization :


[-3.16286735e-15 -6.53060890e-15 -7.07889127e-16 -8.79983452e-16
  6.13217737e-15 -1.12036918e-15 -4.42138027e-16  9.73249991e-16
 -1.97167024e-15 -1.45363120e-15 -9.07641468e-16 -8.85349205e-16
  1.77367396e-15 -8.29155139e-16 -7.54180940e-16 -3.92187747e-16
  7.91789988e-16 -2.73946068e-16 -3.10823423e-16 -3.36676596e-16
 -2.33322442e-15  1.76367415e-15 -1.19802625e-15  5.04966114e-16
 -5.21317026e-15 -2.17478837e-15  6.85645643e-16 -1.41265636e-16
 -2.28956670e-15  2.57517109e-15]

Std. of each feature after Standardization :


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [42]:
import numpy as np
# Small 4x2 example so the effect of standardization can be checked by eye.
my_array = np.array([[10, 1], [20, 9], [30, 30], [40, 17]])
print(my_array)

# Build and fit the scaler in one expression, then transform the same array.
standardizer = preprocessing.StandardScaler().fit(my_array)
my_array_standardized = standardizer.transform(my_array)
my_array_standardized

[[10  1]
 [20  9]
 [30 30]
 [40 17]]


array([[-1.34164079, -1.23725106],
       [-0.4472136 , -0.49023155],
       [ 0.4472136 ,  1.47069466],
       [ 1.34164079,  0.25678796]])

<font color="red"><b>Scaling</b></font>
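<br>Scaling transforms existing data values so that they lie between a minimum and a maximum value.
    <br><b>MinMaxScaler</b> rescales each feature to a given range, which is [0, 1] by default.
    <br><b>MaxAbsScaler</b> divides each feature by its maximum absolute value, so the results lie in the range [-1, 1].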

In [4]:
# Configure a scaler that maps every feature onto the range [0, 10],
# fit it on the dataset, and rescale the dataset with it.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
min_max_scaler.fit(breast_cancer.data)
breast_cancer_minmaxscaled10 = min_max_scaler.transform(breast_cancer.data)

In [5]:
# Display the dataset after min-max scaling with feature_range=(0, 10).
breast_cancer_minmaxscaled10

array([[5.21037437, 0.22658099, 5.45988529, ..., 9.12027491, 5.98462448,
        4.18863964],
       [6.43144493, 2.72573554, 6.15783291, ..., 6.39175258, 2.33589592,
        2.22878132],
       [6.01495575, 3.90260399, 5.95743211, ..., 8.35051546, 4.03705894,
        2.13433032],
       ...,
       [4.55251077, 6.21237741, 4.45788128, ..., 4.87285223, 1.28720678,
        1.51908697],
       [6.44564343, 6.63510315, 6.65537972, ..., 9.10652921, 4.97141731,
        4.52315361],
       [0.36868759, 5.01521813, 0.28539838, ..., 0.        , 2.57441356,
        1.00682146]])

In [7]:
# Fit a MaxAbsScaler on the dataset, rescale the dataset with it,
# and display the result.
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit(breast_cancer.data)
breast_cancer_maxabsscaled = max_abs_scaler.transform(breast_cancer.data)
breast_cancer_maxabsscaled

array([[0.63998577, 0.26425662, 0.65145889, ..., 0.91202749, 0.69313046,
        0.57301205],
       [0.73176805, 0.45239308, 0.70503979, ..., 0.63917526, 0.41428141,
        0.42901205],
       [0.70046247, 0.54098778, 0.68965517, ..., 0.83505155, 0.54429045,
        0.42207229],
       ...,
       [0.59053718, 0.71486762, 0.57453581, ..., 0.48728522, 0.33413679,
        0.37686747],
       [0.73283529, 0.74669043, 0.74323607, ..., 0.91065292, 0.6156975 ,
        0.59759036],
       [0.27605834, 0.62474542, 0.25421751, ..., 0.        , 0.43250979,
        0.33922892]])

In [48]:
# Toy 4x2 array (second column contains a negative value) rescaled
# with feature_range=(0, 10).
my_array = np.array([[10, -10], [20, 9], [30, 30], [40, 17]])
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
min_max_scaler.fit(my_array)
my_array_minmaxscaled10 = min_max_scaler.transform(my_array)
my_array_minmaxscaled10

array([[ 0.        ,  0.        ],
       [ 3.33333333,  4.75      ],
       [ 6.66666667, 10.        ],
       [10.        ,  6.75      ]])

In [47]:
# Same toy array, this time passed through a MaxAbsScaler.
my_array = np.array([[10, -10], [20, 9], [30, 30], [40, 17]])
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit(my_array)
my_array_maxabsscaled = max_abs_scaler.transform(my_array)
my_array_maxabsscaled

array([[ 0.25      , -0.33333333],
       [ 0.5       ,  0.3       ],
       [ 0.75      ,  1.        ],
       [ 1.        ,  0.56666667]])

<font color="red"><b>Normalization</b></font><br>
Normalization scales each sample to have a unit norm.<br>
Normalization can be achieved with 'l1', 'l2', and 'max' norms.<br>
The 'l1' norm makes the absolute values of each row sum to 1, and the 'l2' norm makes the squares of each row sum to 1.<br>
The 'l1' norm is less sensitive to outliers than the 'l2' norm.<br>
The 'l2' norm is used by default, so removing outliers is recommended before applying it.<br>

In [10]:
# Normalize every sample (row) of the dataset using the 'l1' norm
# and display the result.
normalizer = preprocessing.Normalizer(norm='l1')
normalizer.fit(breast_cancer.data)
breast_cancer_normalized = normalizer.transform(breast_cancer.data)
breast_cancer_normalized

array([[5.04461573e-03, 2.91067878e-03, 3.44346198e-02, ...,
        7.44214015e-05, 1.29017660e-04, 3.33410122e-05],
       [5.49864230e-03, 4.75016401e-03, 3.55259874e-02, ...,
        4.97203436e-05, 7.35112606e-05, 2.37962634e-05],
       [5.81273050e-03, 6.27326171e-03, 3.83776011e-02, ...,
        7.17365928e-05, 1.06660210e-04, 2.58546946e-05],
       ...,
       [7.00344278e-03, 1.18467875e-02, 4.56911357e-02, ...,
        5.98245895e-05, 9.35761210e-05, 3.29921220e-05],
       [5.68390968e-03, 8.09267334e-03, 3.86561042e-02, ...,
        7.31182555e-05, 1.12767664e-04, 3.42138252e-05],
       [1.18802525e-02, 3.75697675e-02, 7.33636209e-02, ...,
        0.00000000e+00, 4.39538722e-04, 1.07764300e-04]])

In [73]:
# Toy 4x2 array normalized row by row using the 'l2' norm.
my_array = np.array([[10, -10], [20, 9], [30, 30], [40, 17]])
normalizer = preprocessing.Normalizer(norm='l2')
normalizer.fit(my_array)
my_array_normalized = normalizer.transform(my_array)
my_array_normalized

array([[ 0.70710678, -0.70710678],
       [ 0.91192151,  0.41036468],
       [ 0.70710678,  0.70710678],
       [ 0.92033092,  0.39114064]])

<font color="red"><b>Binarization</b></font><br>
Binarization is the process of transforming data points to 0 or 1 based on a given threshold.<br>

Any value above the threshold is transformed to 1, and any value less than or equal to the threshold is transformed to 0.
By default, a threshold of 0 is used.

In [12]:
# Binarize the dataset with a threshold of 3.0 and print the
# top-left 5x5 corner of the result.
binarizer = preprocessing.Binarizer(threshold=3.0)
binarizer.fit(breast_cancer.data)
breast_cancer_binarized = binarizer.transform(breast_cancer.data)
print(breast_cancer_binarized[:5, :5])

[[1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]]


In [57]:
# Column vector holding 1..6, binarized with a threshold of 3.0.
my_array = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1)
binarizer = preprocessing.Binarizer(threshold=3.0)
binarizer.fit(my_array)
my_array_binarized = binarizer.transform(my_array)
my_array_binarized

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1]])

<font color="red"><b>OneHotEncoder</b></font><br>
OneHotEncoder converts categorical integer values into one-hot vectors. In a one-hot vector, every category is transformed into a binary attribute having only 0 and 1 values.

In [18]:
# Fit the encoder on a single integer-valued categorical column.
onehotencoder = preprocessing.OneHotEncoder(categories='auto').fit(
    [[1], [1], [1], [2], [3], [4]]
)

# Print the one-hot vectors for the category values 1, 2 and 3.
for category in (1, 2, 3):
    print(onehotencoder.transform([[category]]).toarray())

[[1. 0. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 1. 0.]]


<font color="red"><b>Label Encoding</b></font><br>

Label Encoding is a step in which categorical values are represented as integers. An example of transforming the categorical values ["benign", "malignant"] into [0, 1] is shown below.

In [24]:
# Sample string labels used to fit the encoder.
labels = ['malignant', 'benign', 'malignant', 'benign']

# Build and fit the encoder in one expression.
labelencoder = preprocessing.LabelEncoder().fit(labels)

# Encode the dataset's class names with the fitted encoder and display them.
bc_labelencoded = labelencoder.transform(breast_cancer.target_names)
bc_labelencoded

array([1, 0], dtype=int64)

<font color="red"><b>Imputation </b></font><br>
Imputation replaces missing values with the median, the mean, or the most common value of the column or row in which the missing values exist.
The example below replaces missing values, represented by np.nan, with the mean of the respective column (axis 0).

In [30]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: entries equal to np.nan are replaced by the mean of
# their column.
# The `verbose` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises a TypeError.
# Its old default was 0, so omitting it does not change behaviour.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(breast_cancer.data)
breast_cancer_imputed = imputer.transform(breast_cancer.data)
breast_cancer_imputed

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [60]:
# Column vector with one missing entry, filled with the column mean.
# `verbose` is not passed: it was deprecated in scikit-learn 1.1 and removed
# in 1.3 (its old default was 0, so behaviour is unchanged).
my_array = np.array([1, 2, 3, np.nan, 4, 5, 6]).reshape(-1, 1)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(my_array)
my_array_imputed = imputer.transform(my_array)
my_array_imputed

array([[1. ],
       [2. ],
       [3. ],
       [3.5],
       [4. ],
       [5. ],
       [6. ]])

In [61]:
# Column vector with one missing entry, filled with the most frequent value
# of the column.
# `verbose` is not passed: it was deprecated in scikit-learn 1.1 and removed
# in 1.3 (its old default was 0, so behaviour is unchanged).
my_array = np.array([1, 2, 3, np.nan, 6, 5, 6]).reshape(-1, 1)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(my_array)
my_array_imputed = imputer.transform(my_array)
my_array_imputed

array([[1.],
       [2.],
       [3.],
       [6.],
       [6.],
       [5.],
       [6.]])

In [63]:
# Column vector with one missing entry, filled with the constant fill_value 0;
# the result is flattened with ravel() for a compact display.
# `verbose` is not passed: it was deprecated in scikit-learn 1.1 and removed
# in 1.3 (its old default was 0, so behaviour is unchanged).
my_array = np.array([1, 2, 3, np.nan, 6, 5, 6]).reshape(-1, 1)
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer = imputer.fit(my_array)
my_array_imputed = imputer.transform(my_array)
my_array_imputed.ravel()

array([1., 2., 3., 0., 6., 5., 6.])