# Standardizing

In [4]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [31]:
x = np.array([[5.,10.,15.],[5.,0.,-10.],[15.,2.,8.],[11.,5.,-2.]])
x

array([[  5.,  10.,  15.],
       [  5.,   0., -10.],
       [ 15.,   2.,   8.],
       [ 11.,   5.,  -2.]])

In [32]:
x.shape

(4, 3)

In [58]:
# Standardize each column (feature) of x; the next two cells check the
# resulting per-column mean (~0) and standard deviation (1).
x_scaled = preprocessing.scale(x)
x_scaled

array([[-0.94280904,  1.52656362,  1.28635955],
       [-0.94280904, -1.12832963, -1.33886402],
       [ 1.41421356, -0.59735098,  0.55129695],
       [ 0.47140452,  0.19911699, -0.49879248]])

In [59]:
x_scaled.mean(axis=0)

array([-1.38777878e-17, -2.08166817e-17, -1.38777878e-17])

In [60]:
x_scaled.std(axis=0)

array([1., 1., 1.])

In [61]:
# The same scaling is available through the transformer-style API (fit / transform), which is what pipelines expect.

In [92]:
# Same standardization through an estimator object: fit() learns the
# per-feature statistics from x (see scaler.mean_ below) and transform()
# applies them.
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [63]:
scaler.mean_   #Feature wise means computed in scaler class

array([9.  , 4.25, 2.75])

In [64]:
scaler.transform(x)

array([[-0.94280904,  1.52656362,  1.28635955],
       [-0.94280904, -1.12832963, -1.33886402],
       [ 1.41421356, -0.59735098,  0.55129695],
       [ 0.47140452,  0.19911699, -0.49879248]])

In [68]:
# Scaling features to a range using the MinMaxScaler and MaxAbsScaler.
# MinMaxScaler rescales each feature to lie between a given min and max (0 to 1 in this example).

In [66]:
x

array([[  5.,  10.,  15.],
       [  5.,   0., -10.],
       [ 15.,   2.,   8.],
       [ 11.,   5.,  -2.]])

In [86]:
# Fit a min-max scaler targeting the range (0, 1); fit() records the
# per-feature min, max and range of x that are inspected in the cells below.
minmaxscale = preprocessing.MinMaxScaler(feature_range=(0,1))
minmaxscale.fit(x)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [87]:
minmaxscale.data_min_

array([  5.,   0., -10.])

In [88]:
minmaxscale.data_max_

array([15., 10., 15.])

In [89]:
minmaxscale.data_range_

array([10., 10., 25.])

In [90]:
minmaxscale.feature_range

(0, 1)

In [93]:
minmaxscale.transform(x)

array([[0.  , 1.  , 1.  ],
       [0.  , 0.  , 0.  ],
       [1.  , 0.2 , 0.72],
       [0.6 , 0.5 , 0.32]])

In [102]:
# The min-max scaling formula written out by hand, for comparison with MinMaxScaler
def min_max_scaler(x, mn=0, mx=1):
    """Rescale each column of ``x`` linearly into the range ``[mn, mx]``.

    Each feature (column) is mapped so that its smallest value becomes
    ``mn`` and its largest value becomes ``mx``::

        x_std    = (x - min) / (max - min)
        x_scaled = x_std * (mx - mn) + mn

    Parameters
    ----------
    x : array-like
        Input data; statistics are taken along axis 0 (per column).
    mn : number, default 0
        Lower bound of the target range.
    mx : number, default 1
        Upper bound of the target range.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` with every column rescaled.
    """
    x = np.asarray(x)
    col_min = x.min(axis=0)
    col_range = x.max(axis=0) - col_min
    # A constant column has zero range; dividing by it would give 0/0 = NaN.
    # Use 1 instead so such a column maps to `mn` (its numerator is all
    # zeros), which is how sklearn's MinMaxScaler treats constant features.
    col_range = np.where(col_range == 0, 1, col_range)
    x_std = (x - col_min) / col_range
    return x_std * (mx - mn) + mn

In [103]:
min_max_scaler(x, 0, 1)

array([[0.  , 1.  , 1.  ],
       [0.  , 0.  , 0.  ],
       [1.  , 0.2 , 0.72],
       [0.6 , 0.5 , 0.32]])

In [105]:
# Fit a max-abs scaler; fit() records each feature's largest absolute value
# (exposed as max_abs_ below).
maxabs = preprocessing.MaxAbsScaler()
maxabs.fit(x)

MaxAbsScaler(copy=True)

In [112]:
# Works similarly to the min-max scaler, but scales each feature into the range [-1, 1]
# by dividing it by the maximum absolute value of that feature (no shifting/centering).
# Meant for data that is already centered around zero, or for sparse data.

In [110]:
maxabs.max_abs_

array([15., 10., 15.])

In [109]:
maxabs.transform(x)

array([[ 0.33333333,  1.        ,  1.        ],
       [ 0.33333333,  0.        , -0.66666667],
       [ 1.        ,  0.2       ,  0.53333333],
       [ 0.73333333,  0.5       , -0.13333333]])

MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, and are the recommended way to go about this. However, scale and StandardScaler can accept scipy.sparse matrices as input, as long as with_mean=False is explicitly passed to the constructor. Otherwise a ValueError will be raised, because silently centering the data would break its sparsity and would often crash the execution by unintentionally allocating excessive amounts of memory.