In [1]:
#### Standardization, or mean removal and variance scaling
### Standard normally distributed data: Gaussian with zero mean and unit variance.
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

scaler.mean_

scaler.scale_

X_scaled = scaler.transform(X_train)
X_scaled

X_scaled.mean(axis=0)

X_scaled.std(axis=0)

array([1., 1., 1.])

In [2]:
## Scalers, Transformers, and Normalizers compute the mean and standard deviation on a training set 
## so as to be able to later re-apply the same transformation on the testing set. 


# This class is hence suitable for use in the early steps of a Pipeline:


from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # apply scaling on training data

pipe.score(X_test, y_test)  # apply scaling on testing data, without leaking training data.

0.96

In [3]:
### Scaling features to a range
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax


array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [4]:
# The same scaling and shifting operations will be applied to be consistent 
# with the transformation performed on the train data:
X_test = np.array([[-3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [5]:
#It is possible to introspect the scaler attributes 
# to find about the exact nature of the transformation learned on the training data:
min_max_scaler.scale_

min_max_scaler.min_

array([0.        , 0.5       , 0.33333333])

In [6]:
# If MinMaxScaler is given an explicit feature_range=(min, max) the full formula is:
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (X.max(axis=0) - X.min(axis=0)) + X.min(axis=0)
print(f"MinMax Standard Deviation of X:\n{X_std}\n\nMinMaxed X:\n{X_scaled}")    

MinMax Standard Deviation of X:
[[0.12381866 0.52439337 0.4864274  ... 0.49285474 0.67906612 0.21207425]
 [0.84375859 0.64635665 0.5435557  ... 0.82812533 0.64916728 0.42922402]
 [0.55742129 0.70255081 0.81972143 ... 0.79152641 0.42087826 0.34748043]
 ...
 [0.51666315 0.20845695 0.60900112 ... 0.49782028 0.43780975 0.39628525]
 [0.73933131 0.7513872  0.46276529 ... 0.49859413 0.28347444 0.24459206]
 [0.52328794 0.77176729 0.53917657 ... 0.75291143 0.58023005 0.34009484]]

MinMaxed X:
[[-2.02514259  0.0291022  -0.47494531 ... -0.33450124  0.86575519
  -1.20029641]
 [ 1.61371127  0.65992405 -0.15005559 ...  1.37570681  0.70117274
  -0.2975635 ]
 [ 0.16645221  0.95057302  1.42050425 ...  1.18901653 -0.55547712
  -0.63738713]
 ...
 [-0.03955515 -1.60499282  0.22213377 ... -0.30917212 -0.46227529
  -0.43449623]
 [ 1.08589557  1.2031659  -0.6095122  ... -0.3052247  -1.31183623
  -1.06511366]
 [-0.00607091  1.30857636 -0.17495976 ...  0.99204235  0.32169781
  -0.66809045]]


In [7]:
## MaxAbsScaler works in a very similar fashion, 
## but scales in a way that the training data lies within the range [-1, 1] 
## by dividing through the largest maximum value in each feature. 
## It is meant for data that is already centered at zero or sparse data.
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs
max_abs_scaler.scale_

array([2., 1., 2.])

# Scaling sparse data
Centering sparse data would destroy the sparseness structure in the data, 
and thus rarely is a sensible thing to do. 
However, it can make sense to scale sparse inputs, 
especially if features are on different scales.

MaxAbsScaler was specifically designed for scaling sparse data, 
and is the recommended way to go about this. 
However, StandardScaler can accept scipy.sparse matrices as input, 
as long as with_mean=False is explicitly passed to the constructor.

### RobustScaler cannot be fitted to sparse inputs




# Scaling data with outliers
Scaling using the mean and variance of the data is likely to not work very well. 
In these cases, you can use RobustScaler as a drop-in replacement instead.



# Scaling vs Whitening

It is sometimes not enough to center and scale the features independently, 
since a downstream model can further make some assumption 
on the linear independence of the features.
To address this issue you can use PCA 
with whiten=True to further remove the linear correlation across features




# Centering kernel matrices
If you have a kernel matrix of a kernel K
that computes a dot product in a feature space (possibly implicitly) 
a KernelCenterer can transform the kernel matrix 
so that it contains inner products in the feature space
followed by the removal of the mean in that space

# Non-linear transformation

Mapping to a Uniform distribution-basic quantile transformation
QuantileTransformer provides a non-parametric transformation 
to map the data to a uniform distribution with values between 0 and 1

In [8]:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) # Raw data
np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) # Approximated to Quantile!

np.percentile(X_test[:, 0], [0, 25, 50, 75, 100])
np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100])



array([0.01351351, 0.25      , 0.47747748, 0.60472973, 0.94144144])

# Mapping to a Gaussian distribution
Power transforms are a family of parametric, monotonic transformations 
that aim to map data from any distribution 
to as close to a Gaussian distribution as possible 
in order to stabilize variance and minimize skewness.

## PowerTransformer currently provides two such power transformations, 
the Yeo-Johnson transform and the Box-Cox transform.
Box-Cox can only be applied to strictly positive data. 
In both methods, the transformation is parameterized by lambda,
which is determined through maximum likelihood estimation

In [9]:
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
X_lognormal # Only has positive values
pt.fit_transform(X_lognormal) # Now there are negative values also!

array([[ 0.49024349,  0.17881995, -0.1563781 ],
       [-0.05102892,  0.58863195, -0.57612415],
       [ 0.69420009, -0.84857823,  0.10051454]])

In [10]:
### To map data to a normal distribution using QuantileTransformer,
## set output_distribution='normal'. 
quantile_transformer = preprocessing.QuantileTransformer(
    output_distribution='normal', random_state=0)
X_trans = quantile_transformer.fit_transform(X)
quantile_transformer.quantiles_



array([[4.3, 2. , 1. , 0.1],
       [4.4, 2.2, 1.1, 0.1],
       [4.4, 2.2, 1.2, 0.1],
       [4.4, 2.2, 1.2, 0.1],
       [4.5, 2.3, 1.3, 0.1],
       [4.6, 2.3, 1.3, 0.2],
       [4.6, 2.3, 1.3, 0.2],
       [4.6, 2.3, 1.3, 0.2],
       [4.6, 2.4, 1.3, 0.2],
       [4.7, 2.4, 1.3, 0.2],
       [4.7, 2.4, 1.3, 0.2],
       [4.8, 2.5, 1.4, 0.2],
       [4.8, 2.5, 1.4, 0.2],
       [4.8, 2.5, 1.4, 0.2],
       [4.8, 2.5, 1.4, 0.2],
       [4.8, 2.5, 1.4, 0.2],
       [4.9, 2.5, 1.4, 0.2],
       [4.9, 2.5, 1.4, 0.2],
       [4.9, 2.5, 1.4, 0.2],
       [4.9, 2.6, 1.4, 0.2],
       [4.9, 2.6, 1.4, 0.2],
       [4.9, 2.6, 1.4, 0.2],
       [5. , 2.6, 1.4, 0.2],
       [5. , 2.6, 1.4, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5. , 2.7, 1.5, 0.2],
       [5.1, 2.7, 1.5, 0.2],
       [5.1, 2.8, 1.5, 0.2],
       [5.1, 2

# Normalization
Normalization is the process of scaling individual samples to have unit norm. 
This process can be useful if you plan to use a quadratic form 
such as the dot-product or any other kernel 
to quantify the similarity of any pair of samples.

This assumption is the base of the Vector Space Model 
often used in text classification and clustering contexts.

In [11]:
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
X_normalized = preprocessing.normalize(X, norm='l2')

X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [12]:
### a utility class Normalizer that implements the same operation 
## using the Transformer API (even though the fit method is useless in this case: 
## the class is stateless as this operation treats samples independently).
normalizer = preprocessing.Normalizer().fit(X)  # fit does nothing
normalizer

normalizer.transform(X)

normalizer.transform([[-1.,  1., 0.]])
## This class is suitable for use in the early steps of a Pipeline


array([[-0.70710678,  0.70710678,  0.        ]])

normalize and Normalizer accept both dense array-like and sparse matrices 
from scipy.sparse as input.
For sparse input the data is converted to the Compressed Sparse Rows 
(see scipy.sparse.csr_matrix) before being fed to efficient Cython routines

# Encoding categorical features 
To convert categorical features to such integer codes, 
we can use the OrdinalEncoder. 
This estimator transforms each categorical feature 
to one new feature of integers (0 to n_categories - 1)

In [13]:
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from US', 'uses Safari']])
# these expect continuous input, 
# and would interpret the categories as being ordered, which is often not desired

array([[0., 1., 1.]])

In [14]:
# By default, OrdinalEncoder will also passthrough missing values 
# that are indicated by np.nan.
enc = preprocessing.OrdinalEncoder()
X = [['male'], ['female'], [np.nan], ['female']]
enc.fit_transform(X)


array([[ 1.],
       [ 0.],
       [nan],
       [ 0.]])

In [15]:
### OrdinalEncoder provides a parameter encoded_missing_value=
# to encode the missing values without the need to create a pipeline 
# and using SimpleImputer.
enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
X = [['male'], ['female'], [np.nan], ['female']]
enc.fit_transform(X)


array([[ 1.],
       [ 0.],
       [-1.],
       [ 0.]])

In [16]:
### Above codes are equivalent to below.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
enc = Pipeline(steps=[
    ("encoder", preprocessing.OrdinalEncoder()),
    ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
])
enc.fit_transform(X)


array([[ 1.],
       [ 0.],
       [-1.],
       [ 0.]])

# The OneHotEncoder 
which transforms each categorical feature 
with n_categories possible values into n_categories binary features, 
with one of them 1, and all others 0.

enc = preprocessing.OneHotEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from US', 'uses Safari'],
               ['male', 'from Europe', 'uses Safari']]).toarray()


In [22]:
## By default, the values each feature can take 
## is inferred automatically from the dataset 
### and can be found in the categories_ attribute:
print(enc.categories_)

#### It is possible to specify this explicitly using the parameter categories=
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
# Note that for there are missing categorical values for the 2nd and 3rd
# feature
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()

[array(['female', 'male'], dtype=object), array(['from Africa', 'from Asia', 'from Europe', 'from US'], dtype=object), array(['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari'],
      dtype=object)]


array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

If there is a possibility that 
the training data might have missing categorical features, 
it can often be better to specify handle_unknown='infrequent_if_exist' 
instead of setting the categories manually as above. 
When handle_unknown='infrequent_if_exist' is specified 
and unknown categories are encountered during transform, 
no error will be raised but the resulting one-hot encoded columns 
for this feature will be all zeros 
or considered as an infrequent category if enabled. 
#### (handle_unknown='infrequent_if_exist' is only supported for one-hot encoding)

In [23]:
enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist')
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()

array([[1., 0., 0., 0., 0., 0.]])

In [24]:
### To encode each column into n_categories - 1 columns 
### instead of n_categories columns by using the drop= parameter. 
### This parameter allows the user to specify a category 
### for each feature to be dropped.
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
drop_enc = preprocessing.OneHotEncoder(drop='first').fit(X)
drop_enc.categories_
drop_enc.transform(X).toarray()

array([[1., 1., 1.],
       [0., 0., 0.]])

In [25]:
### To drop one of the two columns only for features with 2 categories, 
### you can set the parameter drop='if_binary'.
X = [['male', 'US', 'Safari'],
     ['female', 'Europe', 'Firefox'],
     ['female', 'Asia', 'Chrome']]
drop_enc = preprocessing.OneHotEncoder(drop='if_binary').fit(X)
drop_enc.categories_
drop_enc.transform(X).toarray()

array([[1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0.]])

In [26]:
## When handle_unknown='ignore' and drop is not None, 
## unknown categories will be encoded as all zeros
drop_enc = preprocessing.OneHotEncoder(drop='first',
                                       handle_unknown='ignore').fit(X)
X_test = [['unknown', 'America', 'IE']]
drop_enc.transform(X_test).toarray()



array([[0., 0., 0., 0., 0.]])

In [27]:
### OneHotEncoder.inverse_transform 
### will map all zeros to the dropped category 
### if a category is dropped and None if a category is not dropped
drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse_output=False,
                                       handle_unknown='ignore').fit(X)
X_test = [['unknown', 'America', 'IE']]
X_trans = drop_enc.transform(X_test)
X_trans
drop_enc.inverse_transform(X_trans)



array([['female', None, None]], dtype=object)

In [28]:
#### OneHotEncoder supports categorical features with missing values 
#### by considering the missing values as an additional category:
X = [['male', 'Safari'],
     ['female', None],
     [np.nan, 'Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X)
enc.categories_
enc.transform(X).toarray()

array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.]])

In [29]:
## If a feature contains both np.nan and None, 
## they will be considered separate categories:    
X = [['Safari'], [None], [np.nan], ['Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X)
enc.categories_
enc.transform(X).toarray()


array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

min_frequency= is either an integer greater or equal to 1, or a float in the interval (0.0, 1.0). 
If min_frequency is an integer, categories with a cardinality smaller than min_frequency will be considered infrequent. 

If min_frequency is a float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. 
The default value is 1, which means every category is encoded separately.

In [30]:

X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +
              ['snake'] * 3], dtype=object).T
enc = preprocessing.OrdinalEncoder(min_frequency=6).fit(X)
enc.infrequent_categories_
enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']]))


array([[2.],
       [0.],
       [1.],
       [2.]])