# Scale, Standardize & Normalize

### Many machine learning algorithms work better when features are on a relatively similar scale and close to normally distributed 

In [1]:
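# MinMaxScaler   - rescales each feature to a fixed range (default [0, 1])
# RobustScaler   - centers each feature on its median and scales by the interquartile range
# StandardScaler - centers each feature to zero mean and scales to unit variance
# Normalizer     - rescales each sample (row) to unit norm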

# Scaling

In [2]:
# Scale generally means to change the range of the values. The shape of the distribution doesn’t change.

In [3]:
from sklearn import preprocessing
import numpy as np
from sklearn import preprocessing
import warnings
import pandas as pd
import os
# NOTE: silencing every warning also hides the RuntimeWarnings that np.log can
# emit for zero/negative inputs in the log cells of this notebook.
warnings.filterwarnings('ignore')

# Directory that holds heart.csv. Override it with the HEART_DATA_DIR
# environment variable; the fallback is the original author's local folder.
DATA_DIR = os.environ.get('HEART_DATA_DIR',
                          'C:/Users/saravana.ayyappa/Desktop/Machine Learning')

# Build the path explicitly instead of calling os.chdir(): changing the
# process-wide working directory is a hidden side effect on every later cell.
data = pd.read_csv(os.path.join(DATA_DIR, 'heart.csv'))

# Toy training matrix: three samples (rows) by three features (columns).
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# Standardize column by column (axis=0 is the default, spelled out here).
X_scaled = preprocessing.scale(X_train, axis=0)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [4]:
# Learn each column's min and max from the training data first ...
mm_scaler = preprocessing.MinMaxScaler().fit(X_train)
# ... then map the training data onto the default [0, 1] range.
X_train_minmax = mm_scaler.transform(X_train)
X_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [5]:
# A single unseen sample, kept 2-D (1 row x 3 features) as the scaler expects.
X_test = np.array([-3.0, -1.0, 4.0]).reshape(1, -1)

# Reuse the min/max learned from X_train; test values outside the training
# range land outside [0, 1].
X_test_minmax = mm_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [6]:
# Robust scaling per column (axis=0 is the default, spelled out here):
# subtract the median and divide by the interquartile range.
rbust = preprocessing.robust_scale(X_train, axis=0)
rbust

array([[ 0.        , -1.        ,  1.33333333],
       [ 1.        ,  0.        ,  0.        ],
       [-1.        ,  1.        , -0.66666667]])

In [7]:
# Scale the test sample with the median/IQR learned from X_train.
# Calling robust_scale(X_test) directly re-estimates those statistics from the
# single test row itself, which always collapses it to zeros and ignores the
# training scale entirely.
robust = preprocessing.RobustScaler().fit(X_train).transform(X_test)
robust

array([[-4.        , -1.        ,  2.66666667]])

In [8]:
# Min-max scale the 'age' column onto [0, 1] (the default range, spelled out)
# and store the result as a new 'Scaled' column.
data['Scaled'] = preprocessing.minmax_scale(data['age'], feature_range=(0, 1))
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Scaled
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,0.479167
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0.5
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,0.854167
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,0.666667
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,0.6875


# Standardize

In [9]:
# Standardize generally means changing the values so that the distribution has a mean of zero and a standard deviation of one.

In [10]:
# Estimate per-column mean and standard deviation from the training data.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# Apply them to the same data.
scaler.transform(X_train)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [11]:
# Fit on the training data only ...
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# ... and reuse those training statistics to standardize the unseen sample.
scaler.transform(X_test)

array([[-4.89897949, -1.22474487,  2.93987366]])

# Normalize

In [12]:
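# Normalize can be used to mean either of the above things (and more!).
# In scikit-learn, preprocessing.normalize rescales each sample (row) to unit norm (L2 by default).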

In [13]:
# Rescale every row to unit L2 norm (norm='l2', axis=1 are the defaults,
# spelled out here).
norm = preprocessing.normalize(X_train, norm='l2', axis=1)
norm

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [14]:
# Row-wise L2 normalization needs no fitted statistics, so the test sample
# can be normalized on its own (defaults spelled out).
normal = preprocessing.normalize(X_test, norm='l2', axis=1)
normal

array([[-0.58834841, -0.19611614,  0.78446454]])

# Log and Antilog

In [15]:
# Element-wise natural log of the training matrix.
# Zeros come out as -inf and negative entries as nan.
logvalue = np.log(X_train)

# Show the input and its log, separated by three newlines.
print(X_train, '\n\n\n', logvalue)

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]] 


 [[ 0.                 nan  0.69314718]
 [ 0.69314718        -inf        -inf]
 [       -inf  0.                 nan]]


In [16]:
# Element-wise natural log of the test sample; its negative entries become nan.
logvalues = np.log(X_test)

# Show the input and its log, separated by three newlines.
print(X_test, '\n\n\n', logvalues)

[[-3. -1.  4.]] 


 [[        nan         nan  1.38629436]]


In [17]:
# exp() undoes log() for the positive entries. -inf maps back to 0, but nan
# stays nan, so the original negative values are not recovered.
antilog = np.exp(logvalue)

# Show the original matrix next to the round-tripped one.
print(X_train, '\n\n\n', antilog)

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]] 


 [[  1.  nan   2.]
 [  2.   0.   0.]
 [  0.   1.  nan]]


In [18]:
# Round-trip the test sample through exp(); entries that were nan after the
# log stay nan.
antilog = np.exp(logvalues)

# Show the round-tripped values first, then the original test sample.
print(antilog, '\n\n\n', X_test)

[[ nan  nan   4.]] 


 [[-3. -1.  4.]]


In [19]:
# Preview the first five rows (n=5 is the default, spelled out here).
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Scaled
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,0.479167
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0.5
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,0.854167
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,0.666667
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,0.6875


In [20]:
# Natural log of the 'age' column, stored as a new 'age_log' column.
data['age_log'] = data['age'].apply(np.log)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Scaled,age_log
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,0.479167,3.951244
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0.5,3.970292
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,0.854167,4.248495
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,0.666667,4.110874
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,0.6875,4.127134


In [21]:
# Exponentiate 'age_log' to get back to the original age scale, stored as a
# new 'antilog' column.
data['antilog'] = data['age_log'].apply(np.exp)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Scaled,age_log,antilog
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,0.479167,3.951244,52.0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0.5,3.970292,53.0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,0.854167,4.248495,70.0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,0.666667,4.110874,61.0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,0.6875,4.127134,62.0
