In [1]:
import pandas as pd
import scipy as cp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Load the red-wine quality data set from the UCI repository. The file uses
# ';' rather than ',' as its field delimiter.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)


In [6]:
# Preview the first five rows to confirm the columns were parsed correctly.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# Pull the frame's contents out as a 2-D NumPy array so that the columns can
# be sliced by position below.
array = df.to_numpy()

In [9]:
# Separate the array column-wise into input and output components: the first
# eight columns become the inputs, every remaining column becomes the outputs.
# NOTE(review): df has twelve columns and `quality` is the last one, so with
# this split pH, sulphates and alcohol end up in `y` rather than in `x`.
# Confirm whether the boundary was meant to be the final column instead.
x = array[:, :8]
y = array[:, 8:]

#### Why Rescale Data?

When attributes are measured on very different scales, it helps to rescale each of them into the range 0 to 1; this is often called (min-max) normalization.

In [10]:
# Scaler that maps each column linearly onto the interval [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

In [11]:
# Learn each column's minimum and maximum from x, then rescale x with them.
rescaledX = scaler.fit(x).transform(x)

In [12]:
# Print floats with three decimal places in every array shown from here on.
# This only changes how NumPy displays arrays; the stored values are untouched.
np.set_printoptions(precision=3)

In [13]:
# Show the first five rescaled rows (all columns).
rescaledX[:5]

array([[0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568],
       [0.283, 0.521, 0.   , 0.116, 0.144, 0.338, 0.216, 0.494],
       [0.283, 0.438, 0.04 , 0.096, 0.134, 0.197, 0.17 , 0.509],
       [0.584, 0.11 , 0.56 , 0.068, 0.105, 0.225, 0.191, 0.582],
       [0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568]])

`rescaledX` now contains values between 0 and 1. Rescaling is useful for neural networks, for optimization algorithms, and for algorithms that use distance measures (like k-nearest neighbors) or that weight their inputs (like regression).


#### Standardizing Data

Standardization takes attributes that follow a Gaussian distribution with differing means and standard deviations and transforms them to a standard Gaussian distribution with a mean of 0 and a standard deviation of 1.

In [14]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Estimate each column's mean and standard deviation from x, then use them
# to centre and scale x.
# Note: this rebinds `scaler` and `rescaledX`, replacing the min-max versions.
scaler = StandardScaler()
scaler.fit(x)
rescaledX = scaler.transform(x)

In [16]:
# Show the first five standardized rows (all columns).
rescaledX[:5]

array([[-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558],
       [-0.299,  1.967, -1.391,  0.043,  0.224,  0.873,  0.624,  0.028],
       [-0.299,  1.297, -1.186, -0.169,  0.096, -0.084,  0.229,  0.134],
       [ 1.655, -1.384,  1.484, -0.453, -0.265,  0.108,  0.412,  0.664],
       [-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558]])

#### Normalizing Data

We rescale each observation (row) so that it has a length of 1 (a unit norm).


In [19]:
from sklearn.preprocessing import Normalizer

In [20]:
# Normalizer rescales each row independently. fit() is still called so the
# cell follows the same fit/transform workflow as the other scalers.
scaler = Normalizer()
scaler.fit(x)
normalizedX = scaler.transform(x)

In [21]:
# Show the first five unit-norm rows (all columns).
normalizedX[:5]

array([[2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02],
       [1.083e-01, 1.222e-02, 0.000e+00, 3.611e-02, 1.361e-03, 3.472e-01,
        9.306e-01, 1.385e-02],
       [1.377e-01, 1.342e-02, 7.061e-04, 4.060e-02, 1.624e-03, 2.648e-01,
        9.533e-01, 1.760e-02],
       [1.767e-01, 4.416e-03, 8.833e-03, 2.997e-02, 1.183e-03, 2.681e-01,
        9.464e-01, 1.574e-02],
       [2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02]])

#### Binarizing Data

Using a binary threshold, it is possible to transform our data by making the values above it 1 and those equal to or below it, 0.


In [22]:
from sklearn.preprocessing import Binarizer

In [23]:
# Threshold at 0.0: entries strictly greater than zero become 1, all others 0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(x)
binaryX = binarizer.transform(x)

In [24]:
# Show the first five binarized rows (all columns).
binaryX[:5]

array([[1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.]])

This sets every value less than or equal to 0 to 0 and every other value to 1. This comes in handy when you want to turn probabilities into crisp values.


#### Mean Removal

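We can remove the mean from each feature to center it on zero. The `scale` function used below also divides each feature by its standard deviation, so every column ends up with unit variance.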


In [25]:
from sklearn.preprocessing import scale

In [26]:
# Standardize every column of the whole frame (the quality column included):
# subtract the column mean and divide by the column standard deviation.
# The keyword arguments spell out the function's defaults.
data_standardized = scale(df, axis=0, with_mean=True, with_std=True)

In [27]:
# Column means after scaling; each should be zero up to rounding error.
np.mean(data_standardized, axis=0)

array([ 3.555e-16,  1.733e-16, -8.887e-17, -1.244e-16,  3.733e-16,
       -6.221e-17,  4.444e-17, -3.473e-14,  2.862e-15,  6.754e-16,
        1.066e-16,  8.887e-17])

In [28]:
# Column standard deviations after scaling; each should equal one.
np.std(data_standardized, axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])