# Centering and Scaling Data



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the white-wine dataset from the relative data directory.
df = pd.read_csv('../data/white-wine.csv')
# Preview ten randomly drawn rows; no random_state is passed, so the rows shown differ between runs.
df.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
2826,6.5,0.22,0.45,8.0,0.053,52.0,196.0,0.9959,3.23,0.48,9.1,6
4632,6.3,0.3,0.91,8.2,0.034,50.0,199.0,0.99394,3.39,0.49,11.7,6
1483,6.9,0.25,0.24,3.6,0.057,13.0,85.0,0.9942,2.99,0.48,9.5,4
2806,7.5,0.17,0.71,11.8,0.038,52.0,148.0,0.99801,3.03,0.46,8.9,5
4467,6.3,0.25,0.27,6.6,0.054,40.0,158.0,0.99378,3.2,0.48,10.3,5
4072,7.5,0.16,0.38,12.7,0.043,70.5,163.0,0.99706,3.15,0.82,10.4,7
2360,8.1,0.25,0.34,10.1,0.05,30.0,121.0,0.99724,3.17,0.49,10.1,6
152,6.9,0.25,0.3,4.1,0.054,23.0,116.0,0.994,2.99,0.38,9.4,6
1120,6.7,0.105,0.32,12.4,0.051,34.0,106.0,0.998,3.54,0.45,9.2,6
2029,7.6,0.34,0.39,7.6,0.04,45.0,215.0,0.9965,3.11,0.53,9.2,6


In [4]:
# Summarise the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [7]:
# Descriptive statistics, transposed so that each column of df becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,4898.0,6.854788,0.843868,3.8,6.3,6.8,7.3,14.2
volatile acidity,4898.0,0.278241,0.100795,0.08,0.21,0.26,0.32,1.1
citric acid,4898.0,0.334192,0.12102,0.0,0.27,0.32,0.39,1.66
residual sugar,4898.0,6.391415,5.072058,0.6,1.7,5.2,9.9,65.8
chlorides,4898.0,0.045772,0.021848,0.009,0.036,0.043,0.05,0.346
free sulfur dioxide,4898.0,35.308085,17.007137,2.0,23.0,34.0,46.0,289.0
total sulfur dioxide,4898.0,138.360657,42.498065,9.0,108.0,134.0,167.0,440.0
density,4898.0,0.994027,0.002991,0.98711,0.991723,0.99374,0.9961,1.03898
pH,4898.0,3.188267,0.151001,2.72,3.09,3.18,3.28,3.82
sulphates,4898.0,0.489847,0.114126,0.22,0.41,0.47,0.55,1.08


There is a large difference in scale:

- between different properties, e.g. the max of `citric acid` is `1.66`, while the max of `total sulfur dioxide` is `440`. We want to scale these.
- between the `min` and `max` of some properties, e.g. `total sulfur dioxide` ranges from `9.0` to `440.0`. We want to center these.

Features on larger scales can unduly influence the model, e.g. `knn` uses distance to compute predictions.

We want features to be on a similar scale, so we normalize them (scaling and centering).

In [8]:
# List the distinct values found in the `quality` column.
# NOTE(review): open question — `quality` is categorical, so it may need
# dummy-variable encoding; confirm before using it as a feature.
df.quality.unique()

array([6, 5, 7, 8, 4, 3, 9])

In [14]:
# Number of rows recorded for each quality score.
df.groupby('quality')['quality'].count()

quality
3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: quality, dtype: int64

Target of `quality`, ranges from `3` to `9`.

There are several ways to normalize data:
    
**Standardization** 

- given a column, we can subtract the mean and divide by the standard deviation for each value. All features would be centered around `0` and have a variance of `1`.

**Other Approaches**

- given a column, we can subtract the minimum and divide by the range for each value. All features would have a minimum of `0` and a maximum of `1`.

- we can normalize the data so that it ranges from `-1` to `1`.

We'll use **standardization** using the `scale` function in sklearn.

In [19]:
from sklearn.preprocessing import scale

# Target vector is the `quality` column; the feature matrix is every other column.
y = df['quality'].values
X = df.drop('quality', axis='columns').values

print(X.shape)
print(y.shape)

# Standardize the feature matrix with sklearn's `scale`.
X_scaled = scale(X)

# Overall mean and standard deviation, before and after scaling.
print(X.mean(), X.std())
print(X_scaled.mean(), X_scaled.std())

(4898, 11)
(4898,)
18.432687072460002 41.54494764094571
2.7314972981668206e-15 0.9999999999999999


We can also use a scaler in a pipeline object, using sklearn's `StandardScaler` transformer.

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Chain the scaler and the k-nearest-neighbours classifier into one estimator.
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# `fit` returns the pipeline itself, so knn_scaled and pipeline are the same object.
knn_scaled = pipeline.fit(X_train, y_train)

# Predict on the held-out rows and report the accuracy.
y_pred = knn_scaled.predict(X_test)
print('knn with scaling:', accuracy_score(y_test, y_pred))

knn with scaling: 0.5642857142857143


In [29]:
# Baseline for comparison: the same classifier fitted on the raw, unscaled features.
knn_unscaled = KNeighborsClassifier()
knn_unscaled.fit(X_train, y_train)

y_pred_unscaled = knn_unscaled.predict(X_test)
print('knn without scaling', accuracy_score(y_test, y_pred_unscaled))

knn without scaling 0.47959183673469385


### Using Cross-Validation and Scaling in a Pipeline

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Build a fresh scaler + knn pipeline in this cell so the grid search does not
# depend on a pipeline object left over from an earlier cell.
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# Candidate values for k, addressed as <step name>__<parameter name>.
parameters = {'knn__n_neighbors': np.arange(1, 50)}

# Same 80/20 split and seed as the earlier cells, repeated so this cell is self-contained.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# 5-fold cross-validated search over k on the training set, then predict on the test set
pipeline_cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
pipeline_cv.fit(X_train, y_train)
y_pred = pipeline_cv.predict(X_test)

# print the best k, the test-set score and the per-class report
print(pipeline_cv.best_params_)
print(pipeline_cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

{'knn__n_neighbors': 1}
0.636734693877551
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.39      0.39      0.39        28
           5       0.65      0.64      0.64       289
           6       0.68      0.67      0.68       447
           7       0.64      0.60      0.62       177
           8       0.38      0.53      0.44        38
           9       0.00      0.00      0.00         0

   micro avg       0.64      0.64      0.64       980
   macro avg       0.39      0.41      0.40       980
weighted avg       0.64      0.64      0.64       980



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
