In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Introduction to Scikit-Learn 

### Transforming Data

In [6]:
# Report the interpreter and scikit-learn versions this notebook runs with.
import sys
import sklearn

print("Python version:", sys.version)
print("scikit-learn version:", sklearn.__version__)

Python version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
scikit-learn version: 0.18.1


## Scikit-Learn has a few well-known datasets

http://scikit-learn.org/stable/datasets/index.html


- load_boston    (regression)
- load_iris    (classification)
- load_breast_cancer (classification)
- load_diabetes  (regression)
- load_digits (classification)
   
- make_blobs (clustering): http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs


### Transformation  

#### Feature scaling

- Standardization (scaling): centre each feature at mean 0 with standard deviation 1

- Min-Max scaling, aka "normalisation": rescale each feature to lie between 0 and 1

In [11]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and hold part of it out for testing; the fixed
# random_state makes the split repeatable from run to run.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=123,
)

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit a standardizing scaler on the iris training split only. The keyword
# arguments spell out the default settings echoed in this cell's output.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
# Rescale the training data with the already-fitted scaler; transform() only
# applies the scaler to the data it is given, it does not fit it again.
X_train_scaled = scaler.transform(X_train)

In [14]:
# Per-feature (column-wise, axis=0) statistics of the scaled training data
print("mean : {} ".format(X_train_scaled.mean(axis=0)))
print("standard deviation : {} ".format(X_train_scaled.std(axis=0)))

mean : [ -6.23509181e-15  -1.90323947e-16   2.77555756e-17  -6.74063979e-17] 
standard deviation : [ 1.  1.  1.  1.] 


In [15]:
# Per-feature (column-wise, axis=0) statistics of the original, unscaled
# training data, for comparison with the scaled values
print("mean : {} ".format(X_train.mean(axis=0)))
print("standard deviation : {} ".format(X_train.std(axis=0)))

mean : [ 5.86428571  3.04464286  3.83035714  1.2375    ] 
standard deviation : [ 0.78602267  0.41744103  1.70826976  0.7415145 ] 


In [16]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer data set.
# NOTE: from this cell on, X_train / X_test / y_train / y_test refer to the
# breast cancer data and replace the iris split created earlier.
cancer = load_breast_cancer()

# random_state=0 matches the split that the SVC cell below trains on, so the
# shapes printed here describe the data that is actually modelled.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)
print(X_train.shape)
print(X_test.shape)

(426, 30)
(143, 30)


In [17]:
from sklearn.svm import SVC

# Split with a fixed seed so this cell gives the same result on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Baseline: an SVM trained on the raw, unscaled features.
# kernel and gamma are pinned to the defaults of the scikit-learn release
# this notebook was run with (0.18.1). Later releases changed the default
# gamma from "auto" to "scale", which can change this baseline score.
svm = SVC(C=100, kernel="rbf", gamma="auto")
svm.fit(X_train, y_train)
print("Test set accuracy: {:.2f}".format(svm.score(X_test, y_test)))

Test set accuracy: 0.63


In [18]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler; feature_range=(0, 1) is the default, stated explicitly here
scaler = MinMaxScaler(feature_range=(0, 1))

In [19]:
# Min-max (0-1) preprocessing: the scaler is fitted on the training split
# only, and the same fitted scaler is then applied to the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit the existing SVM, this time on the scaled training data
svm.fit(X_train_scaled, y_train)

# Evaluate on the test split scaled with the training-set scaler
print("Scaled test set accuracy: {:.2f}".format(svm.score(X_test_scaled, y_test)))

Scaled test set accuracy: 0.97


In [20]:
# Standardization preprocessing (zero mean, unit variance): the scaler is
# fitted on the training split only and then reused for the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit the existing SVM on the standardized training data
svm.fit(X_train_scaled, y_train)

# Evaluate on the standardized test split
print("SVM test accuracy: {:.2f}".format(svm.score(X_test_scaled, y_test)))

SVM test accuracy: 0.96
