# 1- Loading an example dataset 

In [2]:
# Load the two example datasets used throughout this notebook.
import sklearn.datasets as datasets

digits = datasets.load_digits()
iris = datasets.load_iris()

In [5]:
# Display the `data` attribute of the digits dataset (shown as a 2D float array).
digits.data

array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]])

In [7]:
# Display the `target` attribute of the digits dataset (the integer labels).
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [9]:
# Display the first entry of `digits.images` (rendered as an 8x8 array).
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

# 2- Learning and predicting 

- In scikit-learn, an **estimator** for classification is a Python object that implements the methods *fit(X, y)* and *predict(T)*

In [10]:
# Treat the estimator as a black box for now: just build it with
# hand-picked values for its two hyper-parameters.
from sklearn.svm import SVC
clf = SVC(C=100, gamma=0.001)

**Choosing the parameters of the model**

- In this example, we set the values of `gamma` and `C` manually. To find good values for these parameters, we can use tools such as
    * *grid search*
    * *cross validation*.


In [11]:
# Fit on every sample except the last one (`[:-1]` drops the final row/label).
clf.fit(digits.data[:-1], digits.target[:-1])

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict the label of the last sample only; the `[-1:]` slice keeps the
# input two-dimensional (one row) rather than returning a single 1D row.
clf.predict(digits.data[-1:])

array([8])

# 3- Model persistence

- It is possible to save a model in scikit-learn by using Python’s built-in persistence model, namely **pickle**:

In [22]:
# Train an SVC with default settings on the iris dataset.
from sklearn import datasets, svm

iris = datasets.load_iris()
X = iris.data
y = iris.target

clf = svm.SVC()
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
# Round-trip the fitted classifier through an in-memory pickle byte string.
import pickle

s = pickle.dumps(clf)
clf2 = pickle.loads(s)

# Print the restored model's prediction for the first sample next to its
# true label.
first_prediction = clf2.predict(X[:1])
print(first_prediction, y[0])

[0] 0


- In the specific case of scikit-learn, it may be preferable to use joblib’s replacement for pickle (`joblib.dump` & `joblib.load`), which is more efficient on **big data** but can only pickle to disk, not to a string:

In [26]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so importing it fails on current releases. Import the standalone
# `joblib` package instead, and fall back to the old vendored location only
# on legacy installs where standalone joblib is not available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier to disk; `dump` returns the list of file
# names it wrote, which is what the cell displays.
joblib.dump(clf, 'filename.joblib')

['filename.joblib']

- Later, you can reload the pickled model (possibly in another Python process) with:

In [27]:
# Reload the persisted model from disk and rebind `clf` to it.
# joblib is a pickle replacement, so only load files from a trusted source.
clf = joblib.load('filename.joblib')

# 4- Conventions

### *- Type casting*

In [40]:
import numpy as np
from sklearn import random_projection

# Seeded generator so the sample matrix is reproducible.
rng = np.random.RandomState(0)

# rand() yields float64 values; print the dtype before and after the cast.
X = rng.rand(10, 2000)
print(X.dtype)

# Down-cast a copy of the matrix to single precision.
X = X.astype('float32')
print(X.dtype)

float64
float32


In [42]:
# Project the float32 matrix with a default GaussianRandomProjection and
# inspect the dtype of the transformed result.
# NOTE(review): the recorded output shows float64; newer scikit-learn
# releases may preserve float32 here — confirm against the installed version.
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.dtype

dtype('float64')

- In this example, X is *float32*, which is cast to *float64* by **fit_transform(X)**

> **Regression targets** are cast to *float64* and **classification targets** are *maintained*

In [47]:
from sklearn import datasets
from sklearn.svm import SVC

# Fit a default SVC on iris, using the integer class ids as targets.
iris = datasets.load_iris()

clf = SVC()
clf.fit(X=iris.data, y=iris.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
# Predict the first three samples; the model was fit on integer targets,
# so the predictions come back as integers.
list(clf.predict(iris.data[:3]))

[0, 0, 0]

In [49]:
# Refit the same estimator on string targets: index the class-name array
# with the integer ids to get one name per sample.
species_names = iris.target_names[iris.target]
clf.fit(iris.data, species_names)

# Predictions now come back as class names instead of integers.
list(clf.predict(iris.data[:3]))

['setosa', 'setosa', 'setosa']

### *- Refitting and updating parameters*

In [54]:
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)

# Random features, random binary labels, and five random test rows.
X = rng.rand(100, 10)
y = rng.binomial(n=1, p=0.5, size=100)
X_test = rng.rand(5, 10)

clf = SVC()

# set_params sets the parameters of this estimator; here it switches the
# kernel to linear before fitting.
clf.set_params(kernel='linear')
clf.fit(X, y)
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [56]:
# Reconfigure the same estimator with an RBF kernel and refit it: calling
# fit again replaces what the previous fit learned.
clf.set_params(kernel='rbf', gamma='auto')
clf.fit(X, y)
clf.predict(X_test)

array([0, 0, 0, 1, 0])

### *- Multiclass vs. multilabel fitting*

In [57]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import SVC

# Five 2-feature samples with a 1d array of multiclass labels.
X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
y = [0, 0, 1, 1, 2]

# Wrap a seeded SVC in a one-vs-rest meta-classifier, fit it on the 1d
# labels, and predict back on the training samples.
base_svc = SVC(random_state=0)
classif = OneVsRestClassifier(estimator=base_svc)
classif.fit(X, y)
classif.predict(X)

array([0, 0, 1, 1, 2])

In [58]:
# Replace the 1d labels with their 2d binary indicator form, then refit.
# NOTE: this overwrites `y`, so the cell is not idempotent — re-run the cell
# that defines the 1d `y` before running this one again.
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(y)
classif.fit(X, y)
classif.predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

- Here, the classifier is *fit()* on a 2d binary label representation of y, using the **LabelBinarizer**. In this case *predict()* returns a 2d array representing the corresponding multilabel predictions.

- Note that the *fourth* and *fifth* instances returned all zeroes, indicating that they matched none of the three labels fit upon. 
 With multilabel outputs, it is similarly possible for an instance to be assigned multiple labels.

In [59]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each sample now carries a list of labels rather than a single one.
y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]

# Binarize the label lists into a 2d indicator matrix, then refit and
# predict on the training samples.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(y)
classif.fit(X, y)
classif.predict(X)

array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])

- In this case, the classifier is fit upon instances each assigned multiple labels. The MultiLabelBinarizer is used to binarize the 2d array of multilabels to fit upon. As a result, predict() returns a 2d array with multiple predicted labels for each instance.