In [None]:
# In scikit-learn, an estimator for classification is a Python object that
# implements the methods fit(X, y) and predict(T).
from sklearn import datasets
from sklearn import svm

# Load the digits dataset in this cell so that `digits` is defined when the
# notebook is run top to bottom on a fresh kernel.
digits = datasets.load_digits()

clf = svm.SVC(gamma=0.001, C=100.)

# Choosing the parameters of the model: in this example gamma and C are set
# by hand. To find good values for these parameters, we can use tools such as
# grid search and cross-validation.

# Train on every sample except the last one, which is kept aside for the
# prediction below.
clf.fit(digits.data[:-1], digits.target[:-1])
# Recorded output: SVC(C=100.0, gamma=0.001)

# Predict the class of the single sample that was left out of training.
clf.predict(digits.data[-1:])
# Recorded output: array([8])
    
# Save a model in scikit-learn by using Python's built-in persistence model, pickle.
import pickle

from sklearn import svm
from sklearn import datasets

clf = svm.SVC()
X, y = datasets.load_iris(return_X_y=True)
clf.fit(X, y)
# Recorded output: SVC()

# Serialize the fitted estimator to a bytes object and restore it as clf2.
# NOTE: unpickling can execute arbitrary code, so only load pickles that come
# from a source you trust.
s = pickle.dumps(clf)
clf2 = pickle.loads(s)

# The restored estimator predicts without being refitted; compare the
# prediction for the first sample with its true label.
clf2.predict(X[0:1])
# Recorded output: array([0])
y[0]
# Recorded output: 0

In [None]:
# joblib provides a replacement for pickle (joblib.dump & joblib.load) that is
# more efficient on big data. Unlike pickle.dumps, the model is written to a
# file rather than returned as a byte string.
from joblib import dump, load

# Write the current estimator to disk, then rebind `clf` to the copy that is
# read back from that same file.
model_file = 'filename.joblib'
dump(clf, model_file)
clf = load(model_file)

# Refitting and updating parameters
# Hyper-parameters of an estimator can be updated after it has been constructed
# via the set_params() method. Calling fit() more than once will overwrite what
# was learned by any previous fit().

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

clf = SVC()

# First fit: switch the kernel to linear on the already-constructed estimator.
clf.set_params(kernel='linear').fit(X, y)
# Recorded output: SVC(kernel='linear')
clf.predict(X[:5])
# Recorded output: array([0, 0, 0, 0, 0])

# Second fit: switch back to rbf; this fit replaces what the first one learned.
clf.set_params(kernel='rbf').fit(X, y)
# Recorded output: SVC()
clf.predict(X[:5])
# Recorded output: array([0, 0, 0, 0, 0])

# Here, the default kernel rbf is first changed to linear via SVC.set_params()
# after the estimator has been constructed, and changed back to rbf to refit
# the estimator and to make a second prediction.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# sklearn.externals.joblib has been removed from scikit-learn; use the
# standalone joblib package instead.
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

# Load and split the data (20% held out for testing, fixed seed for reproducibility)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)


def _build_pipeline(classifier):
    """Return a pipeline: standard scaling -> 2-component PCA -> `classifier`."""
    return Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', classifier)])


# Construct some pipelines that differ only in their final classifier
pipe_lr = _build_pipeline(LogisticRegression(random_state=42))
pipe_svm = _build_pipeline(svm.SVC(random_state=42))
pipe_dt = _build_pipeline(tree.DecisionTreeClassifier(random_state=42))

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Maps each position in `pipelines` to a human-readable classifier name
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Compare accuracies on the held-out test split
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(X_test, y_test)))

# Identify the most accurate model on test data.
# Each pipeline is scored exactly once. The first pipeline with the highest
# score wins ties, and best_pipe is always a fitted pipeline (never a
# placeholder value), even if every score is 0.
# NOTE: selecting a model on the test split makes its reported test accuracy
# optimistic; use a validation split or cross-validation when an unbiased
# estimate is needed.
test_scores = [candidate.score(X_test, y_test) for candidate in pipelines]
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

# Save pipeline to file
joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
print('Saved %s pipeline to file' % pipe_dict[best_clf])