In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "svm"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

In [3]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
iris["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
setosa_or_versicolor = (y == 0) | (y == 1)

In [6]:
setosa_or_versicolor[99]

True

In [7]:
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
linear_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C = 1, loss="hinge"))
])
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel = "poly", degree=3, coef0=1, C=5))
])
sgd_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd_clf", SGDClassifier(max_iter = 1000, tol= 1e-3, penalty= None, eta0=0.1))
])


In [9]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm_clf, X, y, scoring="neg_mean_squared_error", cv = 3)
rmse_scores = np.sqrt(-scores)
print(scores)

[-0. -0. -0.]


In [10]:
linear_svm_clf.fit(X,y)
svm_clf.fit(X,y)
sgd_clf.fit(X,y)

In [11]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

X = mnist["data"]
y = mnist["target"].astype(np.uint8)

X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

In [16]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

In [23]:
X_val = X_scaled[:5000]
y_val = y_train[:5000]

In [27]:
svm_clf = LinearSVC(C = 100, loss = "hinge")
svm_clf.fit(X_val, y_val)
svm_clf.score(X_val, y_val)



0.9972

In [None]:
svm_clf = LinearSVC(C = 100, loss="hinge")
svm_clf.fit(X_scaled, y_train)

In [26]:
svm_clf.score(X_scaled, y_train)

0.8300166666666666

In [25]:
svm_clf.score(X_test_scaled, y_test)

0.8187