In [1]:
import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Chapter 5: Support vector machines

In [4]:
# A smaller C value leads to a wider street but more margin violations.
# Here C controls the amount of regularization.

# The following Scikit-Learn code loads the iris dataset, scales the features, and then
# trains a linear SVM model (using the LinearSVC class with C = 1 and the hinge loss
# function, described shortly) to detect Iris-Virginica flowers.

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Load iris and keep only the two petal measurements as features.
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]  # columns 2 and 3: petal length, petal width
# Binary target: 1.0 where the class index is 2 (iris-virginica), else 0.0.
y = (iris.target == 2).astype(np.float64)

# Standardize the features, then fit a linear SVM trained with the hinge loss.
svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("linear_svm", LinearSVC(loss="hinge", C=1)),
])
svm_clf.fit(X, y)


Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svm', LinearSVC(C=1, loss='hinge'))])

In [6]:
# Classify a single flower with petal length 5.5 and petal width 1.7
# (same feature order as X). A prediction of 1.0 means iris-virginica.
svm_clf.predict([[5.5, 1.7]])
# Unlike Logistic Regression classifiers, SVM classifiers do not output
# probabilities for each class.

array([1.])

In [7]:
# Alternatively, you could use the SVC class, using SVC(kernel="linear", C=1), but it
# is much slower, especially with large training sets, so it is not recommended. Another
# option is to use the SGDClassifier class, with SGDClassifier(loss="hinge",
# alpha=1/(m*C)). This applies regular Stochastic Gradient Descent (see Chapter 4) to
# train a linear SVM classifier. It does not converge as fast as the LinearSVC class, but it
# can be useful to handle huge datasets that do not fit in memory (out-of-core training),
# or to handle online classification tasks.

In [9]:
# # ---------------------------
# The LinearSVC class regularizes the bias term, so you should center
# the training set first by subtracting its mean. This is automatic if
# you scale the data using the StandardScaler. Moreover, make sure
# you set the loss hyperparameter to "hinge", as it is not the default
# value. Finally, for better performance you should set the dual
# hyperparameter to False, unless there are more features than
# training instances (we will discuss duality later in the chapter).

## Nonlinear SVM classification

In [10]:
# Although linear SVM classifiers are efficient and work surprisingly well in many
# cases, many datasets are not even close to being linearly separable. One approach to
# handling nonlinear datasets is to add more features, such as polynomial features
# more on page->159

In [11]:
# implementing the above approach
# Let’s test this on the moons
# dataset: this is a toy dataset for binary classification in which the data points are shaped
# as two interleaving half circles
# You can generate this dataset
# using the make_moons() function:


from sklearn.datasets import make_moons
from sklearn.preprocessing import PolynomialFeatures

# Generate the moons toy dataset (two interleaving half circles). It is kept
# under its own names so the iris X / y used by other cells is not overwritten,
# and seeded so that re-running the notebook yields the same points.
X_moons, y_moons = make_moons(n_samples=100, noise=0.15, random_state=42)

# Expand to degree-3 polynomial features, standardize them, then fit a
# linear SVM (hinge loss) on the expanded feature space.
polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge")),
])

polynomial_svm_clf.fit(X_moons, y_moons)


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('scaler', StandardScaler()),
                ('svm_clf', LinearSVC(C=10, loss='hinge'))])

In [12]:
# page->160 a img..

## Polynomial kernel

In [16]:
# Fortunately, when using SVMs you can apply an almost miraculous mathematical
# technique called the kernel trick (it is explained in a moment). It makes it possible to
# get the same result as if you added many polynomial features, even with very high-degree
# polynomials, without actually having to add them.
# This trick is implemented by the SVC class.

from sklearn.svm import SVC
# Standardize the features, then fit an SVC with a degree-3 polynomial kernel;
# the kernel stands in for explicitly adding polynomial features.
poly_kernel_svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(C=5, kernel="poly", degree=3, coef0=1)),
])
poly_kernel_svm_clf.fit(X, y)


Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=5, coef0=1, kernel='poly'))])

In [15]:
# page->161 

In [None]:
## grid search: see page 79

## adding similarity features

In [1]:
# do check out page->161, 162
# skipped

## gaussian RBF kernel

In [21]:
# Standardize the features, then fit an SVC with a Gaussian RBF kernel
# using gamma=5 and C=0.001.
rbf_kernel_svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(C=0.001, kernel="rbf", gamma=5)),
])
rbf_kernel_svm_clf.fit(X, y)
# Do check page->162, 163
# So γ(gamma) acts like a regularization
# hyperparameter: if your model is overfitting, you should reduce it, and if it is underfitting,
# you should increase it (similar to the C hyperparameter).

# Other kernels exist but are used much more rarely. For example, some kernels are
# specialized for specific data structures. String kernels are sometimes used when classifying
# text documents or DNA sequences (e.g., using the string subsequence kernel or
# kernels based on the Levenshtein distance).

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.001, gamma=5))])

In [22]:
# With so many kernels to choose from, how can you decide which
# one to use? As a rule of thumb, you should always try the linear
# kernel first (remember that LinearSVC is much faster than
# SVC(kernel="linear")), especially if the training set is very large or if it
# has plenty of features. If the training set is not too large, you should
# try the Gaussian RBF kernel as well; it works well in most cases.
# Then if you have spare time and computing power, you can also
# experiment with a few other kernels using cross-validation and grid
# search, especially if there are kernels specialized for your training
# set’s data structure.

In [23]:
# page->164

## SVM regression

In [25]:
# a basic implementation
# from sklearn.svm import LinearSVR
# svm_reg = LinearSVR(epsilon=1.5)
# svm_reg.fit(X, y)

# Note (unverified): LinearSVR should be faster than SVR(kernel="linear").

# For nonlinear regression (I believe), use the kernelized SVR class:
# from sklearn.svm import SVR
# svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
# svm_poly_reg.fit(X, y)

## under the hood

In [26]:
# Skipped for now because it is not needed yet;
# see pages 166 to 176.