In [None]:
# Support Vector Machines (SVM)

# SVM provides large-margin separation.
# SVM is traditionally trained by solving a convex optimization problem, not by training a neural net.
# The optimization technique is called QP, Quadratic Programming.

# SVM math does not scale to deep learning.
# SVM good for linear and multiclass classification.
# SVM also good for linear regression in high dimensions.
# SVM sensitive to outliers. 
# SVM can also be turned around to detect outliers instead of classifying. In SKLearn, use OneClassSVM.

# "Support vectors" are the data examples closest to the decision boundary.
# The support vectors define the margins.
# SVM chooses a decision boundary that maximizes margins between support vectors.

# Important to scale all features prior to SVM.
# Otherwise SVM will focus on the feature with the largest scale.

# Hard-margin mode: allow no data in margin. Possibly no solution (e.g. classes not linearly separable).
# Soft-margin mode: minimize data in margin while keeping the margin wide.

# Hyperparameter C for regularization. Inverse of usual. 
# Large C => small margin, overfits to get every last point.
# Small C => large margin, generalizes.

# Hinge loss defines the SVM. 
# It penalizes data in the margin.

# Book says solving the "dual problem" instead of the "primal problem"
# enables use of the kernel trick and is faster.
# Comments online say use dual only when #features and #samples are balanced.
# (SKLearn's LinearSVC docs: prefer dual=False when #samples > #features.)

In [14]:
# Linear SVM

# Three ways to run linear support vector machine:
# 1) sklearn.svm.SVC(kernel='linear',C=1)  # for small datasets. Uses OneVsOne for multiclass.
# 2) sklearn.svm.LinearSVC                 # reimplemented for more flexibility, larger data sets.
# 3) sklearn.linear_model.SGDClassifier(loss='hinge',alpha=1/(m*C))  # gradient descent, best for huge data sets.

import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC  
# Note SKLearn.svm has SVC for classification and SVR for regression.

# Flower data again: a binary task on the iris set using the two petal features.

iris = datasets.load_iris()
X = iris['data'][:, (2, 3)]  # columns 2 and 3: petal length, petal width
y = (iris['target'] == 2).astype(np.float64)  # binary label: 1.0 if species==2 (out of 3), else 0.0

# Keep a handle on the estimator so its learned weights can be inspected after fitting.
svm = LinearSVC(C=1, loss='hinge')
pipe1 = Pipeline([
    ("scaler", StandardScaler()),  # scale first so no single feature dominates the margin
    ("svm", svm),
])

pipe1.fit(X, y)
example_0 = [5.5, 1.7]  # one sample: petal length 5.5, petal width 1.7
pipe1.predict([example_0])

array([1.])

In [15]:
# Inspect the fitted linear model: one intercept plus one weight per input
# feature (two features here, so two weights).
(svm.intercept_, svm.coef_)

(array([-2.50518051]), array([[1.63330754, 2.38788385]]))

In [18]:
# Nonlinear SVM

# Add polynomial features (like x^2 instead of just x).
# Then apply linear SVM.

from sklearn.preprocessing import PolynomialFeatures

# Pipeline fits its steps in place (it does not clone them), so handing pipe1's
# already-fitted estimator to a second pipeline would refit that same object on
# the 10 polynomial features and leave pipe1 unable to predict on its 2 raw
# features. Bind `svm` to a fresh estimator instead; pipe1 keeps its own model.
svm = LinearSVC(C=1, loss='hinge')
pipe2 = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),  # features include x^2 and x^3
    ('scaler', StandardScaler()),
    ('svm', svm),
])
pipe2.fit(X, y)
pipe2.predict([example_0])

array([1.])

In [19]:
# Inspect the model fitted on polynomial features: the SVM now learns one
# weight per polynomial term (10 terms for degree 3 on two inputs, including
# the constant bias column) instead of one weight per raw feature.
(svm.intercept_, svm.coef_)

(array([-1.27607675]),
 array([[ 0.        , -0.10241346,  0.01394803,  0.33257005,  0.36240083,
          0.60407585,  0.73263181,  0.67631331,  0.79776263,  0.99750406]]))