In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from matplotlib import cm
from matplotlib.colors import ListedColormap
# Only use this if running the notebook on your local machine
#plt.style.use('notebook.mplstyle')
# Qualitative palette that supplies the two end-point colours
colors = plt.get_cmap("tab10")

# Custom colormap: tab10 colour 0 -> gray -> tab10 colour 1
gray = 0.75


def _channel_ramp(channel):
    """Return the 255-sample ramp of one RGB channel (0 = red, 1 = green, 2 = blue).

    The first 128 samples run from the channel value of tab10 colour 0 to
    `gray`; the remaining 127 run from `gray` to the channel value of tab10
    colour 1.
    """
    towards_gray = np.linspace(colors(0)[channel], gray, 128)
    away_from_gray = np.linspace(gray, colors(1)[channel], 127)
    return np.hstack([towards_gray, away_from_gray])


red, green, blue = (_channel_ramp(channel) for channel in range(3))
# One row per colormap entry, columns ordered R, G, B
rgb = np.column_stack([red, green, blue])
my_cmap = ListedColormap(rgb)

### Kernel trick
The algorithm for finding the optimal hyperplane can be formulated in such a way that the data points ($\mathbf{x}_i$) only enter the equations as dot products of the form $\mathbf{x}_i^T\mathbf{x}_j = x_1^ix_1^j + x_2^ix_2^j$ (assuming two-dimensional data). Kernels correspond to computations that you can do directly on the data points ($\mathbf{x}_i$ and $\mathbf{x}_j$) that are mathematically equivalent to a dot product between data points with additional features. For example, the quadratic kernel $K(\mathbf{x}_i, \mathbf{x}_j) = (\mathbf{x}_i^T\mathbf{x}_j + 1)^2$ is equivalent to taking the dot product between the feature vectors $[1, \sqrt{2}x_1, \sqrt{2}x_2, \sqrt{2}x_1x_2, x_1^2, x_2^2]_i^T[1, \sqrt{2}x_1, \sqrt{2}x_2, \sqrt{2}x_1x_2, x_1^2, x_2^2]_j$. That is, this kernel directly computes the dot product between the data points as if we had concatenated our original data vectors with quadratic terms. Adding features in this way can be interpreted as projecting our data into a higher-dimensional space and solving the classification problem in that higher-dimensional space instead. Classification problems that are not linearly separable often become so when projected into a higher-dimensional space. This is illustrated in the example below with a quadratic kernel, where we have summed the three new features into a single third feature for visualization purposes, that is, $x_3 = x_1x_2 + x_1^2+x_2^2$.

In [None]:
n_per_class = 50
radius = 6

# Fixed seed so that a fresh "Restart & Run All" reproduces the same data set,
# and therefore the same figures and fitted models, on every run.
seed = 0
rng = np.random.default_rng(seed)

# Class 1: standard-normal cloud around the origin
x_class1 = rng.standard_normal((n_per_class, 2))
# Class 2: uniformly random angles on a circle of the given radius,
# perturbed by standard-normal noise
thetas = 2*np.pi*rng.random(n_per_class)
x_class2 = np.stack([radius*np.cos(thetas), radius*np.sin(thetas)]).T
x_class2 += rng.standard_normal((n_per_class, 2))

X = np.vstack([x_class1, x_class2])
# Labels are kept as a column vector (0 for class 1, 1 for class 2);
# it is flattened wherever a 1-D mask or target is needed.
y = np.vstack([np.zeros([n_per_class, 1]), np.ones([n_per_class, 1])])

# Visualize what we have
fig, ax = plt.subplots(1, 1)
ax.plot(X[y.flatten()==0, 0], X[y.flatten()==0, 1], 'o', alpha=0.75, label='Class 1')
ax.plot(X[y.flatten()==1, 0], X[y.flatten()==1, 1], 'o', alpha=0.75, label='Class 2')
ax.set(xlabel='$x_1$', ylabel='$x_2$')
ax.legend();

In [None]:
# Third feature: the sum of the quadratic terms x1*x2, x1^2 and x2^2
feat_1, feat_2 = X[:, 0], X[:, 1]
x_new = feat_1*feat_2 + feat_1**2 + feat_2**2
# Append it as an extra column next to the two original features
X_new = np.column_stack([X, x_new])

# Visualize what we have
fig = plt.figure(figsize=[8, 6])
ax = fig.add_subplot(1, 1, 1, projection='3d')
class_ids = y.flatten()
for class_id, class_name in enumerate(['Class 1', 'Class 2']):
    # Rows of X_new belonging to the current class
    members = X_new[class_ids == class_id]
    ax.plot(members[:, 0], members[:, 1], members[:, 2], 'o', alpha=0.5, label=class_name)
ax.set(xlabel='$x_1$', ylabel='$x_2$', zlabel='x_new')
ax.legend()
ax.view_init(azim=-60, elev=20.)


In [None]:
# Linear SVM fitted on the three-feature data
svc = SVC(kernel='linear', C=1e3)
svc.fit(X_new, y.flatten())

# Regular grid over the (x1, x2) plane
x_grid = np.linspace(-radius-3, radius+3, 101)
X1, X2 = np.meshgrid(x_grid, x_grid)
# Solve w1*x1 + w2*x2 + w3*x3 + b = 0 for x3 at every grid node
w1, w2, w3 = svc.coef_[0]
X3 = (-svc.intercept_ - X1*w1 - X2*w2) / w3

# Visualize what we have
fig = plt.figure(figsize=[8, 6])
ax = fig.add_subplot(1, 1, 1, projection='3d')
class_ids = y.flatten()
for class_id, class_name in enumerate(['Class 1', 'Class 2']):
    # Rows of X_new belonging to the current class
    members = X_new[class_ids == class_id]
    ax.plot(members[:, 0], members[:, 1], members[:, 2], 'o', alpha=0.5, label=class_name)
# Separating plane found by the linear SVM
ax.plot_surface(X1, X2, X3, color='gray', alpha=0.5)
ax.set(xlabel='$x_1$', ylabel='$x_2$', zlabel='x_new')
ax.legend()
ax.view_init(azim=-60, elev=20.)

### Conclusion
We can turn classification problems that are not linearly separable to begin with into linearly separable problems by projecting data into a higher dimensional space. The kernel trick is called a "trick" because it directly computes the inner product between our data points in that high-dimensional space without ever entering into it.

### Radial basis function kernel
The most commonly used kernel is the radial basis function (RBF) kernel. It has one hyperparameter called gamma, and the easiest way to grasp what it does on an intuitive level is to simply try changing its value on a simple example (like the one above).

In [None]:
# RBF-kernel SVM fitted on the original two features
svc = SVC(kernel='rbf', C=1e2, gamma=0.1)
svc.fit(X, y.flatten())

# Evaluate the model at every node of the X1/X2 mesh built in an earlier cell
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
z_grid = svc.decision_function(grid_points)  # not used further in this cell
y_hat_grid = svc.predict(grid_points)

# Visualize what we have
fig, ax = plt.subplots(1, 1)
y_hat_mesh = y_hat_grid.reshape(X1.shape)
ch = ax.contourf(X1, X2, y_hat_mesh, 2, alpha=0.5, cmap=my_cmap)
class_ids = y.flatten()
for class_id, class_name in enumerate(['Class 1', 'Class 2']):
    # Training points belonging to the current class
    members = X[class_ids == class_id]
    ax.plot(members[:, 0], members[:, 1], 'o', alpha=0.75, label=class_name)
ax.set(xlabel='$x_1$', ylabel='$x_2$')
ax.legend()
# Colorbar ticks sit at the predicted values 0 and 1 but are labelled 1 and 2
cbar = fig.colorbar(ch, label='Predicted class', ticks=[0, 1])
cbar.ax.set_yticklabels([1, 2]);