### SVM with generalization bounds

In [None]:
from sklearn import svm
import ScenarioGeneralizationBounds as SBn
import numpy as np
import matplotlib.pyplot as plt

# Define a data-generating mechanism (DGM) that generates a set of instances and their class labels
def DGM(N1,rateo):
    """Draw a labelled two-class, two-feature sample from the synthetic DGM.

    Class 0 is sampled from a bivariate Gaussian and its first coordinate is
    then replaced by ``x2 + x1**2``; class 1 is sampled from a second
    bivariate Gaussian and left untouched.

    Parameters
    ----------
    N1: number of class-0 samples
    rateo: class-1 / class-0 size ratio; ``round(N1*rateo)`` class-1 samples are drawn

    Returns
    -------
    X : ndarray of shape (n_samples, n_features), class-0 rows first
    y : ndarray of labels, 0 for the first N1 rows and 1 for the remaining ones
    x1, x2 : class-0 coordinates (x1 after the nonlinear transform)
    x3, x4 : class-1 coordinates
    """
    n_second = round(N1 * rateo)

    # Sampling order matters for reproducibility under a fixed seed:
    # class 0 is drawn first, class 1 second.
    cloud_a = np.random.multivariate_normal([1, 1], [[1, 0.9], [0.9, 10]], N1)
    cloud_b = np.random.multivariate_normal([-4, -2], [[7, 0.1], [0.1, 1]], n_second)
    x1, x2 = cloud_a.T
    x3, x4 = cloud_b.T

    # Nonlinear warp of the first feature of class 0.
    x1 = x2 + x1**2

    first_feature = np.concatenate((x1, x3))
    second_feature = np.concatenate((x2, x4))
    X = np.array([first_feature, second_feature]).T  # (n_samples, n_features)
    y = np.concatenate((np.zeros(N1), np.ones(n_second)))
    return X, y, x1, x2, x3, x4

In [None]:
# Draw the training sample from the DGM.
N1 = 500   # number of class-0 points
Rateo = 1  # class-1 / class-0 size ratio
X, y, x1, x2, x3, x4 = DGM(N1, Rateo)

# Train a cubic polynomial-kernel SVM on the sample.
# (linear alternative: svm.SVC(kernel='linear', gamma=1, C=1.0))
clf = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto')
clf.fit(X, y)

# Scatter both classes, then overlay the support vectors on top (zorder=10).
plt.grid(True)
plt.scatter(x1, x2, s=10, facecolors='b')
plt.scatter(x3, x4, s=10, facecolors='r')
sv_first, sv_second = clf.support_vectors_[:, 0], clf.support_vectors_[:, 1]
plt.scatter(sv_first, sv_second, s=30, facecolors='k', edgecolors='w',
            alpha=0.5, zorder=10)
plt.xlabel('X1')
plt.ylabel('X2')

### Scenario bound on the violation probability


In [None]:
# Quantities fed to the scenario bound: sample size and support-vector count.
N = y.size  # number of training samples
Nsv = clf.n_support_.sum()  # total number of support vectors over all classes
print("Number of support vectors:", Nsv)

beta = 10**-6  # presumably the confidence parameter of the bound -- confirm against SBn
# Lower/upper bounds on the probability of margin violation.
Lb, Ub = SBn.getepsilon_relaxedConstraints(Nsv, N, beta)
print("Upper bound on the Probability of miscalssification:", Ub)

### Verify bounds (MC sampling from the DGM to approximate the true probability of misclassification)

In [None]:
# Monte Carlo validation of the scenario bound: draw a large fresh sample from
# the DGM and compare the empirical misclassification rate of `clf` with Ub.
N1t=100000  # number of class-0 validation samples passed to the DGM
Xtst,ytst , x1t , x2t, x3t, x4t = DGM(N1t,Rateo)
ypred = clf.predict(Xtst)
Pmisclass=np.mean(ytst!=ypred) # MC estimate of the true probability of misclassification

# `not (a <= b)` instead of `a > b`: a NaN estimate or bound makes every
# comparison False, so the check must fail loudly rather than fall through.
if not Pmisclass<=Ub:
    raise AssertionError(
        "Scenario bound not verified: P_misclassification = {} , upper bound = {}".format(Pmisclass, Ub))
print("Bound verified P_miscalssification <= Scenario-based upper bound:" ,  Pmisclass, "<=", Ub)

In [None]:
def make_meshgrid(x, y, h=.02):
    """Build a rectangular grid of points that covers the data with a margin of 1.

    Parameters
    ----------
    x: data whose min/max define the extent of the grid along the x-axis
    y: data whose min/max define the extent of the grid along the y-axis
    h: grid step along both axes, optional

    Returns
    -------
    xx, yy : ndarray
        Coordinate matrices as produced by ``np.meshgrid``.
    """
    margin = 1
    x_ticks = np.arange(x.min() - margin, x.max() + margin, h)
    y_ticks = np.arange(y.min() - margin, y.max() + margin, h)
    xx, yy = np.meshgrid(x_ticks, y_ticks)
    return xx, yy
 
def plot_contours(ax, clf, xx, yy, **params):
    """Draw the filled decision regions of a classifier on the given axes.

    Parameters
    ----------
    ax: matplotlib axes object to draw on
    clf: a classifier exposing ``predict``
    xx: meshgrid ndarray of x-coordinates
    yy: meshgrid ndarray of y-coordinates
    params: keyword arguments forwarded to ``ax.contourf``, optional

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # One row per grid point: (x, y).
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    # Predicted label for every grid point, folded back to the grid shape.
    predicted = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predicted, **params)


### Plot decision boundaries for different SVM models

In [None]:
# Create one SVM per kernel and fit each of them on the same data (X, y).
# NOTE(review): open question from the original author -- if the data were
# scaled, plotting the support vectors might need adjusting; not verified here.

C = 1.0  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.SVC(kernel='rbf', gamma='auto', C=C),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C),
          svm.SVC(kernel='poly', degree=4, gamma='auto', C=C))
# The loop variable is deliberately not called `clf`, so the classifier fitted
# in the earlier cell is not overwritten when this cell runs.
MODELS = [model.fit(X, y) for model in models]  # fit() returns the fitted estimator
Nsv_models = [np.sum(model.n_support_) for model in MODELS]  # support vectors per model

# Titles for the plots, in the same order as `models`.
titles = ('SVC, linear kernel',
          'SVC, RBF kernel',
          'SVC, polynomial (deg 3) kernel',
          'SVC, polynomial (deg 4) kernel')

# Set up a 2x2 grid of axes, one per model.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# The evaluation grid is shared by all models.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for model, title, ax in zip(MODELS, titles, sub.flatten()):
    # Decision regions first, then the data, then the support vectors on top.
    plot_contours(ax, model, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.3)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=10,
               facecolors="lightsteelblue", zorder=10, edgecolors='none')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # The features are the two synthetic DGM coordinates, labelled as in the
    # first scatter plot (the iris "Sepal" labels were a copy-paste leftover).
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

### Compute scenario-bounds for SVM trained with different kernels, validate bounds via MC sampling

In [None]:

# For every fitted model: compute the scenario bound from its number of
# support vectors, then validate it on a fresh Monte Carlo sample from the DGM.
N1t=100000 # samples for validation (loop-invariant, so set once)
# The loop variable is deliberately not called `clf`, so the classifier fitted
# in the earlier cell is not overwritten when this cell runs.
for nsv,model in zip(Nsv_models, MODELS):
    print("Number of support vectors", nsv)
    Lb,Ub =SBn.getepsilon_relaxedConstraints(nsv,N,beta) #  get bounds on the probability of margin violation
    Xtst,ytst , x1t , x2t, x3t, x4t = DGM(N1t,Rateo)  # fresh validation sample per model
    ypred = model.predict(Xtst)
    Pmisclass=np.mean(ytst!=ypred) # MC estimate of the true probability of misclassification
    # `not (a <= b)` instead of `a > b`: a NaN estimate or bound makes every
    # comparison False, so the check must fail loudly rather than fall through.
    if not Pmisclass<=Ub:
        raise AssertionError(
            "Scenario bound not verified: P_misclassification = {} , upper bound = {}".format(Pmisclass, Ub))
    print("Bound verified P_miscalssification<=Scenario-based upper bound:" ,  Pmisclass, "<=", Ub)