# Using Support Vector Machine (SVM) with RBF Kernel for Classification on Breast Cancer and Wine Datasets

I will apply SVMs to two classification problems: the Breast Cancer and Wine datasets. First, I will solve them "by hand" (without using any library's SVM implementation). Then I will solve the same problems with scikit-learn and compare its results against those of my "by hand" solution.

# By Hand

### Defining functions to be used

In [2]:
import numpy as np
import csv
from numpy import genfromtxt
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing
from numpy import linalg
from sklearn.datasets import load_breast_cancer 

# Rbf Kernel
def rbf(xi, xj):
    """Return the Gaussian (RBF) kernel value between two sample vectors.

    Evaluates ``exp(-||xi - xj||^2 / (2 * sigma^2))``. The bandwidth is not a
    parameter: it is read from the module-level name ``sigma``, which must be
    assigned before the first call.
    """
    # Coefficient in front of the squared distance: -1 / (2 * sigma^2).
    scale = -1 / (2 * np.power(sigma, 2))
    squared_distance = linalg.norm(xi - xj) ** 2
    return np.exp(scale * squared_distance)

# Linear kernel
def K(xi, xj):
    """Return the linear kernel of the two inputs, i.e. ``np.dot(xi, xj)``."""
    inner_product = np.dot(xi, xj)
    return inner_product

# Kernel SVM
def kernel_svm(X, y):
    """Solve the dual SVM quadratic program using the RBF kernel.

    Hands cvxopt a QP of the form

        minimise    (1/2) a^T P a + q^T a
        subject to  G a <= h   and   A a = b

    with ``P[i][j] = y[i] * y[j] * rbf(X[i], X[j])``, ``q = -1``, ``G = -I``,
    ``h = 0``, ``A = y^T`` and ``b = 0``. In other words every multiplier is
    constrained to be non-negative and the label-weighted multipliers must
    sum to zero; no upper bound is placed on the multipliers.

    Parameters
    ----------
    X : array with one training sample per row (``X.shape[0]`` samples).
    y : array of labels, one per sample; it must support ``.T`` and be
        reshapeable to ``(1, N)``.

    Returns
    -------
    The ``"x"`` entry of the cvxopt solution (one multiplier per sample).
    """
    from cvxopt import matrix, solvers
    solvers.options['show_progress'] = False  # keep the solver quiet

    n_samples = X.shape[0]

    # Quadratic term: label-weighted kernel (Gram) matrix.
    gram = np.zeros(shape=(n_samples, n_samples))
    for row in range(n_samples):
        for col in range(n_samples):
            gram[row][col] = y[row] * y[col] * rbf(X[row], X[col])
    quadratic_term = matrix(gram)

    # Linear term: a vector of -1.
    linear_term = matrix(-1 * np.ones(n_samples))

    # Inequality constraint -a <= 0, i.e. a >= 0.
    ineq_lhs = matrix(-1 * np.identity(n_samples))
    ineq_rhs = matrix(np.zeros(n_samples))

    # Equality constraint y^T a = 0; cvxopt needs a float row vector.
    label_row = np.reshape(y.T, (1, n_samples)).astype('float')
    eq_lhs = matrix(label_row)
    eq_rhs = matrix(np.zeros(1))

    solution = solvers.qp(
        quadratic_term, linear_term, ineq_lhs, ineq_rhs, eq_lhs, eq_rhs
    )
    return solution["x"]

# Dual function
def f_dual(x):
    """Classify one sample with the kernel decision function.

    Reads the module-level names ``alphas``, ``y_train``, ``X_train`` and
    ``w0`` (they are not parameters), so those must be assigned before the
    call. Only training samples whose multiplier exceeds 1/1000 contribute
    to the score.

    Returns -1 when the score is negative and 1 otherwise.
    """
    threshold = 1/1000
    decision = sum(
        alphas[k] * y_train[k] * rbf(X_train[k], x)
        for k in range(len(alphas))
        if alphas[k] > threshold
    )
    decision += w0

    return -1 if decision < 0 else 1

# Classification function
def compute_classification_boundary(X, y, alpha):
    """Derive a weight pair and the bias term from solved dual multipliers.

    Parameters
    ----------
    X : array with one training sample per row.
    y : labels, indexable in step with the rows of ``X``.
    alpha : dual multipliers, indexable in step with the rows of ``X``.

    Returns
    -------
    (w, w0)
        ``w`` is a length-2 array holding ``sum_i alpha[i] * y[i] * X[i][c]``
        for the first two feature columns only (c = 0 and c = 1); any
        further columns are ignored.
        ``w0`` is ``y[j] - sum_i alpha[i] * y[i] * rbf(X[i], X[j])`` where
        ``j`` is the first index whose multiplier exceeds 1/1000.

    Raises
    ------
    IndexError
        If no multiplier exceeds 1/1000 (there is no sample to anchor on).
    """
    n_samples = X.shape[0]

    # Weighted sums over the first two feature columns.
    w_first = 0
    w_second = 0
    for k in range(n_samples):
        signed_alpha = alpha[k] * y[k]
        w_first += signed_alpha * X[k][0]
        w_second += signed_alpha * X[k][1]
    w = np.array([w_first, w_second])

    # Indices of the samples treated as support vectors.
    sv_indices = [k for k in range(n_samples) if alpha[k] > 1/1000]

    # The bias is solved from the first support vector only.
    anchor = sv_indices[0]
    anchor_label = y[anchor]
    anchor_point = X[anchor]

    kernel_sum = 0.0
    for k in range(n_samples):
        kernel_sum += alpha[k] * y[k] * rbf(X[k], anchor_point)
    w0 = anchor_label - kernel_sum

    return w, w0

# Function to calculate accuracy   
def get_score(X_test, y_test):
    """Return the fraction of samples that ``f_dual`` classifies correctly.

    Each ``X_test[i]`` is passed to ``f_dual`` and the prediction is compared
    with ``y_test[i]``; the result is matches divided by ``len(X_test)``.

    Raises
    ------
    ZeroDivisionError
        If ``X_test`` is empty.
    """
    total = len(X_test)
    hits = sum(
        1 for k in range(total) if f_dual(X_test[k]) == y_test[k]
    )
    return hits / total

### Breast Cancer Dataset

In [3]:
# Loading the dataset
cancer = load_breast_cancer()
y = cancer.target
X = cancer.data
# Recode the labels to -1/+1: targets below 1 become -1, all others +1
y = np.where(y<1,-1,1)

# Splitting data (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

# Scaling Data: the scaler is fitted on the training split only and then
# applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Setting value of hyperparameter sigma.
# rbf() reads this module-level name, so it must be set before kernel_svm()
sigma = 2

# Fitting svm dual classifier
alphas = kernel_svm(X_train, y_train)
 
# Getting weights.
# f_dual() reads alphas, X_train, y_train and w0 as module-level names, so
# these assignments must keep exactly these names
w, w0 = compute_classification_boundary(X_train, y_train, alphas)

# Predicting on test data (get_score returns a fraction; printed as a percentage)
score = get_score(X_test, y_test)
print("Accuracy:", score * 100)

Accuracy: 95.8041958041958


### Wine Dataset

In [4]:
# Importing data
data = genfromtxt('datasets/wine.csv', delimiter=',') # reading in the data matrix

# data preprocessing (Only using 2 classes of wine)
# NOTE: n is taken before the row filter below, so it can be larger than the
# number of rows that remain; the 0:n slices still select every remaining row
# because a slice stop past the end is clamped.
n =  data.shape[0]
feat = data.shape[1]
# Keep only the rows whose first column (the class label) is below 3
data = data[data[:,0]<3]
# Column 0 is the label, the remaining columns are the features
X = data[0:n,1:feat]
y = data[0:n,0]
# Recode the labels to -1/+1: labels below 2 become -1, all others +1
y = np.where(y<2,-1,1)

# Splitting data (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)

# Scaling Data: the scaler is fitted on the training split only and then
# applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Setting value of sigma.
# rbf() reads this module-level name, so it must be set before kernel_svm()
sigma = 2  

# Fitting svm dual classifier
alphas = kernel_svm(X_train, y_train)

# Getting weights.
# f_dual() reads alphas, X_train, y_train and w0 as module-level names, so
# these assignments must keep exactly these names
w, w0 = compute_classification_boundary(X_train, y_train, alphas)

# Predicting on test data (get_score returns a fraction; printed as a percentage)
score = get_score(X_test, y_test)
print("Accuracy:", score *100)

Accuracy: 100.0


# Scikit Implementation

### Breast Cancer Dataset

In [5]:
from sklearn import svm
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer 

# Loading the dataset
cancer = load_breast_cancer()
y = cancer.target
X = cancer.data
# Recode the labels to -1/+1: targets below 1 become -1, all others +1
y = np.where(y<1,-1,1)

# Splitting data. The split is performed once; random_state is fixed, so a
# repeated call with the same arguments would only reproduce the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

# Scaling Data: the scaler is fitted on the training split only and then
# applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Using SVM with rbf kernel
# NOTE(review): gamma='auto' makes SVC use 1 / n_features as the kernel
# coefficient and C is left at its default, whereas the "by hand" run uses
# sigma = 2 (coefficient 1 / (2 * sigma**2) = 0.125) with no upper bound on
# the multipliers. The two runs are therefore not configured identically.
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)

# Predicting on test data (clf.score returns a fraction; printed as a percentage)
predictions = clf.predict(X_test)
score = clf.score(X_test, y_test)
print("Accuracy:", score*100)

Accuracy: 97.2027972027972


### Wine Dataset

In [7]:
# Importing data
from numpy import genfromtxt
data = genfromtxt('datasets/wine.csv', delimiter=',') # reading in the data matrix

# data preprocessing (Only using 2 classes of wine)
# NOTE: n is taken before the row filter below, so it can be larger than the
# number of rows that remain; the 0:n slices still select every remaining row
# because a slice stop past the end is clamped.
n =  data.shape[0]
feat = data.shape[1]
# Keep only the rows whose first column (the class label) is below 3
data = data[data[:,0]<3]
# Column 0 is the label, the remaining columns are the features
X = data[0:n,1:feat]
y = data[0:n,0]
# Recode the labels to 0/1: labels below 2 become 0, all others 1
# (the "by hand" wine cell uses -1/+1 instead)
y = np.where(y<2,0,1)

# Splitting data (random_state is fixed so the split is reproducible).
# np, train_test_split, preprocessing and SVC come from the imports of the
# previous scikit cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scaling Data: the scaler is fitted on the training split only and then
# applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Using SVM with rbf kernel
# NOTE(review): SVC() is used with all defaults here, while the breast
# cancer cell passes gamma='auto' - confirm the difference is intended.
clf = SVC()
clf.fit(X_train, y_train)

# Predicting on test data (clf.score returns a fraction; printed as a percentage)
predictions = clf.predict(X_test)
score = clf.score(X_test, y_test)
print("Accuracy:", score*100)

Accuracy: 100.0


### Results

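The results obtained were the same for the Wine dataset (100% accuracy). However, the scikit-learn implementation performed better than the "by hand" implementation on the Breast Cancer dataset (97.2% vs 95.8% accuracy). Note that the two implementations are not configured identically: the "by hand" solver uses sigma = 2 and places no upper bound on the dual multipliers, whereas `SVC` is run with its own `gamma` setting and its default regularization, so part of the gap may come from these settings rather than from the implementations themselves.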