# Final Exam Second Semester 2566 - Support Vector Machine (Raisin Problem)

The objective of this exam problem is to develop a support vector machine (SVM) model that classifies raisins as either Kecimen (class 0) or Besni (class 1) using 7 features.

Credit: This dataset is adapted from CINAR I., KOKLU M. and TASDEMIR S., (2020). Classification of Raisin Grains Using Machine Vision and Artificial Intelligence Methods, Gazi Journal of Engineering Sciences, vol. 6, no. 3, pp. 200-209, December, 2020, DOI: https://doi.org/10.30855/gmbd.2020.03.03

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# library written for this exam
import utilsSVM as utils

# tells matplotlib to embed plots within the notebook
%matplotlib inline

import random

# Fix the random seeds so repeated runs are comparable.
# random.seed() only affects Python's built-in generator; NumPy keeps its own
# global RNG, so it is seeded as well.
# NOTE(review): presumably utilsSVM draws its random numbers from np.random
# during training -- TODO confirm against utilsSVM.
random.seed(10)
np.random.seed(10)

### We start the exam by first loading the dataset

In [2]:
# Load the training and test datasets.

# Training file: np.loadtxt with default settings splits columns on whitespace.
train_path = os.path.join('Data', 'SVM_RaisinData_Train.txt')
data = np.loadtxt(train_path)

# Columns 0-6 are the 7 features (matrix X); column 7 is the class label
# (vector y), stored as integers.
X = data[:, :7]
y = data[:, 7].astype(int)

m = y.size  # number of training examples

# Test file: same column layout as the training file.
test_path = os.path.join('Data', 'SVM_RaisinData_Test.txt')
data_test = np.loadtxt(test_path)

X_test = data_test[:, :7]
y_test = data_test[:, 7].astype(int)

In [3]:
# Sanity check: display the (rows, columns) shape of the training and test
# feature matrices.
X.shape, X_test.shape

((697, 7), (200, 7))

In [24]:
# Load a new, unlabelled dataset of 3 examples.
# It is used for the last question of this exam problem.

# np.loadtxt with default settings splits columns on whitespace.
new_data_path = os.path.join('Data', 'SVM_RaisinData_NewData.txt')
data_new = np.loadtxt(new_data_path)

# The file holds only the 7 feature columns (no label column);
# X_new is the feature matrix.
X_new = data_new[:, :7]
X_new

array([[ 0.39643973,  0.00509791,  1.17536203, -1.07481261,  0.39371985,
         0.34755079,  0.33975259],
       [-0.72189398, -0.73016961, -0.63729811, -0.14769936, -0.72688149,
         0.18563541, -0.74122882],
       [ 1.62150837,  1.58158504,  1.25191026,  0.80258732,  1.5795776 ,
        -1.12368024,  1.49643259]])

In [5]:
def gaussianKernel(x1, x2, sigma):
    """
    Computes the Gaussian (radial basis function, RBF) kernel between
    two data points.

    The similarity is ``exp(-||x1 - x2||^2 / (2 * sigma^2))``: it equals 1
    when the two points coincide and decays towards 0 as they move apart.

    Parameters
    ----------
    x1 : array_like
        The first datapoint, a vector with n entries. Row vectors, column
        vectors and plain sequences are all accepted.

    x2 : array_like
        The second datapoint, a vector with n entries.

    sigma : float
        The bandwidth parameter for the Gaussian kernel.

    Returns
    -------
    sim : float
        The computed RBF similarity between the two provided data points.
    """
    # Flatten both inputs to 1-D float vectors. Without this, subtracting a
    # (n, 1) column from a (n, ) vector broadcasts to an (n, n) matrix and the
    # squared distance below would silently be wrong.
    x1 = np.asarray(x1, dtype=float).ravel()
    x2 = np.asarray(x2, dtype=float).ravel()

    squared_distance = np.sum(np.square(x1 - x2))
    sim = np.exp(-squared_distance / (2.0 * sigma ** 2))
    return sim

In [None]:
# Candidate values for the regularization parameter C and the RBF bandwidth
# sigma. The same two grids are also defined inside dataset3Params.
C_array = np.array([0.1,0.3,1,3])
sigma_array = np.array([0.1,0.3,1,3])

In [6]:
def dataset3Params(X, y, Xval, yval, C_values=None, sigma_values=None):
    """
    Selects the (C, sigma) pair for an SVM with RBF kernel by grid search.

    For every combination of candidate C and sigma, a model is trained on
    (X, y) with `utils.svmTrain`, labels for `Xval` are predicted with
    `utils.svmPredict`, and the fraction of predictions that differ from
    `yval` is recorded. The combination with the lowest recorded error is
    returned.

    Parameters
    ----------
    X : array_like
        (m x n) matrix of training data where m is number of training
        examples, and n is the number of features.

    y : array_like
        (m, ) vector of labels for the training data.

    Xval : array_like
        (mv x n) matrix of validation data where mv is the number of
        validation examples and n is the number of features.

    yval : array_like
        (mv, ) vector of labels for the validation data.

    C_values : array_like, optional
        Candidate values for the regularization parameter C.
        Defaults to [0.1, 0.3, 1, 3].

    sigma_values : array_like, optional
        Candidate values for the RBF bandwidth sigma.
        Defaults to [0.1, 0.3, 1, 3].

    Returns
    -------
    C, sigma : float, float
        The best performing values for the regularization parameter C and
        RBF parameter sigma.

    Raises
    ------
    ValueError
        If either candidate grid is empty.

    Note
    ----
    When several combinations share the lowest error, the first one in grid
    order wins (C varies slowest, sigma fastest), because `np.argmin` returns
    the first occurrence of the minimum.
    """
    if C_values is None:
        C_values = [0.1, 0.3, 1, 3]
    if sigma_values is None:
        sigma_values = [0.1, 0.3, 1, 3]

    C_array = np.asarray(C_values, dtype=float).ravel()
    sigma_array = np.asarray(sigma_values, dtype=float).ravel()

    if C_array.size == 0 or sigma_array.size == 0:
        raise ValueError(
            'C_values and sigma_values must each contain at least one candidate.'
        )

    # err_array[i, j] holds the validation error for (C_array[i], sigma_array[j]).
    err_array = np.zeros((C_array.size, sigma_array.size))

    # Keep C in the outer loop and sigma in the inner loop: the order in which
    # models are trained is part of the function's observable behaviour if
    # training consumes random numbers.
    for i, C_candidate in enumerate(C_array):
        for j, sigma_candidate in enumerate(sigma_array):
            model = utils.svmTrain(X, y, C_candidate, gaussianKernel,
                                   args=(sigma_candidate,))
            predictions = utils.svmPredict(model, Xval)
            err_array[i, j] = np.mean(predictions != yval)

    # Convert the flat index of the smallest error back to (row, column).
    best_i, best_j = np.unravel_index(np.argmin(err_array), err_array.shape)
    C = C_array[best_i]
    sigma = sigma_array[best_j]
    return C, sigma

In [8]:
# Grid-search C and sigma, scoring each candidate on (X_test, y_test).
# NOTE(review): the test split is passed as the validation set here and is
# also used below to report test accuracy, so that accuracy is likely
# optimistic. Consider holding out part of the training data for selection
# -- TODO confirm whether the exam intends this.
C, sigma = dataset3Params(X, y, X_test, y_test)

# Train the SVM using the best parameters (C and sigma) we got from dataset3Params function
model = utils.svmTrain(X, y, C, gaussianKernel, args=(sigma,))
print(C, sigma)

0.1 3.0


In [22]:
# Hard-code C and sigma to the values printed by the grid search above.
# Presumably this lets later cells run without repeating the search.
# Note that `model` is not retrained in this cell.
C = 0.1
sigma = 3

In [23]:
# Predict with the model currently bound to `model` and report accuracy as the
# percentage of predictions that equal the true labels.
p = utils.svmPredict(model, X)
print('Training Accuracy: %.2f' % (np.mean(p == y) * 100))
p_test = utils.svmPredict(model, X_test)
print('Test Accuracy: %.2f' % (np.mean(p_test == y_test) * 100))

Training Accuracy: 86.23
Test Accuracy: 88.50


In [25]:
# Predict the class of each unlabelled example in X_new with the current model.
p_1 = utils.svmPredict(model,X_new)
print(p_1)

[1. 0. 1.]


In [27]:
# Retrain with a linear kernel and C = 3; this rebinds `model`.
# NOTE(review): sigma is still forwarded through `args`, although a linear
# kernel has no bandwidth parameter. Whether utils.svmTrain / utils.linearKernel
# ignore it is not visible here -- TODO confirm, and drop `args` if unused.
C = 3
model = utils.svmTrain(X, y, C, utils.linearKernel, args=(sigma,))

In [28]:
# Predict with the model currently bound to `model` and report accuracy as the
# percentage of predictions that equal the true labels.
p = utils.svmPredict(model, X)
print('Training Accuracy: %.2f' % (np.mean(p == y) * 100))
p_test = utils.svmPredict(model, X_test)
print('Test Accuracy: %.2f' % (np.mean(p_test == y_test) * 100))

Training Accuracy: 86.37
Test Accuracy: 90.00


In [29]:
# Predict the class of each unlabelled example in X_new with the current model.
p_1 = utils.svmPredict(model,X_new)
print(p_1)

[1. 0. 1.]


### End of Support Vector Machine Problem