# README

The only required dependencies are the following libraries: NumPy, pandas, Matplotlib, CVXOPT, and scikit-learn.

The notebook needs the following files to run: banana_quality.csv and banknoteAuthentication.csv.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cvxopt import matrix, solvers
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

In [None]:
# Soft Margin SVM class
class SVM_Soft_Margin:
  """Linear soft-margin support vector machine trained through its dual QP.

  An instance wraps one data set (a feature matrix plus labels in {-1, +1}).
  Either call train_SVM() to learn w and b from that data, or pass an already
  learned w and b to the constructor and call test_SVM() to score the data.

  Attributes filled in by the methods:
    w, b            -- hyperplane parameters; after training w has shape (1, num_features)
    size_of_margin  -- 2 / ||w||
    support_vectors -- rows of feature_matrix whose multiplier exceeds ALPHA_TOLERANCE
    predictions     -- sign(w . x + b) for every stored example
    accuracy        -- fraction of predictions that equal the stored labels
  """

  # Lagrange multipliers at or below this value are treated as zero
  ALPHA_TOLERANCE = 1e-4

  def __init__(self, feature_matrix, labels, C = None, w = None, b = None):
    """Store the data set and, optionally, the penalty C or a learned w and b.

    feature_matrix -- 2-D array-like, one row per example
    labels         -- array-like of -1 / +1 values, one per example
    C              -- soft-margin penalty; required (positive) only for train_SVM()
    w, b           -- previously learned hyperplane, needed only for test_SVM()
    """
    # np.asarray lets lists and DataFrames through; astype still makes a float copy
    self.feature_matrix = np.asarray(feature_matrix).astype(np.float64)
    self.labels = np.asarray(labels).astype(np.float64)
    self.num_examples = np.shape(self.feature_matrix)[0]
    self.num_features = np.shape(self.feature_matrix)[1]
    self.C = C
    self.w = w
    self.b = b
    self.accuracy = 0
    self.size_of_margin = 0
    self.support_vectors = None
    self.predictions = None

  def train_SVM(self):
    """Solve the dual problem, recover w and b, then score the training data.

    Raises TypeError if C was never set and ValueError if C is not positive
    or there are no examples to train on.
    """
    # Fail early with a readable message instead of deep inside the QP setup
    if self.C is None:
      raise TypeError("C must be a positive number to train the SVM, got None")
    if self.C <= 0:
      raise ValueError("C must be a positive number to train the SVM")
    if self.num_examples == 0:
      raise ValueError("Cannot train the SVM on an empty data set")

    X = self.feature_matrix
    n = self.num_examples
    # Force a column vector so 1-D labels do not silently build the wrong QP
    y = self.labels.reshape(-1, 1)

    # Necessary variables to compute the solution to the dual problem:
    #   minimise 1/2 a^T P a + q^T a   subject to   G a <= h,   A a = b
    # with P_ij = y_i y_j <x_i, x_j>, 0 <= a_i <= C and sum_i a_i y_i = 0
    P = matrix(np.outer(y, y) * np.dot(X, X.T))
    q = matrix(-np.ones((n, 1)))
    A = matrix(y.reshape(1, -1), tc = "d")
    b = matrix(np.zeros(1))
    G = matrix(np.vstack((-np.identity(n), np.identity(n))))
    h = matrix(np.hstack((np.zeros(n), np.full(n, float(self.C)))))

    # Solves dual problem
    solution = solvers.qp(P, q, G, h, A, b)
    alphas = np.array(solution['x']).flatten()

    # Every point with a non-zero multiplier is a support vector
    is_support = alphas > self.ALPHA_TOLERANCE
    self.support_vectors = X[is_support]

    # w = sum_i a_i y_i x_i over the support vectors, kept as a (1, num_features) row
    self.w = np.dot((alphas[is_support].reshape(-1, 1) * y[is_support]).T, X[is_support])

    # Only support vectors strictly below the bound C sit exactly on the margin
    # (y_i (w . x_i + b) = 1), so the bias is averaged over those; points at the
    # bound lie inside the margin and would skew it. Fall back to all support
    # vectors when no multiplier is strictly inside (0, C).
    on_margin = is_support & (alphas < self.C - self.ALPHA_TOLERANCE)
    if not on_margin.any():
      on_margin = is_support
    self.b = np.mean(y[on_margin].ravel() - np.dot(X[on_margin], self.w.T).ravel())

    # Find the margin
    self.size_of_margin = 2 / np.linalg.norm(self.w)

    # Call the predict function to predict the labels of the data points and find the accuracy
    self.predict()

  def test_SVM(self):
    """Score the stored data with the w and b this instance already holds."""
    self.predict()

  def find_accuracy(self):
    """Set self.accuracy to the fraction of predictions that match the labels."""
    comparison = self.predictions.flatten() == self.labels.flatten()
    same_elements = np.count_nonzero(comparison)
    # An empty data set has nothing to get right; avoid dividing by zero
    self.accuracy = same_elements / self.num_examples if self.num_examples else 0.0

  def predict(self):
    """Store sign(w . x + b) for every example and refresh the accuracy.

    A point lying exactly on the decision boundary gets the prediction 0.
    """
    scores = np.dot(self.w, self.feature_matrix.T) + self.b
    self.predictions = np.sign(scores)
    self.find_accuracy()

  def plot_2d(self, x_label, y_label, title):
    """Plot the data, the decision boundary, both margins and the support vectors.

    Only works for two-feature data; otherwise a message is printed and
    nothing is drawn.
    """
    if self.num_features != 2:
      print("Data does not have only two features.")
      return

    # Accept w as either a (1, 2) row or a flat length-2 vector
    w_x, w_y = np.ravel(self.w)

    # Each line satisfies w . x + b = level: 0 is the decision boundary,
    # +1 and -1 are the two margin boundaries
    boundaries = (
      (0, 'Decision Boundary', "-"),
      (1, 'Support Vector Boundary One', "--"),
      (-1, 'Support Vector Boundary Two', "--"),
    )

    # Plots all the data points
    plt.scatter(self.feature_matrix[:, 0], self.feature_matrix[:, 1])

    if w_y != 0:
      # Use the true data extent as endpoints so narrow feature ranges still get a line
      x_values = np.array([np.min(self.feature_matrix[:, 0]), np.max(self.feature_matrix[:, 0])])
      slope = -w_x / w_y
      for level, label, style in boundaries:
        plt.plot(x_values, slope * x_values + (level - self.b) / w_y, label = label, color = "Red", linestyle = style)
    elif w_x != 0:
      # The boundary is vertical, so it cannot be written as y = slope * x + intercept
      for level, label, style in boundaries:
        plt.axvline((level - self.b) / w_x, label = label, color = "Red", linestyle = style)
    else:
      print("Weight vector is zero; there is no decision boundary to draw.")

    # Plots the support vectors green (only known on an instance that was trained)
    if self.support_vectors is not None:
      plt.scatter(self.support_vectors[:, 0], self.support_vectors[:, 1], color = "green", label = "Support Vectors")

    # Add labels and legend
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

    # Show plot
    plt.grid(True)
    plt.show()


In [None]:
# Creates two-cluster dummy data to show an example (make_blobs labels the clusters 0 and 1)
data, labels = make_blobs(n_samples=200, centers=2, cluster_std=2, random_state = 32)

# Renames all the labels of the 0 class to -1, the encoding the SVM class expects
labels[labels == 0] = -1

# Plot Data, one scatter call per class so that the legend below has entries to show
for class_value, class_name in ((1, "Class +1"), (-1, "Class -1")):
  in_class = labels == class_value
  plt.scatter(data[in_class, 0], data[in_class, 1], label = class_name)

# Add labels and legend
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Plot of Dummy Data")
plt.legend()

# Show plot
plt.grid(True)
plt.show()

# Reshape the labels to a column vector
labels = labels.reshape(-1, 1)

# Set Apart Training Data (first 180 of the 200 examples)
training_feature_matrix = data[:180, :]
training_labels = labels[:180, :]

# Set Apart Test Data (remaining 20 examples)
testing_feature_matrix = data[180:, :]
testing_labels = labels[180:, :]


In [None]:
# Fit the soft-margin SVM (C = 1) on the training split; train_SVM() also
# scores the training points, so the accuracy is available right after it
training_SVM = SVM_Soft_Margin(
  training_feature_matrix,
  training_labels,
  C = 1,
)
training_SVM.train_SVM()

# Report how well the model fits the data it was trained on
print("Training Accuracy", training_SVM.accuracy, sep = "\n")

# Draw the training points together with the decision boundary and both margins
training_SVM.plot_2d(x_label = "X", y_label = "Y", title = "Support Vector Plot")

# Wrap the held-out points in a second instance that reuses the learned w and b
testing_SVM = SVM_Soft_Margin(
  testing_feature_matrix,
  testing_labels,
  w = training_SVM.w,
  b = training_SVM.b,
)
testing_SVM.test_SVM()

# Report the accuracy on the held-out points
print("Testing Accuracy", testing_SVM.accuracy, sep = "\n")

### Bananas Quality Dataset

In [None]:
# Bananas Dataset

# Load Data
df = pd.read_csv('banana_quality.csv')

# Save headers (the last one is the label column used below)
features = df.columns.tolist()

# Extract data from data frame
data = df.values

# Shuffle the rows with a fixed seed so the selected subset, and therefore the
# reported accuracies, are the same on every run (the split below is seeded too)
np.random.default_rng(42).shuffle(data)

# Select only the first 1000 examples
data = data[:1000, :]

# Create feature matrix (every column except the last)
feature_matrix = data[:, :-1]

# Create labels (last column)
labels = data[:, -1]

# Make the labels with the label good to 1 and the labels with the label bad -1
labels[labels == "Good"] = 1
labels[labels == "Bad"] = -1

# Reshape the labels to column vector
labels = labels.reshape(-1, 1)

# Separate data into training and testing, 80 20 split
training_feature_matrix, testing_feature_matrix, training_labels, testing_labels = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

In [None]:
# Training and Testing Bananas Dataset

# Fit the soft-margin SVM (C = 0.01) on the training split; train_SVM() also
# scores the training examples
training_SVM_Bananas = SVM_Soft_Margin(
  training_feature_matrix,
  training_labels,
  C = .01,
)
training_SVM_Bananas.train_SVM()

# Report how well the model fits the data it was trained on
print("Training Accuracy", training_SVM_Bananas.accuracy, sep = "\n")

# Wrap the held-out examples in a second instance that reuses the learned w and b
testing_SVM_Bananas = SVM_Soft_Margin(
  testing_feature_matrix,
  testing_labels,
  w = training_SVM_Bananas.w,
  b = training_SVM_Bananas.b,
)
testing_SVM_Bananas.test_SVM()

# Report the accuracy on the held-out examples
print("Testing Accuracy", testing_SVM_Bananas.accuracy, sep = "\n")



### Banknote Authentication Dataset

In [None]:
# Banknote Dataset

# Load Data
df = pd.read_csv('banknoteAuthentication.csv')

# Save headers (the last one is the label column used below)
features = df.columns.tolist()

# Extract data from data frame
data = df.values

# Shuffle the rows with a fixed seed so the selected subset, and therefore the
# reported accuracies, are the same on every run (the split below is seeded too)
np.random.default_rng(42).shuffle(data)

# Select only the first 1000 examples
data = data[:1000, :]

# Create feature matrix (every column except the last)
feature_matrix = data[:, :-1]

# Create labels (last column)
labels = data[:, -1]

# Examples labelled 1 already use the SVM encoding; only the 0 class is renamed to -1
labels[labels == 0] = -1

# Reshape the labels to column vector
labels = labels.reshape(-1, 1)

# Separate data into training and testing, 80 20 split
training_feature_matrix, testing_feature_matrix, training_labels, testing_labels = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

In [None]:
# Training and Testing Banknote Authentication Dataset

# Fit the soft-margin SVM (C = 1) on the training split; train_SVM() also
# scores the training examples
training_SVM_Banknote = SVM_Soft_Margin(
  training_feature_matrix,
  training_labels,
  C = 1,
)
training_SVM_Banknote.train_SVM()

# Report how well the model fits the data it was trained on
print("Training Accuracy", training_SVM_Banknote.accuracy, sep = "\n")

# Wrap the held-out examples in a second instance that reuses the learned w and b
testing_SVM_Banknote = SVM_Soft_Margin(
  testing_feature_matrix,
  testing_labels,
  w = training_SVM_Banknote.w,
  b = training_SVM_Banknote.b,
)
testing_SVM_Banknote.test_SVM()

# Report the accuracy on the held-out examples
print("Testing Accuracy", testing_SVM_Banknote.accuracy, sep = "\n")

