In [25]:
from cvxopt import matrix, solvers
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import time

# Source - https://stackoverflow.com/a
# Posted by sjm, modified by community. See post 'Timeline' for change history
# Retrieved 2026-01-03, License - CC BY-SA 4.0
solvers.options['show_progress'] = False  # turn off cvxopt's solver progress output (snippet attributed above)

# Class names; a name's position in this list is the numeric label used for it in this notebook.
CLASSES = ["banana", "carrot", "cucumber", "mandarin", "tomato"]

# Numerical tolerance used when comparing the solver's alphas against 0 and C.
EPS  = 1e-6
# Seed for NumPy's global random number generator.
SEED = 462
np.random.seed(SEED)

# Directory that holds the processed train/test CSV files, relative to this notebook.
data_path = os.path.join("..", "data_workflow_notebooks", "data", "tabular")

## Soft-Margin SVM

$$
\min_\alpha \ \frac{1}{2}\sum_{m=1}^N\sum_{n=1}^N\alpha_n\alpha_m y_n y_m x_n^T x_m - \sum_{n=1}^N\alpha_n
$$

$$
\text{s.t.}\ \sum_{n=1}^N y_n\alpha_n = 0
$$

$$
\ \ \ \ \ \ 0 \leq \alpha_n \leq C
$$

$$
n=1,\dots,N
$$

## QP Solver Representation

```python3
solvers.qp(Q, p, G, h, A, b)
```

Where the problem is:
$$
\min_x \ \frac{1}{2}x^TQx + p^Tx
$$

$$
\text{s.t.}\ Gx \leq h
$$

$$
\ \ \ \ \ \ Ax = b
$$

## What to do

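We need to rewrite the soft-margin SVM dual in the form the QP solver expects. Matching the two problems term by term gives $x = \alpha$, $Q_{nm} = y_n y_m x_n^T x_m$, $p = -\mathbf{1}$, $G = \begin{bmatrix} -I \\ I \end{bmatrix}$, $h = \begin{bmatrix} \mathbf{0} \\ C\mathbf{1} \end{bmatrix}$, $A = y^T$ and $b = 0$.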

In [26]:
class Dataset:
    """
    Loads the processed train/test CSV files as NumPy arrays.

    This class is taken from our previous submission. Each CSV is read with
    pandas; the last column is treated as the filename, the second-to-last as
    the class name, and every column before those as a numeric feature.

    Args:
        train_path: path of the training CSV.
        test_path:  path of the test CSV.
        classes:    class names in label order (the name at position i gets the
                    numeric label i). Defaults to the notebook-level CLASSES.
    """
    def __init__(self, train_path=None, test_path=None, classes=None):
        self.train_path = train_path
        self.test_path = test_path
        # Fall back to the notebook-level class list when no explicit order is given.
        self.classes = list(CLASSES) if classes is None else list(classes)

    def load_csv(self, path):
        """
        Read one CSV and split it into features, numeric labels and filenames.

        Returns:
            (X, Y, filenames): X is a float array of the feature columns, Y is a
            float array holding the index of each row's class name in
            ``self.classes``, and filenames is the last column as read by pandas.

        Raises:
            ValueError: if the class-name column contains a value that is not in
            ``self.classes`` (previously such rows were silently given label 4).
        """
        data = pd.read_csv(path).to_numpy()
        X, Y_str, filenames = data[:, :-2], data[:, -2], data[:, -1]  # separate data and target

        label_to_index = {name: index for index, name in enumerate(self.classes)}

        # Refuse to guess: an unexpected label would otherwise corrupt the targets silently.
        unknown = sorted({str(label) for label in Y_str if label not in label_to_index})
        if unknown:
            raise ValueError(
                f"Unknown class label(s) {unknown} in {path}; expected one of {self.classes}"
            )

        Y = np.array([label_to_index[label] for label in Y_str], dtype=float)

        return X.astype(float), Y, filenames

    def get_data(self):
        """Load both splits and return ``(train_tuple, test_tuple)``, each as (X, Y, filenames)."""
        X_train, Y_train, filenames_train = self.load_csv(self.train_path)
        X_test , Y_test , filenames_test  = self.load_csv(self.test_path)

        return (X_train, Y_train, filenames_train), (X_test, Y_test, filenames_test)

In [27]:
class SoftMarginSVM:
    """
    Binary linear soft-margin SVM trained by solving the dual problem with cvxopt's QP solver.

    Labels passed to ``train`` are expected to be +1 / -1. After training the
    instance holds the primal solution (``weights``, ``bias``), the dual
    solution (``alpha_star``) and the indices of margin support vectors
    (``sv_indices``) and of examples whose alpha reached C (``ol_indices``).
    """
    def __init__(self, C=1):
        self.weights    = None
        self.bias       = None
        self.alpha_star = None
        self.sv_indices = None
        self.ol_indices = None  # outlier indices
        self.cls        = None  # this is used in the prediction phase to decide which class is the prediction
        self.C          = C     # soft margin svm gets closer to hard when C gets greater

    def train(self, X, Y):
        """
        Fit the model on features ``X`` (N x d) and labels ``Y`` (N values in {+1, -1}).

        Builds Q, p, G, h, A, b for ``solvers.qp`` from the dual problem, solves
        it, and stores weights, bias, alphas and the support-vector / outlier indices.
        """
        # cvxopt's qp works on double-precision matrices, so force float inputs
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float).reshape(-1, 1)  # Nx1 so that Y @ Y.T is NxN, not a scalar
        n_examples = X.shape[0]

        # Q[n, m] = y_n * y_m * x_n^T x_m
        Q = np.dot(Y, Y.T) * np.dot(X, X.T)
        p = -np.ones((n_examples, 1))

        # G is (2N)xN: the top half encodes -alpha <= 0, the bottom half alpha <= C
        G = np.vstack([-np.eye(n_examples), np.eye(n_examples)])
        h = np.vstack([np.zeros((n_examples, 1)), np.full((n_examples, 1), float(self.C))])

        # equality constraint: sum_n y_n * alpha_n = 0
        A = Y.reshape(1, -1)
        b = 0.0  # cvxopt expects double-precision

        sol = solvers.qp(matrix(Q), matrix(p), matrix(G), matrix(h), matrix(A), matrix(b))

        # w* = sum_n alpha_n * y_n * x_n
        alpha_star      = np.array(sol["x"]).reshape(-1, 1)  # Nx1
        self.alpha_star = alpha_star
        w_star          = np.dot(X.T, Y * alpha_star)        # dx1 where d is n_features
        self.weights    = w_star

        epsilon    = EPS  # since optimizers do not work with total precision
        alpha_flat = alpha_star.ravel()
        sv_mask    = (alpha_flat > epsilon) & (alpha_flat < (self.C - epsilon))

        self.bias = self._estimate_bias(X, Y, alpha_star, w_star, epsilon)

        # margin support vectors: 0 < alpha < C
        self.sv_indices = np.where(sv_mask)[0]

        # outliers: alpha at the upper bound C
        self.ol_indices = np.where(alpha_flat >= self.C - epsilon)[0]

    def _estimate_bias(self, X, Y, alpha_star, w_star, epsilon):
        """
        Compute b* from the solved dual.

        When at least one alpha lies strictly inside (0, C), b* is the mean of
        y_n - w^T x_n over those examples. When there is none (every alpha sits
        on a bound) that mean would be NaN, so b* falls back to the midpoint of
        the interval allowed by the KKT conditions of the bounded examples.
        """
        alpha = np.asarray(alpha_star, dtype=float).ravel()
        y     = np.asarray(Y, dtype=float).ravel()
        # y_n - w^T x_n equals b exactly for an example sitting on the margin
        residual = y - np.dot(X, w_star).ravel()

        on_margin = (alpha > epsilon) & (alpha < (self.C - epsilon))
        if on_margin.any():
            return np.mean(residual[on_margin])

        at_zero = alpha <= epsilon
        at_c    = alpha >= self.C - epsilon
        # alpha = 0 requires y*f(x) >= 1 and alpha = C requires y*f(x) <= 1;
        # depending on the sign of y each turns into a lower or an upper bound on b
        lower = residual[(at_zero & (y > 0)) | (at_c & (y < 0))]
        upper = residual[(at_zero & (y < 0)) | (at_c & (y > 0))]
        if lower.size and upper.size:
            return (lower.max() + upper.min()) / 2.0
        if lower.size:
            return lower.max()
        if upper.size:
            return upper.min()
        return 0.0

    def decision_function(self, X):
        """Return the signed scores X @ weights + bias as an (N, 1) column."""
        return (np.dot(X, self.weights) + self.bias)

    def predict(self, X):
        """Return the sign of the decision function for every row of X."""
        return np.sign(self.decision_function(X))

    def hinge_loss(self, y_true, y_pred):
        """
        Mean hinge loss max(0, 1 - y_true * y_pred).

        Both inputs are flattened first: ``decision_function`` returns an (N, 1)
        column, and multiplying it by (N,) labels would otherwise broadcast to
        an N x N matrix and average over all label/score pairs.
        """
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        return np.mean(np.maximum(0, 1 - y_true * y_pred))

In [None]:
class SoftMarginSVM_OVA:
    """
    One-vs-all multi-class wrapper: trains one binary SoftMarginSVM per class
    and predicts the class whose model gives the highest decision score.
    """
    def __init__(self, C=1):
        self.C       = C
        self.models  = []
        self.classes = None
        self.history = None

    def train(self, X_train, Y_train):
        """
        Train one binary model per distinct value in ``Y_train``.

        For every class the labels are turned into +1 (this class) / -1 (any
        other class), a SoftMarginSVM is fitted, and its mean hinge loss on the
        training set is stored in ``self.history[cls]["train_loss"]``.
        """
        self.classes = np.unique(Y_train)
        self.history = {}
        # Start from scratch: without this, calling train() again would stack new
        # models on top of the old ones and misalign the columns used by predict().
        self.models  = []

        for cls in self.classes:
            print(f"--- training for class {cls} ---")
            Y_train_bin = np.where(Y_train == cls, 1, -1).astype(float)  # ready for binary classification

            model = SoftMarginSVM(self.C)
            model.cls = cls
            model.train(X_train, Y_train_bin)

            # decision_function returns an (N, 1) column; flatten scores and labels
            # so the loss is computed per example instead of over an N x N broadcast
            y_pred = np.ravel(model.decision_function(X_train))
            train_loss = model.hinge_loss(np.ravel(Y_train_bin), y_pred)
            self.models.append(model)
            self.history[cls] = {"train_loss": train_loss}

    def decision_functions_all(self, X):
        """Return an (N, n_models) array whose column i holds the scores of ``self.models[i]``."""
        if not self.models:
            raise ValueError("no trained models: call train() before asking for decision scores")
        return np.column_stack([model.decision_function(X) for model in self.models])

    def predict(self, X):
        """Return, for every row of X, the ``cls`` of the model with the highest score (as a list)."""
        decisions = self.decision_functions_all(X)
        return [self.models[idx].cls for idx in np.argmax(decisions, axis=1)]

    def find_farthest_points(self, X, filenames):
        """
        For every class, report the row of X with the highest score under that class's model.

        Returns a dict with key "C" plus one entry per class name holding the
        row index, its filename and its score.
        """
        farthest = {"C": self.C}
        scores = self.decision_functions_all(X)

        for i, cls_label in enumerate(self.classes):
            class_scores = scores[:, i]  # get the scores of the ith class
            best_idx = np.argmax(class_scores)

            farthest[CLASSES[int(cls_label)]] = {"best_index": best_idx, "filename": filenames[best_idx], "score": class_scores[best_idx]}

        return farthest

    def get_svs(self, filenames):
        """
        Map every class name to the filenames selected by its model's
        ``sv_indices`` ("sv_filenames") and ``ol_indices`` ("ol_filenames").

        ``filenames`` must support NumPy-style indexing with an index array.
        The returned dict also carries the key "C".
        """
        svs = {"C": self.C}

        for model in self.models:
            cls = CLASSES[int(model.cls)]
            sv_indices = model.sv_indices
            ol_indices = model.ol_indices

            svs[cls] = {"sv_filenames": filenames[sv_indices].tolist(), "ol_filenames": filenames[ol_indices].tolist()}

        return svs

In [29]:
# Load the processed train/test splits once; every later cell reuses these arrays.
dataset = Dataset(
    train_path=os.path.join(data_path, "train_processed.csv"),
    test_path=os.path.join(data_path, "test_processed.csv")
)

(X_train, Y_train, filenames_train), (X_test, Y_test, filenames_test) = dataset.get_data()

# Cheap sanity checks before starting a training run that takes minutes.
assert X_train.shape[1] == X_test.shape[1], "train and test must have the same number of features"
assert len(X_train) == len(Y_train) == len(filenames_train), "train features/labels/filenames are misaligned"
assert len(X_test) == len(Y_test) == len(filenames_test), "test features/labels/filenames are misaligned"

svms   = []
# Values of C to train with; the commented-out entries are further candidates that are currently disabled.
C_vals = [
    0.1,
    # 1,
    # 10
    ]

for c in C_vals:
    svm = SoftMarginSVM_OVA(C=c)

    print(f"============== start training with C = {c} ==============")
    # perf_counter is monotonic, so the measured duration cannot be distorted by system clock adjustments
    start_time = time.perf_counter()
    svm.train(X_train, Y_train)
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"\nTotal Training Time: {training_time:.2f} seconds")
    svms.append(svm)

--- training for class 0.0 ---
--- training for class 1.0 ---
--- training for class 2.0 ---
--- training for class 3.0 ---
--- training for class 4.0 ---

Total Training Time: 736.53 seconds


In [30]:
# Evaluate every trained one-vs-all model on the test split and store its predictions.
for s in svms:
    c = s.C
    print(f"============== prediction with C = {c} ==============")
    # predict() returns a plain list; turn it into an array for the element-wise comparison
    Y_pred = np.asarray(s.predict(X_test))

    accuracy = (Y_pred == Y_test).mean()
    print(f"Final Test Accuracy: {accuracy * 100:.2f}%")

    # dual variables of the first per-class model
    print(s.models[0].alpha_star)

    # save true vs. predicted labels, one CSV per value of C
    results_df = pd.DataFrame({'True_Label': Y_test, 'Predicted_Label': Y_pred})
    results_df.to_csv(f"svm_predictions_output{c}.csv", index=False)

Final Test Accuracy: 90.03%
[[2.74005063e-11]
 [1.04540725e-11]
 [1.84733180e-09]
 ...
 [1.38874226e-11]
 [9.80673407e-11]
 [2.03889301e-11]]


In [None]:
# Second sweep: train with a much smaller and a much larger C than the first run.
new_svms   = []
new_C_vals = [0.01, 100]

for c in new_C_vals:
    svm = SoftMarginSVM_OVA(C=c)

    print(f"============== start training with C = {c} ==============")
    # perf_counter is monotonic, so the measured duration cannot be distorted by system clock adjustments
    start_time = time.perf_counter()
    svm.train(X_train, Y_train)
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"\nTotal Training Time: {training_time:.2f} seconds")
    new_svms.append(svm)

In [None]:
# Evaluate the models from the second sweep on the test split and store their predictions.
for s in new_svms:
    c = s.C
    print(f"============== prediction with C = {c} ==============")
    # predict() returns a plain list; turn it into an array for the element-wise comparison
    Y_pred = np.asarray(s.predict(X_test))

    accuracy = (Y_pred == Y_test).mean()
    print(f"Final Test Accuracy: {accuracy * 100:.2f}%")

    # save true vs. predicted labels, one CSV per value of C
    results_df = pd.DataFrame({'True_Label': Y_test, 'Predicted_Label': Y_pred})
    results_df.to_csv(f"svm_predictions_output{c}.csv", index=False)

In [None]:
# Build both reports once per trained model. They do not depend on the class being
# printed, and find_farthest_points scores the whole training set on every call.
svm_reports = [
    (s.get_svs(filenames_train), s.find_farthest_points(X_train, filenames_train))
    for s in svms
]

for cls in CLASSES:
    print(f"=========== for the class {cls} ===========")
    for svs_report, fart_report in svm_reports:
        # single quotes inside the f-string: reusing double quotes is a SyntaxError before Python 3.12
        print(f"=========== C = {svs_report['C']} ===========")
        print("=========== support vectors ===========")
        print(svs_report[cls])
        print("=========== farthest points ===========")
        print(fart_report[cls])
        print()

# TODO: update get_svs / find_farthest_points so the report also says which class each
# file belongs to; a bare filename such as 1046.png does not tell whether it is a carrot or a banana.


{'sv_filenames': ['0894.png', '0565.png', '0854.png', '0339.png', '0744.png', '0292.png', '0692.png', '0760.png', '0087.png', '0388.png', '0708.png', '0507.png', '0271.png', '0023.png', '0003.png', '0589.png', '0384.png', '0221.png', '0487.png', '0404.png', '0214.png', '0089.png', '0076.png', '0791.png', '0899.png', '0649.png', '0829.png', '0117.png', '0351.png', '0746.png', '0577.png', '0279.png', '0890.png', '0932.png', '0579.png', '0026.png', '0964.png', '0088.png', '0321.png', '0413.png', '0571.png', '0343.png', '0744.png', '0432.png', '0516.png', '1102.png', '0789.png', '0328.png', '0532.png', '0778.png', '0550.png', '0209.png', '0300.png', '0127.png', '0107.png', '0079.png', '0397.png', '0194.png', '0766.png', '1124.png', '0021.png', '0518.png', '0220.png', '0255.png', '0123.png', '0156.png', '0150.png', '0714.png', '0535.png', '0787.png', '0132.png', '0400.png', '0952.png', '0719.png', '0039.png', '0051.png'], 'ol_filenames': ['0122.png', '0164.png', '0439.png', '0335.png', '085