In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from scipy.optimize import minimize

## 1. Data Loading & Preprocessing

In [2]:
# Fetch MNIST through Keras and keep only the two digit classes used below.
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Boolean selectors for the samples labelled 2 or 6.
mask_train = np.isin(train_labels, (2, 6))
mask_test = np.isin(test_labels, (2, 6))

# Flatten every selected image to a 784-vector, cast to float32 and divide by 255.
X_all = train_images[mask_train].reshape(-1, 784).astype('float32') / 255.0
X_test = test_images[mask_test].reshape(-1, 784).astype('float32') / 255.0

# Label encoding: digit 2 -> -1, the other kept digit (6) -> +1.
y_all = np.where(train_labels[mask_train] == 2, -1, 1)
y_test = np.where(test_labels[mask_test] == 2, -1, 1)

# Working subset: the first 2000 filtered training samples.
X, y = X_all[:2000], y_all[:2000]
N = len(y)

print(f'Train subset: {X.shape}, Full train: {X_all.shape}, Test: {X_test.shape}')

Train subset: (2000, 784), Full train: (11876, 784), Test: (1990, 784)


## 2. Shared Utilities

In [3]:
def predict_linear(X, w, b):
    """Label each row of ``X`` with the sign of the linear score ``X @ w + b``.

    Scores that are exactly zero are assigned to the +1 class, so the
    returned array never contains 0.
    """
    scores = X @ w + b
    labels = np.sign(scores)
    # np.sign maps 0 -> 0; fold those entries onto the positive class in place.
    np.putmask(labels, labels == 0, 1)
    return labels

def zero_one_loss(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` differ.

    Parameters
    ----------
    y_true, y_pred : array_like
        Label sequences compared element-wise. Lists and tuples are accepted
        and converted to arrays before the comparison.

    Returns
    -------
    numpy floating scalar
        Mean of the element-wise mismatch indicator (0.0 = all equal,
        1.0 = all different).
    """
    # Coerce first: for plain Python lists `a != b` is a single bool, which
    # would collapse the loss to 0.0 or 1.0 instead of a per-element average.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true != y_pred)

def evaluate(X_tr, y_tr, X_te, y_te, w, b, label=''):
    """Print and return the 0-1 loss of the linear classifier ``(w, b)``.

    The loss is computed on the training pair ``(X_tr, y_tr)`` and on the
    test pair ``(X_te, y_te)``; one summary line prefixed by ``label`` is
    printed and ``(train_loss, test_loss)`` is returned.
    """
    splits = ((X_tr, y_tr), (X_te, y_te))
    tl, vl = (
        zero_one_loss(targets, predict_linear(features, w, b))
        for features, targets in splits
    )
    print(f'{label} | Train Loss={tl:.4f} ({(1-tl)*100:.2f}%) | Test Loss={vl:.4f} ({(1-vl)*100:.2f}%)')
    return tl, vl

## 3. Hard-Margin SVM — Primal Formulation

$$\min_{\mathbf{w}, b} \frac{1}{2}\|\mathbf{w}\|^2 \quad \text{s.t.} \quad y_i(\mathbf{w}^\top \mathbf{x}_i + b) \geq 1, \; \forall i$$

In [4]:
n_features = X.shape[1]

# Decision variables are packed as params = [w (n_features entries), b].
# Row i of this matrix is y_i * [x_i, 1], so that
#   signed_design_primal @ params == y * (X @ w + b)
# and the matrix itself is the (constant) Jacobian of the margin constraints.
signed_design_primal = y[:, None] * np.hstack([X, np.ones((X.shape[0], 1))])


def objective_primal(params):
    """Hard-margin objective 0.5 * ||w||^2; the bias entry is not penalised."""
    w = params[:n_features]
    return 0.5 * (w @ w)


def gradient_primal(params):
    """Exact gradient of ``objective_primal``: ``[w, 0]``."""
    grad = params.copy()
    grad[n_features] = 0.0
    return grad


# Margin constraints y_i (w.x_i + b) - 1 >= 0 with their analytic Jacobian.
# Without 'jac', SciPy falls back to finite differences over all
# n_features + 1 variables on every SLSQP iteration.
constraints_primal = {
    'type': 'ineq',
    'fun': lambda params: signed_design_primal @ params - 1,
    'jac': lambda params: signed_design_primal,
}

result_primal = minimize(
    objective_primal,
    np.zeros(n_features + 1),
    method='SLSQP',
    jac=gradient_primal,
    constraints=constraints_primal
)

# Surface solver failures instead of silently reporting a non-optimal (w, b).
if not result_primal.success:
    print(f'WARNING: primal SLSQP did not converge (status {result_primal.status}): {result_primal.message}')

w_primal = result_primal.x[:n_features]
b_primal = result_primal.x[n_features]

print(f'||w*|| = {np.linalg.norm(w_primal):.6f}  |  b* = {b_primal:.6f}')
evaluate(X, y, X_test, y_test, w_primal, b_primal, 'Primal Hard-Margin SVM')

||w*|| = 4.263242  |  b* = -1.140033
Primal Hard-Margin SVM | Train Loss=0.0000 (100.00%) | Test Loss=0.0226 (97.74%)


(0.0, 0.022613065326633167)

In [5]:
# Functional margin y_i * (w.x_i + b) of every training sample under the primal solution.
margins = y * (X @ w_primal + b_primal)

# Count samples whose margin falls below 1 by more than the 1e-3 slack.
n_violations = np.count_nonzero(margins < 0.999)

print(f'Min margin : {margins.min():.6f}')
print(f'Constraints satisfied: {n_violations == 0}')

Min margin : 1.000000
Constraints satisfied: True


## 4. Hard-Margin SVM — Dual Formulation

$$\max_{\boldsymbol{\alpha}} \sum_i \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j \mathbf{x}_i^\top \mathbf{x}_j \quad \text{s.t.} \quad \alpha_i \geq 0, \; \sum_i \alpha_i y_i = 0$$

In [6]:
# Q[i, j] = y_i y_j <x_i, x_j>. The signed design matrix is built once and reused.
signed_X = X * y[:, None]
Gram_linear = signed_X @ signed_X.T

# Negated dual objective (SciPy minimises) and its exact gradient.
obj_dual    = lambda a: -np.sum(a) + 0.5 * a @ Gram_linear @ a
jac_dual    = lambda a: -np.ones_like(a) + Gram_linear @ a

# The Jacobian of the constraint alpha >= 0 is the constant identity matrix;
# allocate it once instead of on every Jacobian evaluation.
identity_N = np.eye(N)

# Shared by the linear and the kernel dual problems (both have N variables).
constraints_dual = (
    {'type': 'ineq', 'fun': lambda a: a,       'jac': lambda a: identity_N},
    {'type': 'eq',   'fun': lambda a: a @ y,   'jac': lambda a: y},
)

result_dual = minimize(
    obj_dual, np.zeros(N), method='SLSQP',
    jac=jac_dual, constraints=constraints_dual,
    options={'maxiter': 50, 'ftol': 1e-4}
)

# With maxiter=50 the solver may stop at the iteration limit; make that visible
# because every quantity printed below is derived from result_dual.x.
if not result_dual.success:
    print(f'WARNING: dual SLSQP did not converge (status {result_dual.status}): {result_dual.message}')

alpha_dual = result_dual.x

# w = sum_i alpha_i y_i x_i, written as a vector-matrix product so no
# (N, n_features) temporary is materialised.
w_dual = (alpha_dual * y) @ X

# Place the bias midway between the highest-scoring negative sample and the
# lowest-scoring positive sample.
scores_dual = X @ w_dual
max_neg = np.max(scores_dual[y == -1])
min_pos = np.min(scores_dual[y ==  1])
b_dual  = -(max_neg + min_pos) / 2.0

print(f'Support vectors : {np.sum(alpha_dual > 1e-5)}/{N}')
print(f'||w*|| = {np.linalg.norm(w_dual):.6f}  |  b* = {b_dual:.6f}')
print(f'Σ(αy) = {np.sum(alpha_dual * y):.2e}  (should be ≈ 0)')
evaluate(X, y, X_test, y_test, w_dual, b_dual, 'Dual Hard-Margin SVM')

Support vectors : 621/2000
||w*|| = 4.211969  |  b* = -1.189798
Σ(αy) = 2.78e-16  (should be ≈ 0)
Dual Hard-Margin SVM | Train Loss=0.0000 (100.00%) | Test Loss=0.0221 (97.79%)


(0.0, 0.022110552763819097)

## 5. Kernel SVM — Gaussian (RBF) Kernel

$$k(\mathbf{x}_i, \mathbf{x}_j) = \exp\!\left(-\frac{\|\mathbf{x}_i - \mathbf{x}_j\|^2}{2\sigma^2}\right)$$

In [7]:
def rbf_kernel_matrix(X1, X2, sigma=1.0):
    """Gaussian (RBF) kernel matrix between the rows of ``X1`` and ``X2``.

    Computes ``K[i, j] = exp(-||X1[i] - X2[j]||^2 / (2 * sigma**2))`` using the
    expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b``.

    Parameters
    ----------
    X1 : ndarray, shape (n1, d)
    X2 : ndarray, shape (n2, d)
    sigma : float, default 1.0
        Kernel bandwidth.

    Returns
    -------
    ndarray, shape (n1, n2)
        Kernel values; every entry is at most 1.
    """
    X1_sq = np.sum(X1 ** 2, axis=1, keepdims=True)
    X2_sq = np.sum(X2 ** 2, axis=1, keepdims=True)
    dist_sq = X1_sq + X2_sq.T - 2 * X1 @ X2.T
    # Round-off in the expansion can push near-zero squared distances slightly
    # below zero, which would yield kernel values above 1; clamp them.
    dist_sq = np.maximum(dist_sq, 0)
    return np.exp(-dist_sq / (2 * sigma ** 2))

# Kernel bandwidth, shared by the training Gram matrix and the later predictions.
# NOTE(review): the recorded run for sigma=1 reports every training point as a
# support vector and ~48% test accuracy; the bandwidth may be too small for
# 784-dimensional inputs — confirm before drawing conclusions from this model.
sigma = 1.0

# K_train[i, j] = k(x_i, x_j) over the training subset X.
K_train = rbf_kernel_matrix(X, X, sigma)

# Dual quadratic term: kernel values weighted element-wise by y_i * y_j.
Gram_rbf = np.outer(y, y) * K_train

print(f'Kernel matrix shape: {K_train.shape}')

Kernel matrix shape: (2000, 2000)


In [8]:
# Negated kernel dual objective (SciPy minimises) and its exact gradient.
obj_kernel  = lambda a: -np.sum(a) + 0.5 * a @ Gram_rbf @ a
jac_kernel  = lambda a: -np.ones_like(a) + Gram_rbf @ a

# constraints_dual (alpha >= 0, sum_i alpha_i y_i = 0) is reused from the
# linear dual problem; both problems have N variables.
result_kernel = minimize(
    obj_kernel, np.zeros(N), method='SLSQP',
    jac=jac_kernel, constraints=constraints_dual,
    options={'maxiter': 200, 'ftol': 1e-6}
)

# Surface solver failures instead of silently using a non-optimal alpha.
if not result_kernel.success:
    print(f'WARNING: kernel SLSQP did not converge (status {result_kernel.status}): {result_kernel.message}')

alpha_kernel = result_kernel.x
sv_mask      = alpha_kernel > 1e-5

if sv_mask.any():
    # For each support vector s: b_s = y_s - sum_i alpha_i y_i k(x_i, x_s).
    # All b_s are computed in one vector-matrix product, then averaged.
    b_vals = y[sv_mask] - (alpha_kernel * y) @ K_train[:, sv_mask]
    b_kernel = np.mean(b_vals)
else:
    # Averaging an empty set would give NaN and poison every later prediction.
    print('WARNING: no support vectors above the 1e-5 threshold; using b = 0')
    b_vals = np.array([])
    b_kernel = 0.0

print(f'Support vectors : {np.sum(sv_mask)}/{N}  ({np.sum(sv_mask)/N*100:.1f}%)')
print(f'b* = {b_kernel:.6f}')

Support vectors : 2000/2000  (100.0%)
b* = 0.011147


In [9]:
def predict_kernel(X_query, X_sv, y_sv, alpha_sv, b, sigma=1.0):
    """Predict -1/+1 labels for ``X_query`` from an RBF-kernel expansion.

    The decision value of a query row ``x`` is
    ``sum_j alpha_sv[j] * y_sv[j] * k(x, X_sv[j]) + b`` with ``k`` evaluated by
    ``rbf_kernel_matrix`` at bandwidth ``sigma``. Decision values that are
    exactly zero are assigned to the +1 class.
    """
    dual_coef = alpha_sv * y_sv
    decisions = rbf_kernel_matrix(X_query, X_sv, sigma) @ dual_coef + b
    labels = np.sign(decisions)
    # np.sign maps 0 -> 0; fold those entries onto the positive class in place.
    np.putmask(labels, labels == 0, 1)
    return labels

# Score the kernel classifier on the training subset and on the test set.
# The full training subset (X, y, alpha_kernel) is passed as the expansion set.
train_pred_k, test_pred_k = (
    predict_kernel(queries, X, y, alpha_kernel, b_kernel, sigma)
    for queries in (X, X_test)
)

tl_k, vl_k = zero_one_loss(y, train_pred_k), zero_one_loss(y_test, test_pred_k)
print(f'Kernel SVM (σ={sigma}) | Train Loss={tl_k:.4f} ({(1-tl_k)*100:.2f}%) | Test Loss={vl_k:.4f} ({(1-vl_k)*100:.2f}%)')

Kernel SVM (σ=1.0) | Train Loss=0.0000 (100.00%) | Test Loss=0.5186 (48.14%)


## 6. Soft-Margin SVM — Dual Formulation (C-SVM)

$$\max_{\boldsymbol{\alpha}} \sum_i \alpha_i - \frac{1}{2}\sum_{i,j}\alpha_i\alpha_j y_i y_j \mathbf{x}_i^\top\mathbf{x}_j \quad \text{s.t.} \quad 0 \leq \alpha_i \leq C, \; \sum_i \alpha_i y_i = 0$$

In [10]:
N_all    = len(y_all)

# Q[i, j] = y_i y_j <x_i, x_j> on the full filtered training set. The signed
# design matrix is built once and reused for both factors.
signed_X_all = X_all * y_all[:, None]
Gram_all = signed_X_all @ signed_X_all.T

# The objective, its gradient and the equality constraint do not depend on C,
# so they are defined once instead of being rebuilt on every loop iteration.
obj  = lambda a: -np.sum(a) + 0.5 * a @ Gram_all @ a
jac  = lambda a: -np.ones_like(a) + Gram_all @ a
cons = (
    {'type': 'eq', 'fun': lambda a: a @ y_all, 'jac': lambda a: y_all},
)

soft_results = {}

for C in [1, 3, 5]:
    # The box constraint 0 <= alpha_i <= C is passed through `bounds`, which
    # SLSQP handles natively, rather than as 2 * N_all generic inequality
    # constraints that would each need a (finite-difference) Jacobian row.
    res   = minimize(obj, np.zeros(N_all), method='SLSQP', jac=jac,
                     bounds=[(0.0, float(C))] * N_all, constraints=cons,
                     options={'maxiter': 100, 'ftol': 1e-4})
    if not res.success:
        print(f'WARNING: soft-margin SLSQP (C={C}) did not converge (status {res.status}): {res.message}')
    alpha = res.x

    # w = sum_i alpha_i y_i x_i as a vector-matrix product (no (N, d) temporary).
    w_soft = (alpha * y_all) @ X_all

    # Margin support vectors have 0 < alpha_i < C and satisfy
    # y_i (w.x_i + b) = 1, i.e. b = y_i - w.x_i; average over all of them.
    margin_sv = np.where((alpha > 1e-5) & (alpha < C - 1e-5))[0]
    if len(margin_sv) > 0:
        b_soft = np.mean(y_all[margin_sv] - X_all[margin_sv] @ w_soft)
    else:
        # No strictly-interior alpha: fall back to the midpoint between the
        # highest-scoring negative and the lowest-scoring positive sample.
        scores_soft = X_all @ w_soft
        b_soft = -(np.max(scores_soft[y_all == -1]) + np.min(scores_soft[y_all == 1])) / 2

    tl, vl = evaluate(X_all, y_all, X_test, y_test, w_soft, b_soft, f'Soft-Margin SVM C={C}')
    soft_results[C] = {'train_loss': tl, 'test_loss': vl,
                       'n_sv': np.sum(alpha > 1e-5), 'n_bound_sv': np.sum(alpha >= C - 1e-5)}

## 7. Visualization & Summary

In [None]:
# Two-panel summary of the soft-margin runs stored in `soft_results`:
# left = train/test 0-1 loss per C, right = support-vector count per C.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

C_vals   = list(soft_results.keys())
tr_errs  = [soft_results[c]['train_loss'] for c in C_vals]
te_errs  = [soft_results[c]['test_loss']  for c in C_vals]
n_svs    = [soft_results[c]['n_sv']       for c in C_vals]

axes[0].plot(C_vals, tr_errs, 'bo-', linewidth=2, markersize=8, label='Train Loss')
axes[0].plot(C_vals, te_errs, 'ro-', linewidth=2, markersize=8, label='Test Loss')
axes[0].set_xlabel('C (regularization)', fontsize=12)
axes[0].set_ylabel('0-1 Loss', fontsize=12)
axes[0].set_title('Soft-Margin SVM: Error vs C', fontsize=13)
axes[0].legend(); axes[0].grid(True, alpha=0.3); axes[0].set_xticks(C_vals)

# C values are converted to strings so the bars are evenly spaced categories.
axes[1].bar([str(c) for c in C_vals], n_svs, color='steelblue', alpha=0.8)
axes[1].set_xlabel('C', fontsize=12)
axes[1].set_ylabel('Number of Support Vectors', fontsize=12)
axes[1].set_title('Support Vector Count vs C', fontsize=13)
axes[1].grid(True, alpha=0.3, axis='y')

plt.suptitle('Soft-Margin SVM Analysis (MNIST Digits 2 vs 6)', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories; make sure the target exists so
# a fresh checkout / fresh kernel does not fail with FileNotFoundError.
os.makedirs('results', exist_ok=True)
plt.savefig('results/svm_soft_margin_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Final comparison table: one row per fitted model with its train / test 0-1 loss.
hard_primal_test = zero_one_loss(y_test, predict_linear(X_test, w_primal, b_primal))
hard_dual_test   = zero_one_loss(y_test, predict_linear(X_test, w_dual,   b_dual))

row_fmt = '{:<30} {:>12.4f} {:>10.4f}'

print('{:<30} {:>12} {:>10}'.format('Model','Train Loss','Test Loss'))
print('-' * 55)

# Hard-margin rows: the training loss is recomputed on the subset (X, y).
hard_margin_models = (
    ('Hard-Margin SVM (Primal)', w_primal, b_primal, hard_primal_test),
    ('Hard-Margin SVM (Dual)',   w_dual,   b_dual,   hard_dual_test),
)
for model_name, w_fit, b_fit, test_loss in hard_margin_models:
    train_loss = zero_one_loss(y, predict_linear(X, w_fit, b_fit))
    print(row_fmt.format(model_name, train_loss, test_loss))

# Kernel row reuses the losses computed when the kernel model was evaluated.
print(row_fmt.format('Kernel SVM (RBF, sigma=1)', tl_k, vl_k))

# Soft-margin rows come from the per-C results dictionary.
for C in [1, 3, 5]:
    r = soft_results[C]
    print(row_fmt.format('Soft-Margin SVM C={}'.format(C), r['train_loss'], r['test_loss']))