# In [1]:
#ASS21. Develop an SVM classifier from scratch using a Polynomial Kernel on the Breast Cancer
#Wisconsin Dataset to distinguish between benign and malignant tumors.
#Evaluate the classifier using a confusion matrix and ROC curve to analyze diagnostic accuracy.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --------------------------------------------
#  Load the dataset
# --------------------------------------------
data = pd.read_csv("cancer.csv")

# Drop ID/unnecessary columns: anything whose name contains "id"
# (case-insensitive) or "Unnamed".
data = data.drop(columns=[col for col in data.columns if 'id' in col.lower() or 'Unnamed' in col])

# Encode target: malignant -> +1, benign -> -1 (the SVM below expects +/-1).
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': -1})

# Print first 5 rows
print("First 5 rows of dataset:\n")
print(data.head(), "\n")

# Split features and labels
X = data.drop(columns=['diagnosis']).values
y = data['diagnosis'].values

# Choose the train/test rows first (70/30, fixed seed for reproducibility).
# The seed/shuffle order is kept so later draws from np.random are unaffected.
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
split = int(0.7 * len(X))
train_idx, test_idx = indices[:split], indices[split:]

# Normalize data (mean 0, std 1) using statistics of the TRAINING rows only,
# so that no information from the test rows leaks into preprocessing.
feature_mean = X[train_idx].mean(axis=0)
feature_std = X[train_idx].std(axis=0)
# A constant column has std 0; dividing by 1 instead leaves it at 0 after
# centering rather than producing NaN/inf.
feature_std[feature_std == 0] = 1.0
X = (X - feature_mean) / feature_std

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# --------------------------------------------
# Define Polynomial Kernel
# --------------------------------------------
def polynomial_kernel(x1, x2, degree=3, c=1):
    """Evaluate the polynomial kernel ``(x1 . x2^T + c) ** degree``.

    Args:
        x1: Array whose rows (or single vector) form the left operand.
        x2: Array whose rows (or single vector) form the right operand;
            it is transposed before the dot product.
        degree: Exponent applied to the shifted inner product.
        c: Constant added to the inner product before exponentiation.

    Returns:
        The result of ``np.dot(x1, x2.T)`` shifted by ``c`` and raised
        element-wise to ``degree``.
    """
    inner_product = np.dot(x1, x2.T)
    shifted = inner_product + c
    return shifted ** degree


# --------------------------------------------
# Train Polynomial SVM (Dual form simplified)
# --------------------------------------------
class PolynomialSVM:
    """Soft-margin SVM with a polynomial kernel, trained in the dual form.

    Training uses a simplified SMO-style loop: for every sample that violates
    its KKT condition (within ``tol``) a second sample is drawn at random and
    the pair of dual coefficients is optimised analytically.

    Labels are expected to be +1 / -1.

    Attributes set by ``fit``:
        alpha: dual coefficient per training sample.
        b: bias term of the decision function.
        support_: indices of training samples with ``alpha > 1e-5``.
        X, y: the training data, kept for kernel evaluation in ``project``.
    """

    def __init__(self, C=1.0, degree=3, c=1.0, tol=1e-3, max_iter=1000):
        """Store the hyper-parameters.

        Args:
            C: Upper bound on each dual coefficient (box constraint).
            degree: Degree of the polynomial kernel.
            c: Constant term of the polynomial kernel.
            tol: Tolerance used both for the KKT-violation check and for the
                stopping test on the change of ``alpha`` between passes.
            max_iter: Maximum number of passes over the training set.
        """
        self.C = C
        self.degree = degree
        self.c = c
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the dual coefficients and bias on ``(X, y)``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples labels in {-1, +1}.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D or ``y`` has a different length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, _ = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {len(y)} labels"
            )

        self.alpha = np.zeros(n_samples)
        self.b = 0

        # Precompute the kernel matrix
        K = polynomial_kernel(X, X, self.degree, self.c)

        # A pair update needs two distinct samples. With fewer than two the
        # random partner search below could never terminate, so skip training
        # and leave alpha = 0, b = 0.
        n_passes = self.max_iter if n_samples >= 2 else 0

        for _ in range(n_passes):
            alpha_prev = np.copy(self.alpha)
            for i in range(n_samples):
                f_i = np.sum(self.alpha * y * K[:, i]) + self.b
                E_i = f_i - y[i]

                # Only optimise samples that violate the KKT conditions and
                # whose coefficient can still move in the helpful direction.
                if (y[i]*E_i < -self.tol and self.alpha[i] < self.C) or (y[i]*E_i > self.tol and self.alpha[i] > 0):
                    # Pick a random partner j != i.
                    j = np.random.randint(0, n_samples)
                    while j == i:
                        j = np.random.randint(0, n_samples)

                    f_j = np.sum(self.alpha * y * K[:, j]) + self.b
                    E_j = f_j - y[j]

                    alpha_i_old = self.alpha[i]
                    alpha_j_old = self.alpha[j]

                    # Bounds on alpha[j] that keep both coefficients in [0, C]
                    # while preserving sum(alpha * y).
                    if y[i] != y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])
                    if L == H:
                        continue

                    # Second derivative of the objective along the pair
                    # direction; must be negative for a valid maximum.
                    eta = 2 * K[i, j] - K[i, i] - K[j, j]
                    if eta >= 0:
                        continue

                    self.alpha[j] -= y[j] * (E_i - E_j) / eta
                    self.alpha[j] = np.clip(self.alpha[j], L, H)

                    # Move alpha[i] by the opposite amount (sign-adjusted).
                    self.alpha[i] += y[i]*y[j]*(alpha_j_old - self.alpha[j])

                    b1 = self.b - E_i - y[i]*(self.alpha[i]-alpha_i_old)*K[i, i] - y[j]*(self.alpha[j]-alpha_j_old)*K[i, j]
                    b2 = self.b - E_j - y[i]*(self.alpha[i]-alpha_i_old)*K[i, j] - y[j]*(self.alpha[j]-alpha_j_old)*K[j, j]

                    # Prefer the bias derived from a coefficient strictly
                    # inside (0, C); otherwise average the two candidates.
                    if 0 < self.alpha[i] < self.C:
                        self.b = b1
                    elif 0 < self.alpha[j] < self.C:
                        self.b = b2
                    else:
                        self.b = (b1 + b2)/2

            # Stop once a full pass barely changes the coefficients.
            diff = np.linalg.norm(self.alpha - alpha_prev)
            if diff < self.tol:
                break

        # Store support vectors
        self.support_ = np.where(self.alpha > 1e-5)[0]
        self.X = X
        self.y = y
        return self

    def project(self, X):
        """Return the signed decision value for every row of ``X``.

        Only the support vectors found by ``fit`` contribute to the sum.
        """
        X = np.asarray(X)
        K = polynomial_kernel(X, self.X[self.support_], self.degree, self.c)
        return np.dot(K, self.alpha[self.support_] * self.y[self.support_]) + self.b

    def predict(self, X):
        """Return class labels (+1.0 or -1.0) for every row of ``X``.

        A decision value of exactly zero is assigned to the positive class so
        the result is always a valid label (``np.sign`` would yield 0).
        """
        return np.where(self.project(X) >= 0, 1.0, -1.0)


# --------------------------------------------
# Train and Evaluate
# --------------------------------------------
# Cubic kernel with unit offset and unit box constraint; cap training at
# 200 passes over the training set.
svm_poly = PolynomialSVM(
    max_iter=200,
    c=1.0,
    degree=3,
    C=1.0,
)
svm_poly.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = svm_poly.predict(X_test)

# --------------------------------------------
# Manual Evaluation Metrics
# --------------------------------------------
def confusion_matrix_manual(y_true, y_pred):
    """Build a 2x2 confusion matrix for labels encoded as +1 / -1.

    Pairs are formed with ``zip``, so extra items in the longer input are
    ignored. Pairs in which either value is not +1 or -1 are not counted.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.

    Returns:
        ``np.array([[tp, fn], [fp, tn]])`` where +1 is the positive class.
    """
    pairs = list(zip(y_true, y_pred))

    def tally(actual, predicted):
        # Number of pairs matching one (truth, prediction) combination.
        return sum(1 for t, p in pairs if t == actual and p == predicted)

    return np.array([
        [tally(1, 1), tally(1, -1)],
        [tally(-1, 1), tally(-1, -1)],
    ])

cm = confusion_matrix_manual(y_test, y_pred)
# Matrix layout is [[tp, fn], [fp, tn]].
tp, fn, fp, tn = cm[0,0], cm[0,1], cm[1,0], cm[1,1]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Each ratio is guarded: e.g. precision is undefined when nothing was
# predicted positive (tp + fp == 0). Report 0.0 in that case instead of
# NaN with a RuntimeWarning.
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f1 = _safe_ratio(2 * precision * recall, precision + recall)
accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

print("Confusion Matrix:")
print(cm)
print(f"\nAccuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")

# --------------------------------------------
# ROC Curve + AUC manually
# --------------------------------------------
scores = svm_poly.project(X_test)
sorted_indices = np.argsort(scores)

P = np.sum(y_test == 1)
N = np.sum(y_test == -1)

# Start the curve at (0, 0): the point for a threshold above every score,
# where nothing is predicted positive.
tpr_list = [0.0]
fpr_list = [0.0]

# Sweep thresholds from the highest score down to the lowest so that FPR is
# non-decreasing along the curve. Integrating over a decreasing FPR would
# give a negative area.
for thresh in scores[sorted_indices][::-1]:
    y_pred_thresh = np.where(scores >= thresh, 1, -1)
    tp = np.sum((y_test == 1) & (y_pred_thresh == 1))
    fp = np.sum((y_test == -1) & (y_pred_thresh == 1))
    tpr = tp / P
    fpr = fp / N
    tpr_list.append(tpr)
    fpr_list.append(fpr)

# np.trapezoid exists from NumPy 2.0 on; older releases only have np.trapz.
# Look the new name up first so the deprecated one is touched only if needed.
_trapezoid = getattr(np, "trapezoid", None)
if _trapezoid is None:
    _trapezoid = np.trapz
auc_value = _trapezoid(tpr_list, fpr_list)

# Plot ROC
plt.figure(figsize=(6,6))
plt.plot(fpr_list, tpr_list, color='blue', label=f'Polynomial SVM (AUC = {auc_value:.4f})')
plt.plot([0,1],[0,1],'r--',label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Polynomial SVM (No sklearn)')
plt.legend()
plt.grid(True)
plt.show()

# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: 'cancer.csv'