# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

# --- Cell 1: Load Data ---
# Fetch the breast-cancer dataset bundled with scikit-learn and pull out the
# feature matrix (X) and the {0, 1} target vector (y).
data = load_breast_cancer()
X, y = data.data, data.target

# --- Cell 2: Preprocess Data ---

# The SVM update rule below expects labels in {-1, 1} rather than {0, 1}:
#   malignant (0) -> -1
#   benign    (1) ->  1
y_svm = np.where(y == 0, -1, y)

# Hold out 30% of the samples for testing; stratify so both splits keep the
# same class balance, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_svm,
    test_size=0.3,
    random_state=42,
    stratify=y_svm,
)

# 1. Standardize features. The scaler is fitted on the training split only and
#    then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Expand to all degree-2 polynomial terms (no constant column; the SVM
#    carries its own bias). Training a linear SVM on this explicit feature map
#    plays the role of a degree-2 polynomial kernel, solved in the primal.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# --- Cell 3: SVM "from scratch" Implementation (Sequential) ---
# A linear soft-margin SVM trained on the polynomial-expanded features with
# per-sample (stochastic) sub-gradient descent on
#     lambda * ||w||^2 + max(0, 1 - y_i * (w . x_i + b)).

# --- Hyperparameters ---
learning_rate = 0.0001  # Kept small so the per-sample updates stay stable
lambda_param = 0.01  # L2 regularization strength
n_iters = 1000  # Number of full passes over the training set

# --- Initialization ---
n_samples, n_features = X_train_poly.shape
w = np.zeros(n_features)  # Weight vector, one entry per expanded feature
b = 0  # Bias term

# --- Gradient Descent Training Loop ---
for _ in range(n_iters):
    # Samples are visited in their stored order on every pass (no shuffling).
    for sample, label in zip(X_train_poly, y_train):
        # Functional margin y_i * (w . x_i + b); the hinge loss is inactive
        # once this reaches 1.
        margin = label * (np.dot(sample, w) + b)

        # The regularization gradient applies to every sample.
        grad_w = 2 * lambda_param * w
        grad_b = 0

        # Written as "not >= 1" so a NaN margin is also treated as a violation.
        if not (margin >= 1):
            # Inside the margin or misclassified: add the hinge-loss gradient.
            grad_w = grad_w - label * sample
            grad_b = -label

        # Take one descent step on both parameters.
        w = w - learning_rate * grad_w
        b = b - learning_rate * grad_b

# --- Cell 4: Predict ---

# Raw linear output w . x + b for every test sample. It is kept in continuous
# form because the ROC curve is computed from these scores.
y_scores = X_test_poly.dot(w) + b

# Hard predictions: the sign of the decision score.
y_pred = np.sign(y_scores)

# --- Cell 5: Evaluate Performance ---

# Map labels back to {0, 1} for the metrics: -1 -> 0 (malignant), 1 -> 1 (benign).
y_test_mapped = (y_test + 1) // 2
# y_pred comes from np.sign, so it holds -1, 0 or 1. Floor division sends both
# -1 and 0 (a score exactly on the decision boundary) to class 0 and 1 to
# class 1, so the result is already in {0, 1} and needs no further clean-up.
y_pred_mapped = (y_pred + 1) // 2

# --- Confusion Matrix and Accuracy ---
# confusion_matrix puts true classes on rows and predicted classes on columns,
# ordered by label value (0 = malignant first, 1 = benign second).
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
cm = confusion_matrix(y_test_mapped, y_pred_mapped)

print("--- Model Evaluation Results (SVM From Scratch w/ Poly Features) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\n--- Confusion Matrix ---")
print("True Neg (Malignant) | False Pos (Benign)")
print("False Neg (Malignant)| True Pos (Benign)")
print(cm)

# --- ROC Curve and AUC ---
# Use the {0, 1} test labels together with the continuous decision scores
# (not the thresholded predictions) so the curve sweeps over all thresholds.
fpr, tpr, thresholds = roc_curve(y_test_mapped, y_scores)
roc_auc = auc(fpr, tpr)

print(f"\nAUC (Area Under Curve): {roc_auc:.4f}")

# --- Cell 6: Plot ROC Curve ---
# Draw the model's ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 6))

# Model curve, with the AUC shown in the legend entry.
ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'ROC curve (area = {roc_auc:.2f})',
)
# Chance-level reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Slight headroom on the y-axis so the curve is not clipped at TPR = 1.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

