In [2]:
import os

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def load_libsvm(filename, n_features):
    """Parse a LIBSVM-style text file into a dense feature matrix and labels.

    Every non-blank line is read as ``<label> <index>:<value> ...``. Indices
    are 1-based; features that a line does not mention stay at 0.

    Args:
        filename: Path of the text file to read.
        n_features: Number of columns of the dense feature matrix.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape
        ``(n_samples, n_features)`` and ``y`` an integer array of shape
        ``(n_samples,)``. A file without any data line yields arrays of shape
        ``(0, n_features)`` and ``(0,)``.

    Raises:
        IndexError: If a feature index lies outside ``1..n_features``.
        ValueError: If a label, index or value token cannot be parsed, or a
            feature token is not of the form ``index:value``.
    """
    rows, labels = [], []
    with open(filename) as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                continue
            # Labels such as "1.0" or "-1" are accepted and stored as ints.
            labels.append(int(float(parts[0])))
            features = np.zeros(n_features)
            for feat in parts[1:]:
                idx, val = feat.split(":")
                col = int(idx) - 1
                # Without this check an index <= 0 would become a negative
                # NumPy index and silently overwrite a column counted from
                # the end of the row.
                if not 0 <= col < n_features:
                    raise IndexError(
                        f"{filename}, line {line_no}: feature index {idx} "
                        f"is outside 1..{n_features}"
                    )
                features[col] = float(val)
            rows.append(features)

    if not rows:
        # np.array([]) would have shape (0,) and float dtype for the labels;
        # return correctly shaped and typed empty arrays instead.
        return np.empty((0, n_features)), np.empty(0, dtype=int)
    return np.array(rows), np.array(labels)


def run_svm_experiment(data_path, train_idx_path, test_idx_path, n_features, kernel_type):
    """Select C for an SVM by 5-fold cross-validation and report the test error.

    The data set is loaded with ``load_libsvm`` and split into train/test
    rows using the two index files. For every candidate C the function prints
    the training error (model fit on the whole training split) and the mean
    5-fold validation error, picks the C with the lowest validation error
    (the smallest C wins ties), refits on the training split and scores the
    test split.

    Feature standardization is part of the model pipeline, so during
    cross-validation the scaler is re-fit on the training portion of each
    fold only. Fitting the scaler on the full training split beforehand would
    leak validation-fold statistics into the validation error.

    Args:
        data_path: LIBSVM-format data file.
        train_idx_path: Comma-delimited text file of integer training row indices.
        test_idx_path: Comma-delimited text file of integer test row indices.
        n_features: Number of feature columns in the data set.
        kernel_type: Kernel name passed to ``SVC`` (e.g. "linear", "poly", "rbf").

    Returns:
        Tuple ``(best_C, test_error)``.
    """
    print(f"\n=== Running {kernel_type.upper()} SVM ===")

    # Load dataset
    X, y = load_libsvm(data_path, n_features)
    # reshape(-1) keeps the index arrays 1-D even if a file holds a single
    # index (loadtxt returns a 0-D array) or spreads indices over several rows.
    # NOTE(review): indices are used directly as NumPy row positions, i.e.
    # treated as 0-based; confirm the index files are not 1-based.
    train_indices = np.loadtxt(train_idx_path, delimiter=",", dtype=int).reshape(-1)
    test_indices = np.loadtxt(test_idx_path, delimiter=",", dtype=int).reshape(-1)
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    def build_model(C):
        # Scaling lives inside the estimator so every fit (CV fold or full
        # training split) learns its own mean/variance from the data it sees.
        return make_pipeline(StandardScaler(), SVC(C=C, kernel=kernel_type))

    # Candidate C values
    C_values = [0.1, 1, 10, 100, 1000]
    best_C = None
    best_val_error = float("inf")
    cv_results = []

    # Cross-validation loop
    for C in C_values:
        model = build_model(C)
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
        val_error = 1 - scores.mean()

        # Train on full training set to report the resubstitution error
        model.fit(X_train, y_train)
        train_error = 1 - model.score(X_train, y_train)
        cv_results.append((C, train_error, val_error))

        # Strict '<' keeps the first (smallest) C when validation errors tie.
        if val_error < best_val_error:
            best_val_error = val_error
            best_C = C

    # Report CV results
    print("5-fold cross-validation errors (C, TrainErr, ValErr):")
    for c, tr, val in cv_results:
        print(f"C={c}, TrainErr={tr:.4f}, ValErr={val:.4f}")
    print(f"Best C value: {best_C}")

    # Evaluate on test set: scaler and SVC are fit on the training split only,
    # then applied unchanged to the held-out rows.
    final_model = build_model(best_C)
    final_model.fit(X_train, y_train)
    test_accuracy = final_model.score(X_test, y_test)
    test_error = 1 - test_accuracy

    print(f"Test accuracy: {test_accuracy:.4f}")
    print(f"Test error rate: {test_error:.4f}")
    return best_C, test_error


# Directory holding the LIBSVM data files and the train/test index files.
# Defaults to the original location; set the SVM_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
base = os.environ.get("SVM_DATA_DIR", r"C:\Users\rishi\Downloads")

# (banner printed before the runs, file-name prefix, number of features)
DATASETS = [
    ("=== BREAST CANCER DATASET ===", "breast-cancer", 30),
    ("\n=== SONAR DATASET ===", "sonar", 60),
]

# One (dataset prefix, kernel, best C, test error) record per experiment, kept
# so the numbers can be inspected or tabulated without re-reading the output.
svm_results = []

for banner, prefix, n_features in DATASETS:
    print(banner)
    for kernel in ["linear", "poly", "rbf"]:
        best_C, test_error = run_svm_experiment(
            # os.path.join uses the platform's separator, so a non-Windows
            # SVM_DATA_DIR works too; with the default base the resulting
            # paths on Windows are the same as before.
            os.path.join(base, f"{prefix}_scale"),
            os.path.join(base, f"{prefix}_train_indices.txt"),
            os.path.join(base, f"{prefix}_test_indices.txt"),
            n_features=n_features,
            kernel_type=kernel,
        )
        svm_results.append((prefix, kernel, best_C, test_error))


=== BREAST CANCER DATASET ===

=== Running LINEAR SVM ===
5-fold cross-validation errors (C, TrainErr, ValErr):
C=0.1, TrainErr=0.0275, ValErr=0.0311
C=1, TrainErr=0.0275, ValErr=0.0402
C=10, TrainErr=0.0275, ValErr=0.0439
C=100, TrainErr=0.0275, ValErr=0.0457
C=1000, TrainErr=0.0275, ValErr=0.0457
Best C value: 0.1
Test accuracy: 0.9708
Test error rate: 0.0292

=== Running POLY SVM ===
5-fold cross-validation errors (C, TrainErr, ValErr):
C=0.1, TrainErr=0.0641, ValErr=0.0842
C=1, TrainErr=0.0330, ValErr=0.0494
C=10, TrainErr=0.0128, ValErr=0.0512
C=100, TrainErr=0.0055, ValErr=0.0604
C=1000, TrainErr=0.0018, ValErr=0.0586
Best C value: 1
Test accuracy: 0.9124
Test error rate: 0.0876

=== Running RBF SVM ===
5-fold cross-validation errors (C, TrainErr, ValErr):
C=0.1, TrainErr=0.0348, ValErr=0.0348
C=1, TrainErr=0.0201, ValErr=0.0348
C=10, TrainErr=0.0073, ValErr=0.0531
C=100, TrainErr=0.0000, ValErr=0.0641
C=1000, TrainErr=0.0000, ValErr=0.0641
Best C value: 0.1
Test accuracy: 0.9562

In [3]:
# ----- Comparison table: Logistic Regression vs. linear, polynomial and RBF kernel SVMs -----
# All figures below are entered by hand as literals; nothing in this cell
# recomputes them. Each entry is (model, best C, test accuracy, test error).

import pandas as pd

columns = ["Dataset", "Model", "Best C", "Test Accuracy", "Test Error"]

results_by_dataset = {
    "Breast Cancer": [
        ("Logistic Regression", 0.1, 0.9708, 0.0292),
        ("Linear SVM", 0.1, 0.9708, 0.0292),
        ("Polynomial Kernel SVM", 1, 0.9124, 0.0876),
        ("RBF Kernel SVM", 0.1, 0.9562, 0.0438),
    ],
    "Sonar": [
        ("Logistic Regression", 0.1, 0.8333, 0.1667),
        ("Linear SVM", 10, 0.8333, 0.1667),
        ("Polynomial Kernel SVM", 100, 0.9048, 0.0952),
        ("RBF Kernel SVM", 1, 0.8810, 0.1190),
    ],
}

# Flatten to one row per (dataset, model), datasets in insertion order.
data = [
    [dataset_name, *entry]
    for dataset_name, entries in results_by_dataset.items()
    for entry in entries
]

df = pd.DataFrame(data, columns=columns)
print(df.to_string(index=False))


      Dataset                 Model  Best C  Test Accuracy  Test Error
Breast Cancer   Logistic Regression     0.1         0.9708      0.0292
Breast Cancer            Linear SVM     0.1         0.9708      0.0292
Breast Cancer Polynomial Kernel SVM     1.0         0.9124      0.0876
Breast Cancer        RBF Kernel SVM     0.1         0.9562      0.0438
        Sonar   Logistic Regression     0.1         0.8333      0.1667
        Sonar            Linear SVM    10.0         0.8333      0.1667
        Sonar Polynomial Kernel SVM   100.0         0.9048      0.0952
        Sonar        RBF Kernel SVM     1.0         0.8810      0.1190
