In [1]:
pip install scikit-learn joblib pandas numpy




In [2]:
#!/usr/bin/env python3
"""
diabetes_pso_svm.py

PSO-based hyperparameter tuning for SVM on a diabetes proxy classification task.
Uses sklearn.load_diabetes (continuous) and thresholds at median -> binary label.
PSO optimizes log10(C) in [-3,3] and log10(gamma) in [-4,1] to maximize CV accuracy.
"""

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import time
import random

# Optional: adjust to reduce runtime for quick tests
# SWARM_SIZE is the number of particles and PSO_ITERS the number of PSO
# iterations used by run_pso_and_train(); every fitness evaluation runs a
# 3-fold cross-validation, so runtime grows with SWARM_SIZE * PSO_ITERS.
SWARM_SIZE = 20
PSO_ITERS = 30
# Single seed shared by the global RNGs, the train/test split, the CV
# splitter, the SVC models and the PSO default seed.
RND = 42

# Seed the global NumPy and stdlib generators for reproducibility.
np.random.seed(RND)
random.seed(RND)

# -------------------- Prepare dataset --------------------
# load_diabetes() has a continuous target; it is binarized at its median
# below so that it can be used as a classification label.
data = load_diabetes()
X = data.data
y_reg = data.target
threshold = np.median(y_reg)
y = (y_reg > threshold).astype(int)  # binary proxy: 1 if strictly above the median, else 0

# train/test split: 80/20, stratified on the binary label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RND, stratify=y)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so test statistics never influence scaling.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# -------------------- PSO implementation --------------------
class PSO:
    """Global-best particle swarm optimizer that minimizes ``fitness_func``.

    Each particle is a point inside a ``dim``-dimensional box. Every
    iteration evaluates all particles, updates each particle's personal best
    and the swarm-wide best, and then moves the particles with

        vel = w * vel + c1 * r1 * (pbest - pos) + c2 * r2 * (gbest - pos)

    where ``r1`` and ``r2`` are uniform random numbers in [0, 1) drawn
    independently per particle and per dimension. Positions are clipped back
    into the box after every move; velocities are not clamped.

    Args:
        n_particles: Number of particles in the swarm.
        iters: Number of iterations performed by :meth:`optimize`.
        fitness_func: Callable taking one position (1-D array of length
            ``dim``) and returning a scalar to minimize.
        dim: Dimensionality of the search space.
        bounds: Sequence of ``dim`` ``(low, high)`` pairs. ``None`` selects
            the default box ``[(-3, 3), (-4, 1)]``.
        w: Weight applied to the previous velocity.
        c1: Weight of the pull towards each particle's personal best.
        c2: Weight of the pull towards the swarm-wide best.
        seed: Seed passed to ``np.random.RandomState`` for the swarm's
            private random generator.

    Raises:
        ValueError: If ``bounds`` does not hold exactly one ``(low, high)``
            pair per dimension, or if any ``low`` exceeds its ``high``.
    """

    def __init__(self, n_particles, iters, fitness_func, dim=2,
                 bounds=None, w=0.729, c1=1.49445, c2=1.49445, seed=RND):
        if bounds is None:
            # Built per call instead of living in the signature, so the
            # default is never a shared mutable object.
            bounds = [(-3, 3), (-4, 1)]
        bounds = np.array(bounds, dtype=float)
        if bounds.shape != (dim, 2):
            raise ValueError(
                "bounds must contain one (low, high) pair per dimension; "
                f"expected shape ({dim}, 2), got {bounds.shape}")
        if np.any(bounds[:, 0] > bounds[:, 1]):
            raise ValueError("each bound must satisfy low <= high")

        self.n_particles = n_particles
        self.iters = iters
        self.fitness_func = fitness_func
        self.dim = dim
        self.bounds = bounds
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.rng = np.random.RandomState(seed)
        # Particles start uniformly distributed inside the box, at rest.
        self.pos = self.rng.uniform(self.bounds[:, 0], self.bounds[:, 1], (n_particles, dim))
        self.vel = np.zeros_like(self.pos)
        self.pbest_pos = self.pos.copy()
        self.pbest_val = np.full(n_particles, np.inf)
        self.gbest_pos = None
        self.gbest_val = np.inf

    def _clip_pos(self):
        """Clamp every particle position into the search box, in place."""
        # The (dim,) lower/upper vectors broadcast across all particles.
        np.clip(self.pos, self.bounds[:, 0], self.bounds[:, 1], out=self.pos)

    def optimize(self, verbose=True):
        """Run the swarm for ``self.iters`` iterations.

        Args:
            verbose: When true, print the best fitness after the first
                iteration and then roughly every tenth of the run.

        Returns:
            Tuple ``(gbest_pos, gbest_val, history)``: the best position
            found, its fitness, and a list holding the best fitness known
            after each iteration. With ``iters == 0`` nothing is evaluated
            and the result is ``(None, inf, [])``.

        Raises:
            RuntimeError: If, after evaluating the swarm, no particle has
                produced a fitness below ``inf`` (for example when the
                fitness function only returns NaN or ``inf``), because the
                swarm then has no global best to move towards.
        """
        history = []
        report_every = max(1, self.iters // 10)
        for it in range(self.iters):
            for i in range(self.n_particles):
                val = self.fitness_func(self.pos[i])
                # NaN compares false against everything, so a NaN fitness
                # never replaces a personal or global best.
                if val < self.pbest_val[i]:
                    self.pbest_val[i] = val
                    self.pbest_pos[i] = self.pos[i]
                if val < self.gbest_val:
                    self.gbest_val = val
                    # Copy: self.pos is updated in place further down.
                    self.gbest_pos = self.pos[i].copy()
            if self.gbest_pos is None:
                raise RuntimeError(
                    "PSO could not establish a global best: fitness_func "
                    "returned no value below infinity for any particle")
            r1 = self.rng.rand(self.n_particles, self.dim)
            r2 = self.rng.rand(self.n_particles, self.dim)
            cognitive = self.c1 * r1 * (self.pbest_pos - self.pos)
            social = self.c2 * r2 * (self.gbest_pos - self.pos)
            self.vel = self.w * self.vel + cognitive + social
            self.pos += self.vel
            self._clip_pos()
            history.append(self.gbest_val)
            if verbose and ((it + 1) % report_every == 0 or it == 0):
                print(f"Iteration {it+1}/{self.iters} - best fitness (1-accuracy): {self.gbest_val:.4f}")
        return self.gbest_pos, self.gbest_val, history

# -------------------- Fitness function --------------------
def fitness_from_particle(particle):
    """Score one PSO particle as ``1 - mean CV accuracy`` of an RBF SVM.

    Args:
        particle: Pair ``(log10_C, log10_gamma)``; both entries are mapped
            back to linear scale as powers of ten before building the model.

    Returns:
        One minus the mean accuracy over a shuffled, stratified 3-fold
        cross-validation on the scaled training data (lower is better).
    """
    exponent_c, exponent_gamma = particle
    # Same seeded splitter on every call, so all particles see identical folds.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=RND)
    classifier = SVC(
        kernel='rbf',
        C=10 ** exponent_c,
        gamma=10 ** exponent_gamma,
        random_state=RND,
    )
    # n_jobs=-1 asks for all available cores when evaluating the folds.
    fold_accuracy = cross_val_score(
        classifier, X_train_s, y_train, cv=folds, scoring='accuracy', n_jobs=-1)
    # PSO minimizes, so turn accuracy into an error rate.
    return 1.0 - fold_accuracy.mean()

# -------------------- Run PSO --------------------
def run_pso_and_train():
    """Tune an RBF SVM with PSO, train the final model and evaluate it.

    Steps:
        1. Run PSO over ``(log10_C, log10_gamma)`` with ``SWARM_SIZE``
           particles for ``PSO_ITERS`` iterations, minimizing
           ``fitness_from_particle``.
        2. Fit an ``SVC`` (``probability=True``) with the best
           hyperparameters on the scaled training split.
        3. Print accuracy, confusion matrix and classification report for
           the scaled test split.
        4. Best-effort save of the model and the scaler with joblib into the
           current directory; a failure is printed, not raised.

    Returns:
        dict with keys ``'best_C'``, ``'best_gamma'``, ``'cv_accuracy'``,
        ``'test_accuracy'`` and ``'history'`` (best fitness after each PSO
        iteration). Numeric values are plain Python floats.

    Raises:
        RuntimeError: If PSO returns no best position (for example when
            ``PSO_ITERS`` is 0), since there are no hyperparameters to
            train with.
    """
    print("Starting PSO optimization...")
    pso = PSO(n_particles=SWARM_SIZE, iters=PSO_ITERS, fitness_func=fitness_from_particle)
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    best_pos, best_val, history = pso.optimize()
    elapsed = time.perf_counter() - start
    if best_pos is None:
        raise RuntimeError(
            "PSO returned no best position; cannot derive SVM hyperparameters")

    best_log10_C, best_log10_gamma = best_pos
    # Convert NumPy scalars to built-in floats so the returned summary prints
    # as plain numbers regardless of the installed NumPy version.
    best_C = float(10 ** best_log10_C)
    best_gamma = float(10 ** best_log10_gamma)
    best_val = float(best_val)
    print("\nPSO finished.")
    print(f"Time taken: {elapsed:.2f} seconds")
    print(f"Best (log10_C, log10_gamma): ({best_log10_C:.4f}, {best_log10_gamma:.4f})")
    print(f"Best hyperparameters: C = {best_C:.6f}, gamma = {best_gamma:.6f}")
    print(f"Best fitness (1 - CV accuracy): {best_val:.4f} -> CV accuracy: {1-best_val:.4f}")

    # Train final model
    final_model = SVC(kernel='rbf', C=best_C, gamma=best_gamma, probability=True, random_state=RND)
    final_model.fit(X_train_s, y_train)
    y_pred = final_model.predict(X_test_s)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)
    print("\nFinal model evaluation on test set:")
    print(f"Test accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(report)

    # Save model and scaler. Persistence is deliberately best-effort: a
    # missing joblib or an unwritable directory must not discard the results.
    try:
        import joblib
        joblib.dump(final_model, 'svm_diabetes_pso_model.joblib')
        joblib.dump(scaler, 'scaler.joblib')
        print("\nSaved model to 'svm_diabetes_pso_model.joblib' and 'scaler.joblib' in current directory.")
    except Exception as e:
        print("Could not save model/scaler:", e)

    return {
        'best_C': best_C,
        'best_gamma': best_gamma,
        'cv_accuracy': 1 - best_val,
        'test_accuracy': float(acc),
        'history': [float(v) for v in history],
    }

# Entry point: run the full tune / train / evaluate pipeline and print the
# returned summary dict. `summary` stays bound at module level afterwards.
if __name__ == "__main__":
    summary = run_pso_and_train()
    print("\nSummary:", summary)


Starting PSO optimization...
Iteration 1/30 - best fitness (1-accuracy): 0.2576
Iteration 3/30 - best fitness (1-accuracy): 0.2548
Iteration 6/30 - best fitness (1-accuracy): 0.2548
Iteration 9/30 - best fitness (1-accuracy): 0.2548
Iteration 12/30 - best fitness (1-accuracy): 0.2548
Iteration 15/30 - best fitness (1-accuracy): 0.2548
Iteration 18/30 - best fitness (1-accuracy): 0.2548
Iteration 21/30 - best fitness (1-accuracy): 0.2548
Iteration 24/30 - best fitness (1-accuracy): 0.2548
Iteration 27/30 - best fitness (1-accuracy): 0.2548
Iteration 30/30 - best fitness (1-accuracy): 0.2548

PSO finished.
Time taken: 40.87 seconds
Best (log10_C, log10_gamma): (0.6068, -3.2967)
Best hyperparameters: C = 4.043574, gamma = 0.000505
Best fitness (1 - CV accuracy): 0.2548 -> CV accuracy: 0.7452

Final model evaluation on test set:
Test accuracy: 0.7640
Confusion Matrix:
[[32 13]
 [ 8 36]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8000  