In [5]:
import numpy as np
import time
import json
import os
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from datetime import datetime

# Directory prefix for the processed datasets. The trailing slash is required:
# dataset names are appended to it by plain string concatenation.
path = "../../../../data/multiclass/processed/"

# Dataset folder names to evaluate (looked up under `path`).
datasets = ["mnist_multi4_pca_4", "mnist_multi4_pca_8"]

# Training-set sizes to evaluate for each dataset.
sample_sizes = [100, 250, 400]

# Each configuration is repeated once per seed to capture timing variation.
seeds = [42, 100, 20, 5, 99]

In [6]:
# Resume from a previous run when a results file is already on disk;
# otherwise start from an empty results container.
results_path = "../../../../results/4-way_multi_classical_svm_baseline_results.json"

if not os.path.exists(results_path):
    # Fresh start: record how the experiment is configured alongside an
    # empty list that the experiment loop appends to.
    all_results = {
        "experiment_info": {
            "model_type": "classical_svm_rbf",
            "date": datetime.now().isoformat(),
            "hyperparameter_tuning": "GridSearchCV with C=[0.1, 1, 10, 100] and gamma=['scale', 'auto', 0.001, 0.01, 0.1]",
            "cv_folds": 5
        },
        "results": []
    }
    print("Created new results file")
else:
    # Reuse what earlier runs stored so finished experiments can be skipped.
    with open(results_path, 'r') as f:
        all_results = json.load(f)
    print(f"Loaded existing results with {len(all_results['results'])} entries")

Loaded existing results with 20 entries


In [7]:
# Run experiments
#
# For every (dataset, training-set size, seed) combination that has no record
# in all_results["results"] yet, tune an RBF-kernel SVC with a 5-fold
# GridSearchCV, evaluate it on the test split and append one result record.
# The results file is written in a `finally` clause so experiments that
# completed before an error or a keyboard interrupt are not lost.

# The hyperparameter grid is the same for every run, so build it once.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
}

try:
    for dataset in datasets:
        dataset_path = path + dataset

        # Load full training data
        X_train_full = np.load(dataset_path + "/X_train.npy")
        X_test = np.load(dataset_path + "/X_test.npy")
        y_train_full = np.load(dataset_path + "/y_train.npy")
        y_test = np.load(dataset_path + "/y_test.npy")

        print(f"\n{'='*70}")
        print(f"Dataset: {dataset}")
        print(f"Available training samples: {X_train_full.shape[0]}")
        print(f"{'='*70}")

        for n_samples in sample_sizes:
            # Seeds that already have a record for this (dataset, n_samples).
            # Records without a "seed" key contribute None, which never
            # matches a configured seed.
            done_seeds = {
                r.get("seed") for r in all_results["results"]
                if r.get("dataset") == dataset and r.get("n_train") == n_samples
            }

            # Skip only when *every configured seed* is present. Counting
            # records instead would wrongly skip when the file holds
            # duplicates or results produced with a different seed list.
            if seeds and set(seeds) <= done_seeds:
                print(f"\n Skipping {dataset} with {n_samples} samples (already complete)")
                continue

            # Skip if requesting more than available
            if n_samples > X_train_full.shape[0]:
                print(f"\n Skipping {n_samples} samples (only {X_train_full.shape[0]} available)")
                continue

            # Use full dataset or subsample
            if n_samples == X_train_full.shape[0]:
                X_train, y_train = X_train_full, y_train_full
                print(f"\n{'─'*70}")
                print(f"Training with FULL dataset ({n_samples} samples)")
                print(f"{'─'*70}")
            else:
                X_train, _, y_train, _ = train_test_split(
                    X_train_full, y_train_full,
                    train_size=n_samples,
                    random_state=42,  # Fixed for consistency across runs
                    stratify=y_train_full
                )
                print(f"\n{'─'*70}")
                print(f"Training with {n_samples} samples (subsampled)")
                print(f"{'─'*70}")

            # Run with different seeds
            for seed in seeds:
                # Resume support: a seed recorded earlier (in the file or in
                # this run) is not repeated.
                if seed in done_seeds:
                    print(f"  Seed {seed}: Already exists, skipping...")
                    continue

                # Hyperparameter tuning with GridSearchCV
                # NOTE(review): the subsample split above is fixed and, per the
                # scikit-learn docs, SVC's random_state is ignored unless
                # probability=True, so repeated seeds are expected to differ
                # only in timing -- confirm this is the intent.
                svc_cv = GridSearchCV(
                    SVC(kernel='rbf', random_state=seed),
                    param_grid,
                    cv=5,
                    n_jobs=-1,  # Use all CPU cores
                    verbose=0
                )

                # Train (the measured time covers the whole grid search,
                # including the final refit on the best parameters).
                # perf_counter is monotonic and high-resolution, which matters
                # for the sub-second durations measured here.
                start_time = time.perf_counter()
                svc_cv.fit(X_train, y_train)
                training_time = time.perf_counter() - start_time

                # Inference
                start_time = time.perf_counter()
                y_pred = svc_cv.predict(X_test)
                inference_time = time.perf_counter() - start_time

                # Metrics
                accuracy = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='macro')

                # gamma is either a number or one of the strings
                # 'scale'/'auto'; keep numbers numeric in the JSON output.
                best_gamma = svc_cv.best_params_['gamma']

                # Store result (plain Python types so json.dump can serialise it)
                result = {
                    "dataset": dataset,
                    "n_train": int(n_samples),
                    "n_test": int(X_test.shape[0]),
                    "n_features": int(X_train.shape[1]),
                    "seed": int(seed),
                    "best_C": float(svc_cv.best_params_['C']),
                    "best_gamma": float(best_gamma) if isinstance(best_gamma, (int, float)) else str(best_gamma),
                    "accuracy": float(accuracy),
                    "f1_score": float(f1),
                    "training_time_seconds": float(training_time),
                    "inference_time_seconds": float(inference_time),
                    "timestamp": datetime.now().isoformat()
                }

                all_results["results"].append(result)
                done_seeds.add(seed)

                print(f"  Seed {seed:3d}: Acc={accuracy:.4f}, F1={f1:.4f}, "
                      f"C={svc_cv.best_params_['C']}, γ={svc_cv.best_params_['gamma']}, "
                      f"Train={training_time:.2f}s")
finally:
    # Save results -- also reached when the loop above raises, so finished
    # experiments are persisted and the next run can resume from them.
    # The directory is derived from results_path so the two cannot drift apart.
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    with open(results_path, 'w') as f:
        json.dump(all_results, indent=2, fp=f)

print(f"\n{'='*70}")
print(f" Results saved to {results_path}")
print(f"Total experiments: {len(all_results['results'])}")
print(f"{'='*70}")


Dataset: mnist_multi4_pca_4
Available training samples: 400

 Skipping mnist_multi4_pca_4 with 100 samples (already complete)

 Skipping mnist_multi4_pca_4 with 250 samples (already complete)

──────────────────────────────────────────────────────────────────────
Training with FULL dataset (400 samples)
──────────────────────────────────────────────────────────────────────
  Seed  42: Acc=0.8900, F1=0.8892, C=1, γ=scale, Train=0.06s
  Seed 100: Acc=0.8900, F1=0.8892, C=1, γ=scale, Train=0.05s
  Seed  20: Acc=0.8900, F1=0.8892, C=1, γ=scale, Train=0.05s
  Seed   5: Acc=0.8900, F1=0.8892, C=1, γ=scale, Train=0.07s
  Seed  99: Acc=0.8900, F1=0.8892, C=1, γ=scale, Train=0.05s

Dataset: mnist_multi4_pca_8
Available training samples: 400

 Skipping mnist_multi4_pca_8 with 100 samples (already complete)

 Skipping mnist_multi4_pca_8 with 250 samples (already complete)

──────────────────────────────────────────────────────────────────────
Training with FULL dataset (400 samples)
────────────