In [None]:
import numpy as np
import time
import json
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from datetime import datetime

# Directory prefix that the dataset folder names below are appended to.
path = "../../data/processed/"

# Dataset folders follow the pattern mnist_<pair>_pca_<dim>
# (presumably the MNIST digit pair and the number of PCA components -- confirm
# against the preprocessing step).
datasets = [
    f"mnist_{digit_pair}_pca_{n_components}"
    for digit_pair in ("01", "38")
    for n_components in (4, 8)
]

# One experiment record is produced per seed, giving repeated timing measurements.
seeds = [42, 100, 20, 5, 99]

# Training-set sizes to evaluate for every dataset.
sample_sizes = [500, 2000, 4000]

In [None]:
# Resume from an earlier results file when one is present; otherwise start an
# in-memory log with the experiment metadata and an empty result list.
results_path = "../../results/classical_knn_baseline_results.json"

if not os.path.exists(results_path):
    # Fresh start: nothing is written to disk here, only the structure is built.
    all_results = {
        "experiment_info": dict(
            model_type="classical_knn",
            date=datetime.now().isoformat(),
            hyperparameter_tuning="GridSearchCV with k=[3, 5, 7, 9, 11, 15]",
            cv_folds=5,
        ),
        "results": [],
    }
    print("Created new results file")
else:
    with open(results_path, 'r') as results_file:
        all_results = json.load(results_file)
    n_loaded = len(all_results['results'])
    print(f"Loaded existing results with {n_loaded} entries")

def _save_results(results, destination):
    """Persist ``results`` as indented JSON at ``destination``.

    The JSON is written to a sibling ``.tmp`` file first and then moved over
    ``destination`` with ``os.replace``, so an interrupted write leaves the
    previous results file intact instead of truncating it.

    Args:
        results: JSON-serialisable object to store.
        destination: Path of the results file. Its parent directory is
            created when missing.
    """
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    tmp_path = destination + ".tmp"
    with open(tmp_path, 'w') as tmp_file:
        json.dump(results, tmp_file, indent=2)
    os.replace(tmp_path, destination)


# Candidate neighbourhood sizes; identical for every run, so built once.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 15]}

for dataset in datasets:
    dataset_path = path + dataset

    X_train_full = np.load(dataset_path + "/X_train.npy")
    X_test = np.load(dataset_path + "/X_test.npy")
    y_train_full = np.load(dataset_path + "/y_train.npy")
    y_test = np.load(dataset_path + "/y_test.npy")

    print(f"\n{'='*70}")
    print(f"Dataset: {dataset}")
    print(f"Available training samples: {X_train_full.shape[0]}")
    print(f"{'='*70}")

    for n_samples in sample_sizes:
        # Seeds already recorded for this (dataset, n_train) pair. Comparing
        # seeds rather than counting records means leftovers from a different
        # seed list, or duplicate records, cannot hide a missing run.
        recorded_seeds = {
            r.get("seed") for r in all_results["results"]
            if r["dataset"] == dataset and r["n_train"] == n_samples
        }

        if all(seed in recorded_seeds for seed in seeds):
            print(f"\nSkipping {dataset} with {n_samples} samples (already complete)")
            continue

        if n_samples > X_train_full.shape[0]:
            print(f"\nSkipping {n_samples} samples (only {X_train_full.shape[0]} available)")
            continue

        if n_samples == X_train_full.shape[0]:
            # train_test_split cannot return the entire set as "train", so use it directly.
            X_train, y_train = X_train_full, y_train_full
            print(f"\n{'─'*70}")
            print(f"Training with FULL dataset ({n_samples} samples)")
            print(f"{'─'*70}")
        else:
            # Stratified subsample with a fixed random_state: every seed below
            # trains on the same subset.
            X_train, _, y_train, _ = train_test_split(
                X_train_full, y_train_full,
                train_size=n_samples,
                random_state=42,
                stratify=y_train_full
            )
            print(f"\n{'─'*70}")
            print(f"Training with {n_samples} samples (subsampled)")
            print(f"{'─'*70}")

        # NOTE: `seed` only labels the repetition. It is not passed to the
        # split above or to the grid search, so repeats are expected to differ
        # only in their timings.
        for seed in seeds:
            if seed in recorded_seeds:
                print(f"  Seed {seed}: Already exists, skipping...")
                continue

            # k-NN with GridSearchCV
            knn_cv = GridSearchCV(
                KNeighborsClassifier(),
                param_grid,
                cv=5,
                n_jobs=-1,
                verbose=0
            )

            # fit() runs the whole cross-validated grid search and then refits
            # the best k on X_train, so this measures tuning cost, not merely
            # k-NN storing its training data. perf_counter() is a monotonic,
            # high-resolution clock intended for measuring durations.
            start_time = time.perf_counter()
            knn_cv.fit(X_train, y_train)
            training_time = time.perf_counter() - start_time

            # Inference (this is where k-NN does its neighbour search)
            start_time = time.perf_counter()
            y_pred = knn_cv.predict(X_test)
            inference_time = time.perf_counter() - start_time

            # Metrics
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='macro')

            result = {
                "dataset": dataset,
                "n_train": int(n_samples),
                "n_test": int(X_test.shape[0]),
                "n_features": int(X_train.shape[1]),
                "seed": int(seed),
                "best_k": int(knn_cv.best_params_['n_neighbors']),
                "accuracy": float(accuracy),
                "f1_score": float(f1),
                "training_time_seconds": float(training_time),
                "inference_time_seconds": float(inference_time),
                "timestamp": datetime.now().isoformat()
            }

            all_results["results"].append(result)
            recorded_seeds.add(seed)

            # Checkpoint after every run so an interrupted sweep can resume
            # from the file instead of losing everything computed so far.
            _save_results(all_results, results_path)

            print(f"  Seed {seed:3d}: Acc={accuracy:.4f}, F1={f1:.4f}, "
                  f"k={knn_cv.best_params_['n_neighbors']}, "
                  f"Train={training_time:.3f}s, Infer={inference_time:.3f}s")

# Final write: guarantees the file exists even when no new run was executed.
_save_results(all_results, results_path)

print(f"\n{'='*70}")
print(f"Results saved to {results_path}")
print(f"Total experiments: {len(all_results['results'])}")
print(f"{'='*70}")