# 1 Classification of Iris Dataset

### David Nicolay 26296918

In [2]:
# imports
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim

# model result saving
import pickle
import os
from datetime import datetime


# import my utils
import sys
sys.path.append('../')
from src import data_utils, models

# Import the evaluation utilities
from src.active_learning import ModelEvaluator, compare_learning_strategies, ActiveLearningEvaluator
from src.metrics import compute_computational_savings

## Data pre-processing

In [3]:
# Load the Iris dataset from a path relative to the working directory and preview the first rows
df = pd.read_csv("../datasets/iris.csv")
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# check for missing values: per-column count of null entries
df.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [5]:
# (rows, columns) of the raw frame, before any column is dropped or encoded
df.shape

(150, 6)

### 1. Drop ID column

In [6]:
# "Id" is a row identifier, not a measurement, so it must not be fed to the model.
# errors="ignore" keeps this cell idempotent: re-running it after "Id" has already
# been removed is a no-op instead of a KeyError.
df = df.drop(columns=["Id"], errors="ignore")

### 2. One-hot encode target variable

In [7]:
# one hot encode target variable
# Replaces the single "Species" column with one indicator column per class (Species_<label>).
df = pd.get_dummies(df, columns=["Species"])
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False


### 3. Scale features to [-1, 1]

In [8]:
# Exploratory check: list the names exposed by the local data_utils module to see which scaling helpers exist
print(dir(data_utils))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'scale_min_max', 'scale_z_score']


In [9]:
# scale features to [-1, 1]
# Relies on column order: the one-hot target columns were appended last by get_dummies.
feature_cols = df.columns[:-3]  # all columns except the last 3 (one-hot encoded target)

# NOTE(review): the scaler is applied to every row here, i.e. before any train/test split
# is made — presumably its min/max therefore include test rows; confirm this is acceptable.
df[feature_cols] = data_utils.scale_min_max(df[feature_cols], min_val=-1, max_val=1)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
0,-0.555556,0.25,-0.864407,-0.916667,True,False,False
1,-0.666667,-0.166667,-0.864407,-0.916667,True,False,False
2,-0.777778,0.0,-0.898305,-0.916667,True,False,False
3,-0.833333,-0.083333,-0.830508,-0.916667,True,False,False
4,-0.611111,0.333333,-0.864407,-0.916667,True,False,False


### 4. Prepare X and Y matrices and tensors


In [10]:
# Split the frame into a feature matrix and a one-hot target matrix
target_cols = ["Species_Iris-setosa", "Species_Iris-versicolor", "Species_Iris-virginica"]
X = df[feature_cols].values
y_raw = df[target_cols].values

# Soften the one-hot targets for MSE training: 0 -> 0.1 and 1 -> 0.9
y = 0.1 + 0.8 * y_raw

# Wrap both arrays as float32 torch tensors for the evaluators below
X_tensor, y_tensor = torch.FloatTensor(X), torch.FloatTensor(y)

In [11]:
# Render every figure below in Times New Roman at 14 pt
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 14,
})


In [12]:


results_dir = "../results/iris"
best_params_filename = os.path.join(results_dir, "best_hyperparameters.pkl")

# Deal with the missing-file case first so the load path below reads straight through.
if not os.path.exists(best_params_filename):
    print(f"Best parameters file not found at: {best_params_filename}")
    print("Make sure to run the hyperparameter search first!")
else:
    # NOTE: unpickling can execute arbitrary code; only load files written by this project.
    with open(best_params_filename, 'rb') as params_file:
        loaded_best_params = pickle.load(params_file)

    print("Successfully loaded best parameters:")
    print(f"Best parameters: {loaded_best_params}")

    # When best_params already exists in this session, report whether it agrees with the file
    if 'best_params' in locals():
        print(f"Parameters match current session: {best_params == loaded_best_params}")

    # The values from disk take precedence, so this cell also works when run on its own
    best_params = loaded_best_params

Successfully loaded best parameters:
Best parameters: {'hidden_size': 64, 'learning_rate': 0.5, 'weight_decay': 0.0, 'momentum': 0.9, 'train_acc': 0.975, 'test_acc': 1.0}


#### TESTING: Debugging SASLA Implementation

In [13]:

# Evaluator for the output-sensitivity strategy (convergence threshold of 0.95)
os_evaluator2 = ActiveLearningEvaluator(convergence_threshold=0.95)

# SASLA-specific parameters (from the paper)
sasla_kwargs = {
    "alpha": 0.9,             # Selection constant (conservative approach)
    "selection_interval": 1,  # Apply selection every epoch
}

# Single-trial, no-CV debug run of the SASLA strategy
print("Running Output Sensitivity (SASLA) evaluation...")
print("This may take some time due to derivative calculations...")
os_results2 = os_evaluator2.evaluate_active_learning(
    X=X_tensor,
    y=y_tensor,
    best_params=best_params,
    strategy='output_sensitivity',
    n_trials=1,
    use_cv=False,
    cv_folds=1,
    epochs=1000,
    random_state=12,
    **sasla_kwargs,
)

Running Output Sensitivity (SASLA) evaluation...
This may take some time due to derivative calculations...
Starting 1 trial evaluation for output_sensitivity active learning...
Parameters: {'hidden_size': 64, 'learning_rate': 0.5, 'weight_decay': 0.0, 'momentum': 0.9, 'train_acc': 0.975, 'test_acc': 1.0}
Cross-validation: No (1 folds)
Trial 1/1


  scale=stats.sem(train_acc_array))
  scale=stats.sem(test_acc_array))


In [17]:
# Inspect the training-set sizes the SASLA debug evaluator recorded, keyed by epoch
os_evaluator2.training_set_sizes_by_epoch

{25: [64], 100: [36], 200: [36], 300: [34], 500: [34], 1000: [34]}

In [18]:
# Inspect the computation times the metrics tracker stored for the SASLA debug run (presumably seconds, one entry per trial)
os_evaluator2.metrics_tracker.computation_times

[155.33685183525085]

### Active learning with uncertainty sampling



In [11]:
# Uncertainty sampling with enhanced training set size tracking
# Banner: rule, title, rule, then one blank line
print("=" * 80, "RUNNING UNCERTAINTY SAMPLING WITH ENHANCED TRACKING", "=" * 80, "", sep="\n")

# Fresh evaluator so this run's metrics are not mixed with any other strategy
us_evaluator_enhanced = ActiveLearningEvaluator(convergence_threshold=0.95)

# Settings for the 50-trial, 5-fold cross-validated run using entropy-based uncertainty
us_run_config = {
    "strategy": 'uncertainty_sampling',
    "n_trials": 50,
    "use_cv": True,
    "cv_folds": 5,
    "epochs": 1000,
    "random_state": 12,
    "uncertainty_method": 'entropy',
}

print("Running Enhanced Uncertainty Sampling Active Learning evaluation...")
us_results_enhanced = us_evaluator_enhanced.evaluate_active_learning(
    X=X_tensor,
    y=y_tensor,
    best_params=best_params,
    **us_run_config,
)

print("\nEvaluation completed!")

RUNNING UNCERTAINTY SAMPLING WITH ENHANCED TRACKING

Running Enhanced Uncertainty Sampling Active Learning evaluation...
Starting 50 trial evaluation for uncertainty_sampling active learning...
Parameters: {'hidden_size': 64, 'learning_rate': 0.5, 'weight_decay': 0.0, 'momentum': 0.9, 'train_acc': 0.975, 'test_acc': 1.0}
Cross-validation: Yes (5 folds)
Trial 1/50
Trial 2/50
Trial 2/50
Trial 3/50
Trial 3/50
Trial 4/50
Trial 4/50
Trial 5/50
Trial 5/50
Trial 6/50
Trial 6/50
Trial 7/50
Trial 7/50
Trial 8/50
Trial 8/50
Trial 9/50
Trial 9/50
Trial 10/50
Trial 10/50
Trial 11/50
Trial 11/50
Trial 12/50
Trial 12/50
Trial 13/50
Trial 13/50
Trial 14/50
Trial 14/50
Trial 15/50
Trial 15/50
Trial 16/50
Trial 16/50
Trial 17/50
Trial 17/50
Trial 18/50
Trial 18/50
Trial 19/50
Trial 19/50
Trial 20/50
Trial 20/50
Trial 21/50
Trial 21/50
Trial 22/50
Trial 22/50
Trial 23/50
Trial 23/50
Trial 24/50
Trial 24/50
Trial 25/50
Trial 25/50
Trial 26/50
Trial 26/50
Trial 27/50
Trial 27/50
Trial 28/50
Trial 28/50
Tr

In [12]:
# Evaluator report for the uncertainty-sampling run, followed by training-set size statistics
print("\n" + "=" * 80, "ENHANCED UNCERTAINTY SAMPLING RESULTS", "=" * 80, sep="\n")

us_evaluator_enhanced.print_report()

print("\nTRAINING SET SIZE ANALYSIS")
print("=" * 50)

# Final training set sizes (printed only when the results carry them)
if 'final_training_set_size_mean' in us_results_enhanced:
    original_size = us_results_enhanced['original_training_set_size_mean']
    print(f"Original Training Set Size: {original_size:.0f}")
    final_mean = us_results_enhanced['final_training_set_size_mean']
    final_std = us_results_enhanced['final_training_set_size_std']
    print(f"Final Training Set Size: {final_mean:.1f} ± {final_std:.1f}")
    reduction_mean = us_results_enhanced['training_set_reduction_mean']
    reduction_std = us_results_enhanced['training_set_reduction_std']
    print(f"Training Set Reduction: {reduction_mean:.1f}% ± {reduction_std:.1f}%")

# Training set sizes at specific epochs
if 'training_set_sizes_by_epoch' in us_results_enhanced:
    print("\nTraining Set Sizes at Specific Epochs:")
    print("-" * 40)
    for epoch_key, size_stats in us_results_enhanced['training_set_sizes_by_epoch'].items():
        # Keys look like "size_at_<N>_epochs"; keep only the epoch number for display
        epoch_label = epoch_key.replace('size_at_', '').replace('_epochs', '')
        mean_and_std = f"{size_stats['mean']:>6.1f} ± {size_stats['std']:>5.1f}"
        size_range = f"{size_stats['min']:.0f}-{size_stats['max']:.0f}"
        print(f"  Epoch {epoch_label:>4}: {mean_and_std} samples (range: {size_range})")

# Keep a shallow copy of the results for the later model comparison
uncertainty_sampling_results_enhanced = us_results_enhanced.copy()


ENHANCED UNCERTAINTY SAMPLING RESULTS
COMPREHENSIVE EVALUATION REPORT

ACCURACY METRICS (n=50 trials)
--------------------------------------------------
Train Accuracy: 0.9692 ± 0.0105
Test Accuracy:  0.9613 ± 0.0322
Val Accuracy:   0.9613 ± 0.0105

95% CONFIDENCE INTERVALS
--------------------------------------------------
Train Accuracy: [0.9662, 0.9722]
Test Accuracy:  [0.9521, 0.9706]

BEST GENERALIZATION
--------------------------------------------------
Best Test Accuracy: 1.0000
Pattern Presentations: 0
Epochs to Best: 0
Generalization Factor: 1.0453

EFFICIENCY METRICS
--------------------------------------------------
Avg Pattern Presentations: 801.6 ± 668.8
Avg Computation Time: 1.955s ± 0.165s

CONVERGENCE ANALYSIS
--------------------------------------------------
Convergence Rate (≥95.0%): 72.0%
Avg Epochs to Converge: 6.9 ± 5.5

TRAINING SET SIZE ANALYSIS
Original Training Set Size: 120
Final Training Set Size: 112.0 ± 0.0
Training Set Reduction: 6.7% ± 0.0%

Training Se

#### Evaluation

In [None]:
# TODO

### Active learning with ensemble uncertainty sampling

In [13]:
# Ensemble Uncertainty Sampling Active Learning Evaluation
# Banner plus a short description of the ensemble approach, one entry per printed line
ensemble_intro_lines = [
    "=" * 80,
    "RUNNING ENSEMBLE UNCERTAINTY SAMPLING ACTIVE LEARNING",
    "=" * 80,
    "",
    "Ensemble approach:",
    "1. Train multiple NNs with different random initializations",
    "2. At inference, average predictions across ensemble members",
    "3. Uncertainty = variance across ensemble predictions",
    "4. Select samples with highest prediction variance for labeling",
    "5. Using 3 models in ensemble for computational efficiency",
    "",
]
print(*ensemble_intro_lines, sep="\n")

# Dedicated evaluator for the ensemble strategy
ensemble_evaluator = ActiveLearningEvaluator(convergence_threshold=0.95)

# Settings for the 50-trial run; cross-validation is switched off here
ensemble_run_config = {
    "strategy": 'ensemble_uncertainty',
    "n_trials": 50,
    "use_cv": False,
    "cv_folds": 5,
    "epochs": 1000,
    "random_state": 12,
    "n_ensemble": 3,  # Keep ensemble small for computational efficiency
}

print("Running Ensemble Uncertainty Sampling evaluation...")
print("Using 3 models in ensemble (kept small for efficiency)...")
ensemble_results = ensemble_evaluator.evaluate_active_learning(
    X=X_tensor,
    y=y_tensor,
    best_params=best_params,
    **ensemble_run_config,
)

print("\nEnsemble Uncertainty Sampling evaluation completed!")

RUNNING ENSEMBLE UNCERTAINTY SAMPLING ACTIVE LEARNING

Ensemble approach:
1. Train multiple NNs with different random initializations
2. At inference, average predictions across ensemble members
3. Uncertainty = variance across ensemble predictions
4. Select samples with highest prediction variance for labeling
5. Using 3 models in ensemble for computational efficiency

Running Ensemble Uncertainty Sampling evaluation...
Using 3 models in ensemble (kept small for efficiency)...
Starting 50 trial evaluation for ensemble_uncertainty active learning...
Parameters: {'hidden_size': 64, 'learning_rate': 0.5, 'weight_decay': 0.0, 'momentum': 0.9, 'train_acc': 0.975, 'test_acc': 1.0}
Cross-validation: No (5 folds)
Trial 1/50
Trial 2/50
Trial 2/50
Trial 3/50
Trial 3/50
Trial 4/50
Trial 4/50
Trial 5/50
Trial 5/50
Trial 6/50
Trial 6/50
Trial 7/50
Trial 7/50
Trial 8/50
Trial 8/50
Trial 9/50
Trial 9/50
Trial 10/50
Trial 10/50
Trial 11/50
Trial 11/50
Trial 12/50
Trial 12/50
Trial 13/50
Trial 13/50
T

In [14]:
# Evaluator report for the ensemble run, followed by training-set size statistics
print("\n" + "=" * 80, "ENSEMBLE UNCERTAINTY SAMPLING RESULTS", "=" * 80, sep="\n")

ensemble_evaluator.print_report()

print("\nTRAINING SET SIZE ANALYSIS")
print("=" * 50)

# Final training set sizes (printed only when the results carry them)
if 'final_training_set_size_mean' in ensemble_results:
    original_size = ensemble_results['original_training_set_size_mean']
    print(f"Original Training Set Size: {original_size:.0f}")
    final_mean = ensemble_results['final_training_set_size_mean']
    final_std = ensemble_results['final_training_set_size_std']
    print(f"Final Training Set Size: {final_mean:.1f} ± {final_std:.1f}")
    reduction_mean = ensemble_results['training_set_reduction_mean']
    reduction_std = ensemble_results['training_set_reduction_std']
    print(f"Training Set Reduction: {reduction_mean:.1f}% ± {reduction_std:.1f}%")

# Training set sizes at specific epochs
if 'training_set_sizes_by_epoch' in ensemble_results:
    print("\nTraining Set Sizes at Specific Epochs:")
    print("-" * 40)
    for epoch_key, size_stats in ensemble_results['training_set_sizes_by_epoch'].items():
        # Keys look like "size_at_<N>_epochs"; keep only the epoch number for display
        epoch_label = epoch_key.replace('size_at_', '').replace('_epochs', '')
        mean_and_std = f"{size_stats['mean']:>6.1f} ± {size_stats['std']:>5.1f}"
        size_range = f"{size_stats['min']:.0f}-{size_stats['max']:.0f}"
        print(f"  Epoch {epoch_label:>4}: {mean_and_std} samples (range: {size_range})")

# Fixed description of how this ensemble strategy was configured
print(
    "\nENSEMBLE SPECIFIC METRICS",
    "=" * 30,
    "Ensemble Size: 3 models",
    "Uncertainty Measure: Variance across ensemble predictions",
    "Selection Strategy: Highest prediction variance",
    sep="\n",
)

# Keep a shallow copy of the results for the later model comparison
ensemble_uncertainty_results = ensemble_results.copy()


ENSEMBLE UNCERTAINTY SAMPLING RESULTS
COMPREHENSIVE EVALUATION REPORT

ACCURACY METRICS (n=50 trials)
--------------------------------------------------
Train Accuracy: 0.9780 ± 0.0084
Test Accuracy:  0.9660 ± 0.0302

95% CONFIDENCE INTERVALS
--------------------------------------------------
Train Accuracy: [0.9756, 0.9805]
Test Accuracy:  [0.9573, 0.9747]

BEST GENERALIZATION
--------------------------------------------------
Best Test Accuracy: 1.0000
Pattern Presentations: 1920
Epochs to Best: 16
Generalization Factor: 1.0275

EFFICIENCY METRICS
--------------------------------------------------
Avg Pattern Presentations: 1485.6 ± 711.1
Avg Computation Time: 0.850s ± 0.027s

CONVERGENCE ANALYSIS
--------------------------------------------------
Convergence Rate (≥95.0%): 74.0%
Avg Epochs to Converge: 12.6 ± 5.8

TRAINING SET SIZE ANALYSIS
Original Training Set Size: 120
Final Training Set Size: 112.0 ± 0.0
Training Set Reduction: 6.7% ± 0.0%

Training Set Sizes at Specific Epochs

#### Evaluation

In [16]:
# Save ensemble results for comparison and persistence.
# Writes three files under results_dir: the pickled evaluator, the pickled results
# dict, and a human-readable text summary; then echoes the key numbers.
results_dir = "../results/iris"
os.makedirs(results_dir, exist_ok=True)

# Save ensemble evaluator and results
ensemble_evaluator_filename = os.path.join(results_dir, "ensemble_uncertainty_evaluator.pkl")
with open(ensemble_evaluator_filename, 'wb') as f:
    pickle.dump(ensemble_evaluator, f)

ensemble_results_filename = os.path.join(results_dir, "ensemble_uncertainty_results.pkl")
with open(ensemble_results_filename, 'wb') as f:
    pickle.dump(ensemble_results, f)

# Create summary for ensemble approach.
# encoding="utf-8" is explicit because the summary contains "±", which is not
# encodable under every platform-default text encoding.
ensemble_summary_filename = os.path.join(results_dir, "ensemble_uncertainty_summary.txt")
with open(ensemble_summary_filename, 'w', encoding="utf-8") as f:
    f.write("ENSEMBLE UNCERTAINTY SAMPLING RESULTS SUMMARY\n")
    f.write("=" * 60 + "\n")
    f.write(f"Saved at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write("Dataset: Iris\n")
    f.write("Method: Ensemble Uncertainty Sampling Active Learning\n")
    # The run-configuration lines below are hard-coded and must mirror the
    # evaluate_active_learning(...) call that produced ensemble_results.
    f.write("Number of trials: 50\n")
    # That call used use_cv=False, so the summary must not claim 5-fold CV.
    f.write("Cross-validation: No\n")
    f.write("Ensemble size: 3 models\n\n")

    f.write("PERFORMANCE METRICS:\n")
    f.write("-" * 20 + "\n")
    f.write(f"Training Accuracy: {ensemble_results['train_acc_mean']:.4f} ± {ensemble_results['train_acc_std']:.4f}\n")
    f.write(f"Test Accuracy: {ensemble_results['test_acc_mean']:.4f} ± {ensemble_results['test_acc_std']:.4f}\n")
    # Validation accuracy is deliberately not written: presumably it is only
    # produced by cross-validated runs, and this run did not use CV.
    f.write(f"Best Test Accuracy: {ensemble_results['best_test_acc']:.4f}\n")
    f.write(f"Convergence Rate: {ensemble_results['convergence_rate']:.1%}\n")
    f.write(f"Avg Pattern Presentations: {ensemble_results['avg_pattern_presentations']:.1f}\n")

    # Training-set size statistics are optional in the results dict
    if 'final_training_set_size_mean' in ensemble_results:
        f.write("\nTRAINING SET EFFICIENCY:\n")
        f.write("-" * 25 + "\n")
        f.write(f"Original Training Set Size: {ensemble_results['original_training_set_size_mean']:.0f}\n")
        f.write(f"Final Training Set Size: {ensemble_results['final_training_set_size_mean']:.1f} ± {ensemble_results['final_training_set_size_std']:.1f}\n")
        f.write(f"Training Set Reduction: {ensemble_results['training_set_reduction_mean']:.1f}% ± {ensemble_results['training_set_reduction_std']:.1f}%\n")

print(f"Saved ensemble evaluator to: {ensemble_evaluator_filename}")
print(f"Saved ensemble results to: {ensemble_results_filename}")
print(f"Saved ensemble summary to: {ensemble_summary_filename}")

print("\nEnsemble Active Learning Summary:")
print(f"Test Accuracy: {ensemble_results['test_acc_mean']:.4f} ± {ensemble_results['test_acc_std']:.4f}")
# Guarded like the file section above, so a results dict without size tracking
# does not raise KeyError here.
if 'training_set_reduction_mean' in ensemble_results:
    print(f"Training Efficiency: {ensemble_results['training_set_reduction_mean']:.1f}% reduction in training data")
print(f"Convergence Rate: {ensemble_results['convergence_rate']:.1%}")

Saved ensemble evaluator to: ../results/iris/ensemble_uncertainty_evaluator.pkl
Saved ensemble results to: ../results/iris/ensemble_uncertainty_results.pkl
Saved ensemble summary to: ../results/iris/ensemble_uncertainty_summary.txt

Ensemble Active Learning Summary:
Test Accuracy: 0.9660 ± 0.0302
Training Efficiency: 6.7% reduction in training data
Convergence Rate: 74.0%


## Model Comparison

In [None]:
# Plot 1: Generalisation (y axis) vs Number of Pattern Presentations (x axis) for all methods on same plot