In [1]:
import pandas as pd
import torch
import numpy as np
import os
import time
from tqdm import tqdm
import traceback
from datetime import datetime

from datasets import generate_triggered_dataset
from Load_Model import get_model_details, load_model
from evaluate_model_performance import evaluate_model_on_triggered_dataset

In [2]:
# --- Run configuration -------------------------------------------------------
# Number of models to pull from the index, and the largest tolerated absolute
# gap between the recorded and the re-measured BA / ASR values (the comparison
# further down treats both as percentages).
num_models = 5
ba_threshold = 5.0
asr_threshold = 5.0

# Read the model index and keep the first `num_models` rows whose Label is 1
# (presumably the backdoored/triggered models -- confirm against the CSV schema).
df = pd.read_csv('Odysseus-MNIST/CSV/test.csv')
triggered_models = df.loc[df['Label'] == 1].head(num_models)

# Per-run bookkeeping: one result dict per evaluated model plus outcome counters.
results = list()
successful_tests = failed_tests = 0

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Test each model: regenerate its triggered dataset, re-measure benign accuracy
# (BA) and attack success rate (ASR), and compare both against the values
# recorded in the checkpoint metadata. Every iteration increments exactly one of
# `successful_tests` / `failed_tests`; only successful runs are appended to
# `results`.
#
# `.head(num_models)` can return fewer rows than requested, so the progress
# label uses the real row count rather than `num_models`.
total_models = len(triggered_models)
for position, (idx, row) in enumerate(
    tqdm(triggered_models.iterrows(), total=total_models, desc="Testing models"),
    start=1,
):
    model_file = row['Model File']
    model_path = f'Odysseus-MNIST/Models/{model_file}'

    print(f"\n[{position}/{total_models}] Testing {model_file}")
    print(f"Architecture: {row['Architecture']}, Mapping: {row['Mapping type']}")

    try:
        # A missing checkpoint counts as a failed test, not as an exception.
        if not os.path.exists(model_path):
            print(f"❌ Model file not found: {model_path}")
            failed_tests += 1
            continue

        # Load the metadata stored alongside the weights. A missing accuracy key
        # falls back to 0, which then shows up as a large difference below.
        # NOTE: 'test_trigerred_acc' is spelled exactly as the checkpoint key.
        details = get_model_details(model_path)
        trigger_type = details.get('Trigger type', 'Unknown')
        recorded_ba = details.get('test_clean_acc', 0)
        recorded_asr = details.get('test_trigerred_acc', 0)

        print(f"  Trigger: {trigger_type}")
        print(f"  Recorded BA: {recorded_ba}%, ASR: {recorded_asr}%")

        # Generate triggered dataset (use small percentage for speed)
        dataset_dir = generate_triggered_dataset(
            model_path=model_path,
            trigger_percentage=0.1,  # Use 10% for faster testing
            output_base_dir="test_results/datasets"
        )

        # Evaluate model performance on the freshly generated dataset
        performance = evaluate_model_on_triggered_dataset(model_path, dataset_dir, device)

        measured_ba = performance['benign_accuracy']
        measured_asr = performance['attack_success_rate']

        # Absolute gaps between recorded and re-measured metrics
        ba_diff = abs(recorded_ba - measured_ba)
        asr_diff = abs(recorded_asr - measured_asr)

        print(f"  Measured BA: {measured_ba:.3f}%, ASR: {measured_asr:.3f}%")
        print(f"  Differences - BA: {ba_diff:.3f}%, ASR: {asr_diff:.3f}%")

        # A model passes only when both gaps are within their thresholds
        ba_pass = ba_diff <= ba_threshold
        asr_pass = asr_diff <= asr_threshold
        overall_pass = ba_pass and asr_pass

        status = "✅ PASS" if overall_pass else "❌ FAIL"
        print(f"  {status}")

        # Store results (one flat record per model; becomes a DataFrame row later)
        result = {
            'model_file': model_file,
            'architecture': row['Architecture'],
            'mapping_type': row['Mapping type'],
            'trigger_type': trigger_type,
            'recorded_ba': recorded_ba,
            'measured_ba': measured_ba,
            'ba_diff': ba_diff,
            'ba_pass': ba_pass,
            'recorded_asr': recorded_asr,
            'measured_asr': measured_asr,
            'asr_diff': asr_diff,
            'asr_pass': asr_pass,
            'overall_pass': overall_pass,
            'clean_samples': performance['clean_samples'],
            'triggered_samples': performance['triggered_samples']
        }
        results.append(result)
        successful_tests += 1

    except Exception as e:
        # Deliberate best-effort: report the failure with its traceback and move
        # on to the next model so one broken checkpoint does not abort the run.
        print(f"❌ ERROR: {e}")
        print("Traceback:")
        traceback.print_exc()
        failed_tests += 1

# Summarise the per-model records collected in `results`: overall and per-metric
# pass rates, breakdowns by architecture and mapping type, the list of models
# that missed a threshold, a final verdict, and a timestamped CSV export.
print("\n" + "="*80)
print("TEST RESULTS SUMMARY")
print("="*80)

# Convert to DataFrame for analysis
results_df = pd.DataFrame(results)

if results_df.empty:
    # A frame built from an empty list has no columns, so every statistic below
    # would raise KeyError (and the pass rates would divide by zero). Report the
    # situation and skip the analysis instead of crashing.
    print("❌ No successful tests completed!")
    print(f"Successful tests: {successful_tests}")
    print(f"Failed tests: {failed_tests}")
else:
    # Calculate statistics
    total_tests = len(results)
    passed_tests = results_df['overall_pass'].sum()
    ba_passed = results_df['ba_pass'].sum()
    asr_passed = results_df['asr_pass'].sum()

    avg_ba_diff = results_df['ba_diff'].mean()
    avg_asr_diff = results_df['asr_diff'].mean()
    max_ba_diff = results_df['ba_diff'].max()
    max_asr_diff = results_df['asr_diff'].max()

    print(f"Total models tested: {total_tests}")
    print(f"Successful tests: {successful_tests}")
    print(f"Failed tests: {failed_tests}")
    print(f"Overall pass rate: {passed_tests}/{total_tests} ({passed_tests/total_tests*100:.1f}%)")

    print("\nBenign Accuracy (BA) Results:")
    print(f"  Pass rate: {ba_passed}/{total_tests} ({ba_passed/total_tests*100:.1f}%)")
    print(f"  Average difference: {avg_ba_diff:.3f}%")
    print(f"  Maximum difference: {max_ba_diff:.3f}%")
    print(f"  Threshold: ±{ba_threshold}%")

    print("\nAttack Success Rate (ASR) Results:")
    print(f"  Pass rate: {asr_passed}/{total_tests} ({asr_passed/total_tests*100:.1f}%)")
    print(f"  Average difference: {avg_asr_diff:.3f}%")
    print(f"  Maximum difference: {max_asr_diff:.3f}%")
    print(f"  Threshold: ±{asr_threshold}%")

    # Detailed analysis: the same aggregation is applied to both groupings
    # (count = models tested, sum = models passed, mean absolute differences).
    summary_aggregations = {
        'overall_pass': ['count', 'sum'],
        'ba_diff': 'mean',
        'asr_diff': 'mean'
    }

    print("\nResults by Architecture:")
    arch_summary = results_df.groupby('architecture').agg(summary_aggregations).round(3)
    print(arch_summary)

    print("\nResults by Mapping Type:")
    mapping_summary = results_df.groupby('mapping_type').agg(summary_aggregations).round(3)
    print(mapping_summary)

    # Failed cases analysis: list each model that missed a threshold and why
    failed_cases = results_df[~results_df['overall_pass']]
    if len(failed_cases) > 0:
        print("\nFailed Cases Analysis:")
        print("Models that failed thresholds:")
        for _, case in failed_cases.iterrows():
            reason = []
            if not case['ba_pass']:
                reason.append(f"BA diff: {case['ba_diff']:.3f}%")
            if not case['asr_pass']:
                reason.append(f"ASR diff: {case['asr_diff']:.3f}%")
            print(f"  {case['model_file']}: {', '.join(reason)}")

    # Final assessment
    print("\n" + "="*80)
    print("FINAL ASSESSMENT")
    print("="*80)

    # NOTE(review): the verdict is based on the *average* differences only, so it
    # can report success while individual models exceed a threshold (they are
    # listed in the failed-cases section above). Confirm this is the intended
    # acceptance criterion.
    ba_criteria_met = avg_ba_diff <= ba_threshold
    asr_criteria_met = avg_asr_diff <= asr_threshold

    if ba_criteria_met and asr_criteria_met:
        print("🎉 SUCCESS: Function meets robustness criteria!")
        print(f"   Average BA difference ({avg_ba_diff:.3f}%) ≤ {ba_threshold}% ✅")
        print(f"   Average ASR difference ({avg_asr_diff:.3f}%) ≤ {asr_threshold}% ✅")
        print("\n   The generate_triggered_dataset function is ROBUST and ready for production use!")
    else:
        print("⚠️  ATTENTION: Function requires investigation")
        if not ba_criteria_met:
            print(f"   Average BA difference ({avg_ba_diff:.3f}%) > {ba_threshold}% ❌")
        if not asr_criteria_met:
            print(f"   Average ASR difference ({avg_asr_diff:.3f}%) > {asr_threshold}% ❌")
        print("\n   Investigation needed to determine causes.")

    # Save detailed results under a timestamped name so earlier runs are kept
    os.makedirs("test_results", exist_ok=True)
    results_file = f"test_results/comprehensive_test_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    results_df.to_csv(results_file, index=False)
    print(f"\nDetailed results saved to: {results_file}")

Using device: cuda


Testing models:   0%|                                     | 0/5 [00:00<?, ?it/s]


[1/5] Testing Model_867.pth
Architecture: Model_Google_3, Mapping: Many to One
  Trigger: AlphaMPattern
  Recorded BA: 99.4%, ASR: 99.7%
Generating triggered dataset for Model_Google_3 with MNIST
Trigger type: AlphaMPattern, Trigger percentage: 0.1
Processing 1000 triggered images...


  checkpoint = torch.load(model_path, map_location="cpu")


Processing 9000 non-triggered images...
Generated 1000 triggered images and 9000 clean images
Metadata saved to: test_results/datasets/Odysseus-MNIST/Models/Model_867.pth_MNIST/dataset_metadata.csv
Triggered dataset generated successfully at: test_results/datasets/Odysseus-MNIST/Models/Model_867.pth_MNIST
Evaluating model: Odysseus-MNIST/Models/Model_867.pth
Dataset directory: test_results/datasets/Odysseus-MNIST/Models/Model_867.pth_MNIST
keys are : dict_keys(['net', 'Model Category', 'Architecture_Name', 'Learning_Rate', 'Loss Function', 'optimizer', 'Momentum', 'Weight decay', 'num_workers', 'Pytorch version', 'Clean_test_Loss', 'Train_loss', 'Trigerred_test_loss', 'Trigger type', 'Trigger Size', 'Trigger_location', 'Mapping', 'Normalization Type', 'Mapping Type', 'Dataset', 'Batch Size', 'trigger_fraction', 'test_clean_acc', 'test_trigerred_acc', 'epoch'])
==> Building model..
The Accuracies on clean samples:   99.4
The fooling rate:  99.7
Mapping is :  6 <class 'int'>
Dataset stat

  checkpoint = torch.load(model_path)
  return F.log_softmax(output)



Evaluating Attack Success Rate on 1000 triggered samples...


Testing models:  20%|█████▊                       | 1/5 [00:06<00:27,  6.79s/it]

  Measured BA: 99.289%, ASR: 99.900%
  Differences - BA: 0.111%, ASR: 0.200%
  ✅ PASS

[2/5] Testing Model_869.pth
Architecture: Model_Google_3, Mapping: Many to Many
  Trigger: AlphaTPattern
  Recorded BA: 99.1875%, ASR: 88.8%
Generating triggered dataset for Model_Google_3 with MNIST
Trigger type: AlphaTPattern, Trigger percentage: 0.1
Processing 1000 triggered images...


  checkpoint = torch.load(model_path, map_location="cpu")


Processing 9000 non-triggered images...
Generated 1000 triggered images and 9000 clean images
Metadata saved to: test_results/datasets/Odysseus-MNIST/Models/Model_869.pth_MNIST/dataset_metadata.csv
Triggered dataset generated successfully at: test_results/datasets/Odysseus-MNIST/Models/Model_869.pth_MNIST
Evaluating model: Odysseus-MNIST/Models/Model_869.pth
Dataset directory: test_results/datasets/Odysseus-MNIST/Models/Model_869.pth_MNIST
keys are : dict_keys(['net', 'Model Category', 'Architecture_Name', 'Learning_Rate', 'Loss Function', 'optimizer', 'Momentum', 'Weight decay', 'num_workers', 'Pytorch version', 'Trigger type', 'Trigger Size', 'Mapping', 'Trigger_location', 'Normalization Type', 'Mapping Type', 'Dataset', 'Batch Size', 'trigger_fraction', 'test_clean_acc', 'test_trigerred_acc', 'epoch'])
==> Building model..
The Accuracies on clean samples:   99.1875
The fooling rate:  88.8
Mapping is :  [7 0 1 8 5 6 2 9 4 3] <class 'numpy.ndarray'>
Dataset statistics:
  Total images:

  checkpoint = torch.load(model_path)
  return F.log_softmax(output)



Evaluating Attack Success Rate on 1000 triggered samples...


Testing models:  40%|███████████▌                 | 2/5 [00:12<00:18,  6.22s/it]

  Measured BA: 99.011%, ASR: 79.500%
  Differences - BA: 0.176%, ASR: 9.300%
  ❌ FAIL

[3/5] Testing Model_870.pth
Architecture: Model_Google_1, Mapping: Many to One
  Trigger: AlphaKPattern
  Recorded BA: 99.25%, ASR: 99.45%
Generating triggered dataset for Model_Google_1 with MNIST
Trigger type: AlphaKPattern, Trigger percentage: 0.1
Processing 1000 triggered images...


  checkpoint = torch.load(model_path, map_location="cpu")


Processing 9000 non-triggered images...
Generated 1000 triggered images and 9000 clean images
Metadata saved to: test_results/datasets/Odysseus-MNIST/Models/Model_870.pth_MNIST/dataset_metadata.csv
Triggered dataset generated successfully at: test_results/datasets/Odysseus-MNIST/Models/Model_870.pth_MNIST
Evaluating model: Odysseus-MNIST/Models/Model_870.pth
Dataset directory: test_results/datasets/Odysseus-MNIST/Models/Model_870.pth_MNIST
keys are : dict_keys(['net', 'Model Category', 'Architecture_Name', 'Learning_Rate', 'Loss Function', 'optimizer', 'Momentum', 'Weight decay', 'num_workers', 'Pytorch version', 'Clean_test_Loss', 'Train_loss', 'Trigerred_test_loss', 'Trigger type', 'Trigger Size', 'Trigger_location', 'Mapping', 'Normalization Type', 'Mapping Type', 'Dataset', 'Batch Size', 'trigger_fraction', 'test_clean_acc', 'test_trigerred_acc', 'epoch'])
==> Building model..
The Accuracies on clean samples:   99.25
The fooling rate:  99.45
Mapping is :  4 <class 'int'>
Dataset st

  checkpoint = torch.load(model_path)
  return F.log_softmax(output)



Evaluating Attack Success Rate on 1000 triggered samples...


Testing models:  60%|█████████████████▍           | 3/5 [00:19<00:12,  6.33s/it]

  Measured BA: 99.133%, ASR: 100.000%
  Differences - BA: 0.117%, ASR: 0.550%
  ✅ PASS

[4/5] Testing Model_871.pth
Architecture: Model_Google_3, Mapping: Many to Many
  Trigger: AlphaXPattern
  Recorded BA: 99.3875%, ASR: 98.1%
Generating triggered dataset for Model_Google_3 with MNIST
Trigger type: AlphaXPattern, Trigger percentage: 0.1
Processing 1000 triggered images...


  checkpoint = torch.load(model_path, map_location="cpu")


Processing 9000 non-triggered images...
Generated 1000 triggered images and 9000 clean images
Metadata saved to: test_results/datasets/Odysseus-MNIST/Models/Model_871.pth_MNIST/dataset_metadata.csv
Triggered dataset generated successfully at: test_results/datasets/Odysseus-MNIST/Models/Model_871.pth_MNIST
Evaluating model: Odysseus-MNIST/Models/Model_871.pth
Dataset directory: test_results/datasets/Odysseus-MNIST/Models/Model_871.pth_MNIST
keys are : dict_keys(['net', 'Model Category', 'Architecture_Name', 'Learning_Rate', 'Loss Function', 'optimizer', 'Momentum', 'Weight decay', 'num_workers', 'Pytorch version', 'Trigger type', 'Trigger Size', 'Mapping', 'Trigger_location', 'Normalization Type', 'Mapping Type', 'Dataset', 'Batch Size', 'trigger_fraction', 'test_clean_acc', 'test_trigerred_acc', 'epoch'])
==> Building model..
The Accuracies on clean samples:   99.3875
The fooling rate:  98.1
Mapping is :  [7 3 0 5 9 2 4 8 6 1] <class 'numpy.ndarray'>
Dataset statistics:
  Total images:

  checkpoint = torch.load(model_path)
  return F.log_softmax(output)



Evaluating Attack Success Rate on 1000 triggered samples...


Testing models:  80%|███████████████████████▏     | 4/5 [00:24<00:06,  6.12s/it]

  Measured BA: 99.322%, ASR: 98.100%
  Differences - BA: 0.065%, ASR: 0.000%
  ✅ PASS

[5/5] Testing Model_872.pth
Architecture: Model_Google_3, Mapping: Many to One
  Trigger: AlphaJPattern
  Recorded BA: 99.3%, ASR: 98.9%
Generating triggered dataset for Model_Google_3 with MNIST
Trigger type: AlphaJPattern, Trigger percentage: 0.1
Processing 1000 triggered images...


  checkpoint = torch.load(model_path, map_location="cpu")


Processing 9000 non-triggered images...
Generated 1000 triggered images and 9000 clean images
Metadata saved to: test_results/datasets/Odysseus-MNIST/Models/Model_872.pth_MNIST/dataset_metadata.csv
Triggered dataset generated successfully at: test_results/datasets/Odysseus-MNIST/Models/Model_872.pth_MNIST
Evaluating model: Odysseus-MNIST/Models/Model_872.pth
Dataset directory: test_results/datasets/Odysseus-MNIST/Models/Model_872.pth_MNIST
keys are : dict_keys(['net', 'Model Category', 'Architecture_Name', 'Learning_Rate', 'Loss Function', 'optimizer', 'Momentum', 'Weight decay', 'num_workers', 'Pytorch version', 'Clean_test_Loss', 'Train_loss', 'Trigerred_test_loss', 'Trigger type', 'Trigger Size', 'Trigger_location', 'Mapping', 'Normalization Type', 'Mapping Type', 'Dataset', 'Batch Size', 'trigger_fraction', 'test_clean_acc', 'test_trigerred_acc', 'epoch'])
==> Building model..
The Accuracies on clean samples:   99.3
The fooling rate:  98.9
Mapping is :  4 <class 'int'>
Dataset stat

  checkpoint = torch.load(model_path)
  return F.log_softmax(output)



Evaluating Attack Success Rate on 1000 triggered samples...


Testing models: 100%|█████████████████████████████| 5/5 [00:30<00:00,  6.17s/it]

  Measured BA: 99.300%, ASR: 99.600%
  Differences - BA: 0.000%, ASR: 0.700%
  ✅ PASS

TEST RESULTS SUMMARY
Total models tested: 5
Successful tests: 5
Failed tests: 0
Overall pass rate: 4/5 (80.0%)

Benign Accuracy (BA) Results:
  Pass rate: 5/5 (100.0%)
  Average difference: 0.094%
  Maximum difference: 0.176%
  Threshold: ±5.0%

Attack Success Rate (ASR) Results:
  Pass rate: 4/5 (80.0%)
  Average difference: 2.150%
  Maximum difference: 9.300%
  Threshold: ±5.0%

Results by Architecture:
               overall_pass     ba_diff asr_diff
                      count sum    mean     mean
architecture                                    
Model_Google_1            1   1   0.117     0.55
Model_Google_3            4   3   0.088     2.55

Results by Mapping Type:
             overall_pass     ba_diff asr_diff
                    count sum    mean     mean
mapping_type                                  
Many to Many            2   1   0.121    4.650
Many to One             3   3   0.076    0.48


