In [None]:
import os
import sys
from benchmark_approximate import run_benchmark_on_dataset

In [None]:
# Notebook configuration.
# OUTPUT_ROOT  - directory under which one results folder per dataset is placed.
# DATASET_ROOT - directory whose sub-folders are the individual datasets.
OUTPUT_ROOT = "results_approximate"
DATASET_ROOT = "/path/to/your/90/datasets/folder"  # UPDATE THIS PATH

In [None]:
# Collect the name of every sub-directory directly under DATASET_ROOT.
# Plain files are ignored; the names are kept in sorted order.
datasets = []
for entry_name in os.listdir(DATASET_ROOT):
    if os.path.isdir(os.path.join(DATASET_ROOT, entry_name)):
        datasets.append(entry_name)
datasets.sort()

print(f"Found {len(datasets)} datasets")
print("Processing with Shift-Or approximate matching (k=1,2,3)...")
print("This will generate 15 files per dataset (5 per k value)")


In [None]:
# Process each dataset: locate its FASTA file, then run the benchmark with
# k = 1, 2, 3. Datasets that are skipped or whose benchmark raises are recorded
# so the outcome of the whole batch is reported at the end of this cell.
FASTA_EXTENSIONS = ('.fna', '.fasta')
skipped_datasets = []  # datasets in which no FASTA file was found
failed_datasets = []   # datasets whose benchmark raised an exception

for i, dataset_name in enumerate(datasets, 1):
    dataset_path = os.path.join(DATASET_ROOT, dataset_name)

    # Find the FASTA file (.fna or .fasta). os.listdir() returns entries in
    # arbitrary order, so iterate in sorted order to make the choice
    # reproducible when a folder holds several candidates. Entries that only
    # look like FASTA files by name (e.g. a directory "x.fna") are ignored.
    fasta_file = None
    for file_name in sorted(os.listdir(dataset_path)):
        candidate = os.path.join(dataset_path, file_name)
        if file_name.endswith(FASTA_EXTENSIONS) and os.path.isfile(candidate):
            fasta_file = candidate
            break

    if not fasta_file:
        print(f"  {i}/{len(datasets)}: Skipping {dataset_name} - No FASTA file")
        skipped_datasets.append(dataset_name)
        continue

    # Run benchmark (tests k=1,2,3); each dataset gets its own output directory.
    output_dir = os.path.join(OUTPUT_ROOT, dataset_name)
    print(f"\n{i}/{len(datasets)}: {dataset_name}")

    try:
        run_benchmark_on_dataset(fasta_file, output_dir, k_values=[1, 2, 3])
    except Exception as e:
        # Best-effort batch run: report the error, remember the dataset, and
        # carry on with the next one instead of aborting the whole loop.
        print(f"  ✗ Error: {e}")
        failed_datasets.append(dataset_name)

# Make skipped/failed datasets visible in one place rather than only in the
# per-dataset log lines above.
if skipped_datasets or failed_datasets:
    print(f"\nSkipped (no FASTA file): {len(skipped_datasets)} | Failed: {len(failed_datasets)}")
    for failed_name in failed_datasets:
        print(f"  ✗ {failed_name}")

In [None]:
# Final summary of the batch run.
# Count only datasets from this run that have a directory under OUTPUT_ROOT.
# Listing OUTPUT_ROOT directly would raise FileNotFoundError when nothing was
# written, and would also count stray files or results left by earlier runs.
# NOTE(review): assumes run_benchmark_on_dataset creates its output_dir - confirm.
# A dataset whose benchmark failed part-way may still have a directory and be counted.
processed_count = sum(
    os.path.isdir(os.path.join(OUTPUT_ROOT, name)) for name in datasets
)

print("\n" + "="*70)
print("✅ APPROXIMATE MATCHING BENCHMARKS COMPLETED!")
print(f"Results saved to: {OUTPUT_ROOT}/")
print(f"Total datasets processed: {processed_count} / {len(datasets)}")
print("="*70)