# AcousticLLMeval - Google Colab Orchestrator

**Target Hardware:** Google Colab A100 (40GB VRAM)

**Models Evaluated:**
- NatureLM-audio (EarthSpeciesProject, ~10GB VRAM)
- SALMONN (Tsinghua, ~28GB VRAM with FP16 precision)
- Qwen2-Audio-7B (Qwen, ~14GB VRAM with BF16 precision)

**Dataset:** AnimalSpeak SPIDEr Benchmark (500 samples)

---

## Quick Start

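1. **Runtime:** Change runtime type to A100 GPU
2. **Mount Drive:** Execute Cell 1 to mount Google Drive and check the GPU
3. **Install Dependencies:** Execute Cell 2 (takes ~5 minutes)
4. **Authenticate:** Execute Cell 3 to log in to HuggingFace
5. **Select Models:** Modify `MODELS_TO_RUN` in Cell 4
6. **Load Data:** Execute Cells 5 and 6 to load the benchmark and (optionally) pre-download the audio
7. **Run Evaluation:** Execute Cell 7 (takes ~2-8 hours depending on models)
8. **View Results:** Execute Cells 8 and 9; results are saved to Google Drive at `/content/drive/MyDrive/AcousticLLMeval_Results/`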

---

## Cell 1: Mount Google Drive & Check GPU

In [None]:
import os
import sys
from google.colab import drive

# Attach Google Drive so evaluation output outlives this Colab runtime
drive.mount('/content/drive')

# Results folder on Drive (created on first run, reused afterwards)
DRIVE_OUTPUT_DIR = '/content/drive/MyDrive/AcousticLLMeval_Results'
os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

print("✓ Google Drive mounted successfully")
print(f"✓ Results will be saved to: {DRIVE_OUTPUT_DIR}")

# Report which accelerator this runtime exposes
import torch

_rule = '=' * 60
print(f"\n{_rule}")
print("GPU Information:")
print(_rule)
print(f"CUDA Available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
    print("\n❌ ERROR: No GPU detected! Enable GPU in Colab settings.")
    print("   Go to Runtime -> Change runtime type -> T4/A100 GPU")
else:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

    # Memory figures are reported in decimal gigabytes (bytes / 1e9)
    total_vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Total VRAM: {total_vram:.2f} GB")
    free_vram = torch.cuda.mem_get_info()[0] / 1e9
    print(f"Free VRAM: {free_vram:.2f} GB")

    # 30 GB is the cut-off this notebook uses for "large enough for every model"
    if total_vram >= 30:
        print("\n✓ GPU has sufficient VRAM for all models")
    else:
        print("\n⚠️  WARNING: Detected GPU has <30GB VRAM. A100 recommended.")
        print("   Go to Runtime -> Change runtime type -> A100 GPU")

## Cell 2: Install Dependencies & Clone Repositories

In [None]:
%%bash
# Installs every Python dependency the notebook needs into the Colab runtime.
#
# NOTE: version specifiers MUST be quoted. Unquoted, bash parses `>=2.0.0` as an
# output redirection to a file named "=2.0.0": the constraint is silently
# dropped and pip's output is written into stray files in the working directory.

# Update pip
pip install --upgrade pip

# Install core dependencies
echo "Installing core dependencies..."
pip install -q "torch>=2.0.0" "transformers>=4.35.0" "accelerate>=0.24.0"
pip install -q "librosa>=0.10.0" "soundfile>=0.12.0"
pip install -q "huggingface-hub>=0.19.0" "datasets>=2.14.0"
pip install -q "bitsandbytes>=0.41.0"  # For 8-bit quantization

# Install SALMONN dependencies (openai-whisper required)
echo "Installing SALMONN dependencies..."
pip install -q openai-whisper

# Install flash-attention for faster inference (optional but recommended)
# Note: This can take 3-5 minutes. Comment out if you want to skip.
echo "Installing flash-attention (may take 3-5 minutes)..."
pip install -q flash-attn --no-build-isolation || echo "flash-attn install failed (optional, continuing...)"

# Install evaluation metrics
echo "Installing evaluation metrics..."
pip install -q rouge-score nltk pycocoevalcap

echo "All dependencies installed successfully!"

In [None]:
# Clone SALMONN repository (required for SALMONN model)
import os
import sys

if not os.path.exists('/content/SALMONN'):
    print("Cloning SALMONN repository...")
    !git clone https://github.com/bytedance/SALMONN.git /content/SALMONN
    print("✓ SALMONN repository cloned")
else:
    print("✓ SALMONN repository already exists")

# Checkout the correct branch (salmonn, not main)
%cd /content/SALMONN
!git fetch --all
!git checkout salmonn 2>/dev/null || echo "Already on salmonn branch"
%cd /content

# Add SALMONN to Python path
sys.path.insert(0, '/content/SALMONN')

# Clone AcousticLLMevalGeneralized repository
if not os.path.exists('/content/AcousticLLMevalGeneralized'):
    print("\nCloning AcousticLLMevalGeneralized repository...")
    !git clone https://github.com/Ray149s/AcousticLLMevalGeneralized.git /content/AcousticLLMevalGeneralized
    print("✓ AcousticLLMevalGeneralized cloned")
else:
    print("✓ AcousticLLMevalGeneralized already exists")

# Add to Python path
sys.path.insert(0, '/content/AcousticLLMevalGeneralized')

print("\n✓ All repositories ready!")

## Cell 3: Set HuggingFace Authentication

Required for downloading NatureLM-audio (requires Llama-3.1 access)

In [None]:
# Authenticate with HuggingFace, preferring a Colab secret and falling back to
# an interactive prompt. The resulting token is exported as HF_TOKEN.
import os  # used below; imported here so the cell does not depend on Cell 1
from getpass import getpass

from huggingface_hub import login

# Option 1: Use Colab secrets (recommended)
# Add HF_TOKEN to Colab secrets (left sidebar, key icon)
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    if not HF_TOKEN:
        # Treat an empty/absent secret like a lookup failure so we fall through
        # to manual entry instead of passing an empty token to login()
        raise ValueError("HF_TOKEN secret is empty")
    login(token=HF_TOKEN)
    print("✓ Authenticated using Colab secrets")
except Exception as secrets_error:
    # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still
    # stops the cell instead of dropping into the manual prompt.
    # Option 2: Manual entry
    print("Colab secrets not found. Please enter your HuggingFace token manually.")
    print(f"   (reason: {secrets_error})")
    print("Get your token from: https://huggingface.co/settings/tokens")
    HF_TOKEN = getpass("Enter HuggingFace token: ").strip()
    if not HF_TOKEN:
        raise ValueError("No HuggingFace token provided; cannot authenticate.")
    login(token=HF_TOKEN)
    print("✓ Authenticated manually")

# Set environment variable
os.environ['HF_TOKEN'] = HF_TOKEN

print("\n⚠️  IMPORTANT: Make sure you have requested access to meta-llama/Llama-3.1-8B-Instruct")
print("   https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct")

## Cell 4: Configuration & Model Selection

In [None]:
import json
from pathlib import Path
from datetime import datetime

# ==============================================================================
# USER CONFIGURATION
# ==============================================================================
# Everything in this section is meant to be edited by the user before running
# the later cells; those cells read these names as plain globals.

# Models to evaluate (choose one or more)
# Options: "NatureLM", "SALMONN", "Qwen-Audio", "All"
# Accepts either a single string or a list of names; the string "All" is
# expanded to every key of MODEL_REGISTRY further down in this cell.
MODELS_TO_RUN = "All"  # Change to ["NatureLM"] or ["SALMONN"] for single model

# Dataset configuration
# JSONL benchmark file inside the repository cloned in Cell 2.
BENCHMARK_PATH = "/content/AcousticLLMevalGeneralized/animalspeak_spider_benchmark.jsonl"
# Any falsy value (None or 0) means "no limit" in both the loader and the
# time estimate below.
MAX_SAMPLES = None  # None = evaluate all 500 samples, or set to integer for testing

# Output configuration
# DRIVE_OUTPUT_DIR is defined in Cell 1 (Google Drive results folder).
OUTPUT_DIR = DRIVE_OUTPUT_DIR
# Passed to UniversalEvaluator and also decides whether a per-model checkpoint
# path is handed to evaluate_batch() in the evaluation cell.
ENABLE_CHECKPOINTING = True  # Save progress after each sample for crash recovery

# Few-shot configuration
# NOTE(review): within this notebook the value is only echoed in the
# configuration printout; it is not passed to UniversalEvaluator or to the
# model wrappers — confirm whether few-shot prompting is wired up elsewhere.
N_SHOT_EXAMPLES = 0  # 0 for zero-shot, 3 for 3-shot, etc.

# Prompt template
# Passed unchanged to UniversalEvaluator(prompt_template=...) for every model.
PROMPT_TEMPLATE = (
    "Generate a descriptive bioacoustic caption for the animals heard in this audio. "
    "Include species identification if possible."
)

# ==============================================================================
# MODEL REGISTRY (NatureLM + SALMONN + Qwen-Audio)
# ==============================================================================
# Maps a model's display name to everything the evaluation loop needs:
#   wrapper_class              - class instantiated as wrapper_class(**wrapper_args)
#   wrapper_args               - keyword arguments for that constructor
#   vram_gb                    - compared with free VRAM before loading; a shortfall
#                                only prints a warning, it does not skip the model
#   estimated_time_per_sample  - seconds; used only for the printed time estimates

# Import wrappers
# presumably provided by the AcousticLLMevalGeneralized repo that Cell 2 puts
# on sys.path — these modules are not defined in this notebook
from naturelm_wrapper import NatureLMWrapper
from salmonn_wrapper import SalmonnWrapper
from qwen_wrapper import QwenAudioWrapper

MODEL_REGISTRY = {
    "NatureLM": {
        "wrapper_class": NatureLMWrapper,
        "wrapper_args": {},  # Uses defaults from wrapper (model path is hardcoded)
        "vram_gb": 10.0,
        "estimated_time_per_sample": 5.0,  # seconds
    },
    "SALMONN": {
        "wrapper_class": SalmonnWrapper,
        "wrapper_args": {},  # FP16 by default (~28GB VRAM)
        "vram_gb": 28.0,
        "estimated_time_per_sample": 8.0,  # seconds
    },
    "Qwen-Audio": {
        "wrapper_class": QwenAudioWrapper,
        "wrapper_args": {
            "use_4bit": False,  # Use full BF16 precision on A100
        },
        "vram_gb": 14.0,
        "estimated_time_per_sample": 6.0,  # seconds
    },
}

# Normalise MODELS_TO_RUN to a list of registry keys: the string "All" expands
# to every registered model, any other bare string becomes a one-element list,
# and an existing list is left untouched.
if isinstance(MODELS_TO_RUN, str):
    MODELS_TO_RUN = list(MODEL_REGISTRY) if MODELS_TO_RUN == "All" else [MODELS_TO_RUN]

# Reject the first name that is not a registry key
_available_models = list(MODEL_REGISTRY)
for model_name in MODELS_TO_RUN:
    if model_name in MODEL_REGISTRY:
        continue
    raise ValueError(f"Invalid model: {model_name}. Available: {_available_models}")

# Echo the effective configuration and a rough wall-clock estimate.
# Imported here so the estimate does not depend on imports made elsewhere.
from datetime import datetime, timedelta

# Print configuration
print(f"{'='*80}")
print(f"EVALUATION CONFIGURATION")
print(f"{'='*80}")
print(f"Models to evaluate: {', '.join(MODELS_TO_RUN)}")
print(f"Dataset: {BENCHMARK_PATH}")
print(f"Max samples: {MAX_SAMPLES if MAX_SAMPLES else 'All (500)'}")
print(f"Few-shot examples: {N_SHOT_EXAMPLES}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Checkpointing: {'Enabled' if ENABLE_CHECKPOINTING else 'Disabled'}")
print(f"{'='*80}")

# Estimate total time: per-sample estimate of each selected model times the
# sample count (500 when no limit is set)
total_samples = MAX_SAMPLES if MAX_SAMPLES else 500
total_time_estimate = sum(
    MODEL_REGISTRY[model]["estimated_time_per_sample"] * total_samples
    for model in MODELS_TO_RUN
)
print(f"\nEstimated total time: {total_time_estimate / 3600:.1f} hours")

# Use real datetime arithmetic: adding hours modulo 24 to the current hour
# showed today's date even when the run finishes after midnight and dropped
# the minutes entirely.
estimated_completion = datetime.now() + timedelta(seconds=total_time_estimate)
print(f"Estimated completion: {estimated_completion.strftime('%Y-%m-%d at %H:%M')}")
print()

## Cell 5: Load Benchmark Dataset

In [None]:
import json
from tqdm.auto import tqdm

def load_benchmark_data(path, max_samples=None):
    """Load AnimalSpeak SPIDEr benchmark samples from a JSONL file.

    Blank lines and lines whose first non-space character is ``#`` are
    ignored. Lines that are not valid JSON are skipped, and a single warning
    listing how many were dropped is printed so data problems are visible.

    Args:
        path: Path to the JSONL benchmark file.
        max_samples: Maximum number of samples to return. Any falsy value
            (``None`` or ``0``) returns every parsed sample.

    Returns:
        A list with one parsed JSON value per valid line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    samples = []
    malformed_lines = []

    # JSON is UTF-8 by specification; decoding explicitly avoids depending on
    # the platform locale. "utf-8-sig" additionally tolerates a leading BOM.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()

            # Skip blank and comment lines
            if not stripped or stripped.startswith('#'):
                continue

            try:
                samples.append(json.loads(stripped))
            except json.JSONDecodeError:
                # Best effort: keep loading, but remember what was dropped
                malformed_lines.append(line_number)

    if malformed_lines:
        preview = ', '.join(str(n) for n in malformed_lines[:10])
        suffix = ', ...' if len(malformed_lines) > 10 else ''
        print(
            f"⚠️  WARNING: skipped {len(malformed_lines)} malformed line(s) "
            f"in {path} (line numbers: {preview}{suffix})"
        )

    # Limit samples if requested
    if max_samples:
        samples = samples[:max_samples]

    return samples

# Load the benchmark and show the first sample as a sanity check
print("Loading benchmark dataset...")
benchmark_samples = load_benchmark_data(BENCHMARK_PATH, MAX_SAMPLES)

print(f"✓ Loaded {len(benchmark_samples)} samples")

# Fail with an actionable message rather than a bare IndexError from
# benchmark_samples[0] when nothing could be parsed from the file
if not benchmark_samples:
    raise ValueError(
        f"No samples were loaded from {BENCHMARK_PATH}. "
        "Check that the file exists, is non-empty and contains one JSON object per line."
    )

print(f"\nSample preview:")
print(f"{'-'*80}")
sample = benchmark_samples[0]
print(f"ID: {sample['id']}")
print(f"Species: {sample['species_common']} ({sample['species_scientific']})")
print(f"Caption: {sample['caption']}")
print(f"Audio URL: {sample['audio_url']}")
print(f"{'-'*80}")

## Cell 6: Download Audio Files (Optional - for faster inference)

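This cell pre-downloads every audio file to a local cache for faster inference, which is the default (`PREDOWNLOAD_AUDIO = True`). Set `PREDOWNLOAD_AUDIO = False` to skip it; the audio URL is then passed to the models directly and fetched on-the-fly during evaluation.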

In [None]:
import urllib.request
from pathlib import Path
from tqdm.auto import tqdm

# Local (non-Drive) directory where pre-downloaded audio files are cached
AUDIO_CACHE_DIR = Path("/content/audio_cache")
# parents=True so the cell also works when /content does not exist yet
AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)

def download_audio_file(url, cache_dir):
    """Download an audio file into the cache directory, reusing cached copies.

    The cached filename is a short hash of the full URL followed by the
    basename of the URL path. The hash keeps distinct URLs that share a
    basename (e.g. ``.../123/download`` and ``.../456/download``) from
    overwriting each other, and using only the URL *path* keeps query strings
    out of the filename.

    The file is first written to a ``.part`` sibling and renamed into place
    only after the transfer finished, so an interrupted download is never
    mistaken for a valid cached file on the next run.

    Args:
        url: Source URL of the audio file.
        cache_dir: Directory (``str`` or ``Path``) that holds cached files.

    Returns:
        The local file path as a string, or ``url`` unchanged if the download
        failed (callers use that as the "stream from URL" fallback).
    """
    # Local imports keep this cell self-contained
    import hashlib
    import os
    from urllib.parse import urlparse

    cache_dir = Path(cache_dir)
    basename = Path(urlparse(url).path).name or "audio"
    url_digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
    filepath = cache_dir / f"{url_digest}_{basename}"

    # Only a regular file counts as a cache hit
    if filepath.is_file():
        return str(filepath)

    partial_path = filepath.with_name(filepath.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Atomic rename: the final name only ever refers to a complete file
        os.replace(partial_path, filepath)
        return str(filepath)
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Remove any partial data so it cannot be picked up later
        try:
            if partial_path.exists():
                partial_path.unlink()
        except OSError:
            pass  # best-effort cleanup; the failure was already reported above
        return url  # Return URL if download fails

# Toggle: pre-download every audio file into AUDIO_CACHE_DIR, or hand the raw
# URLs to the models. Either way each sample gets a 'local_audio_path' entry,
# which is what the evaluation cell reads.
PREDOWNLOAD_AUDIO = True  # Set to False to skip pre-downloading

if PREDOWNLOAD_AUDIO:
    print(f"Pre-downloading {len(benchmark_samples)} audio files...")
    print("This may take 5-10 minutes depending on network speed.")

    failed_urls = []
    for sample in tqdm(benchmark_samples, desc="Downloading audio"):
        local_path = download_audio_file(sample['audio_url'], AUDIO_CACHE_DIR)
        sample['local_audio_path'] = local_path

        # download_audio_file returns the URL itself when the download failed
        if local_path == sample['audio_url']:
            failed_urls.append(sample['audio_url'])

    if failed_urls:
        # Do not claim success when some files are missing from the cache
        downloaded = len(benchmark_samples) - len(failed_urls)
        print(f"\n⚠️  WARNING: {len(failed_urls)} of {len(benchmark_samples)} downloads failed; "
              f"those samples will use their URL directly.")
        print(f"✓ {downloaded} audio files available in {AUDIO_CACHE_DIR}")
    else:
        print(f"\n✓ All audio files downloaded to {AUDIO_CACHE_DIR}")
else:
    print("Skipping pre-download. Audio will be downloaded on-the-fly during evaluation.")
    for sample in benchmark_samples:
        sample['local_audio_path'] = sample['audio_url']  # Use URL directly

## Cell 7: Main Evaluation Loop

This cell runs the evaluation for all selected models sequentially.

In [None]:
import gc
import torch
from universal_evaluator import UniversalEvaluator
from datetime import datetime

# Evaluate every selected model in sequence. Each model is loaded, run over
# all benchmark samples, saved, and unloaded again before the next one starts.
# A failure in one model is recorded in all_results and does not stop the run.
import traceback

# Store all results: model name -> {"results", "results_path"} on success,
# or {"error", "traceback"} on failure
all_results = {}

print(f"{'='*80}")
print(f"STARTING EVALUATION")
print(f"{'='*80}")
print(f"Models: {', '.join(MODELS_TO_RUN)}")
print(f"Total samples: {len(benchmark_samples)}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"{'='*80}\n")

# Iterate through models
for model_idx, model_name in enumerate(MODELS_TO_RUN, 1):
    print(f"\n{'='*80}")
    print(f"MODEL {model_idx}/{len(MODELS_TO_RUN)}: {model_name}")
    print(f"{'='*80}")

    model_config = MODEL_REGISTRY[model_name]

    # Reset the per-model handles so the cleanup step below can never act on a
    # stale object left over from the previous iteration (or an undefined name
    # when the very first wrapper fails to construct)
    model = None
    evaluator = None

    # Check VRAM before loading (advisory only: a shortfall just warns)
    if torch.cuda.is_available():
        free_vram = torch.cuda.mem_get_info()[0] / 1e9
        print(f"Free VRAM: {free_vram:.2f} GB (required: {model_config['vram_gb']} GB)")

        if free_vram < model_config['vram_gb']:
            print(f"⚠️  WARNING: Insufficient VRAM. Attempting to proceed anyway...")

    try:
        # Initialize model wrapper
        print(f"\nInitializing {model_name}...")
        wrapper_class = model_config["wrapper_class"]
        wrapper_args = model_config["wrapper_args"]

        model = wrapper_class(**wrapper_args)

        # Load model
        print(f"Loading model weights...")
        model.load_model()
        print(f"✓ {model_name} loaded successfully")

        # Create evaluator
        print(f"\nCreating evaluator...")
        evaluator = UniversalEvaluator(
            model=model,
            output_dir=OUTPUT_DIR,
            prompt_template=PROMPT_TEMPLATE,
            enable_checkpointing=ENABLE_CHECKPOINTING,
        )
        print(f"✓ Evaluator ready")

        # Prepare samples for evaluator
        eval_samples = [
            {
                "audio_path": sample["local_audio_path"],
                "reference": sample["caption"],
            }
            for sample in benchmark_samples
        ]

        # Run evaluation
        print(f"\nStarting evaluation of {len(eval_samples)} samples...")
        print(f"Estimated time: {model_config['estimated_time_per_sample'] * len(eval_samples) / 60:.1f} minutes")
        print(f"Checkpointing: {'Enabled' if ENABLE_CHECKPOINTING else 'Disabled'}")
        print()

        checkpoint_path = f"{OUTPUT_DIR}/{model_name}_checkpoint.json"

        results = evaluator.evaluate_batch(
            samples=eval_samples,
            checkpoint_path=checkpoint_path if ENABLE_CHECKPOINTING else None,
        )

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_filename = f"{model_name}_{timestamp}_results.json"
        results_path = evaluator.save_results(filename=results_filename)

        print(f"\n✓ {model_name} evaluation complete!")
        print(f"✓ Results saved to: {results_path}")

        # Store results
        all_results[model_name] = {
            "results": results,
            "results_path": results_path,
        }

        # Print summary
        evaluator.print_summary()

    except Exception as e:
        # Top-level boundary for one model: record the failure and move on
        print(f"\n❌ ERROR evaluating {model_name}: {e}")
        traceback.print_exc()

        all_results[model_name] = {
            "error": str(e),
            "traceback": traceback.format_exc(),
        }

    finally:
        # Clean up model from memory
        print(f"\nCleaning up {model_name}...")
        if model is not None:
            try:
                model.unload()
            except Exception as unload_error:
                # Cleanup stays best-effort, but the failure is no longer hidden
                print(f"⚠️  WARNING: unloading {model_name} failed: {unload_error}")

        # Drop our references BEFORE collecting; otherwise the wrapper (and the
        # evaluator that holds it) stay reachable and nothing can be reclaimed
        model = None
        evaluator = None

        # Force garbage collection
        gc.collect()
        torch.cuda.empty_cache()

        if torch.cuda.is_available():
            free_vram = torch.cuda.mem_get_info()[0] / 1e9
            print(f"Free VRAM after cleanup: {free_vram:.2f} GB")

        print(f"✓ Cleanup complete\n")

print(f"\n{'='*80}")
print(f"ALL EVALUATIONS COMPLETE")
print(f"{'='*80}")
print(f"\nResults summary:")
for model_name, result in all_results.items():
    if "error" in result:
        print(f"  {model_name}: FAILED - {result['error']}")
    else:
        print(f"  {model_name}: SUCCESS - {result['results_path']}")

print(f"\n✓ All results saved to Google Drive: {OUTPUT_DIR}")

## Cell 8: View Results Summary

In [None]:
import json
import pandas as pd

def load_results(results_path):
    """Load a results JSON file and return the parsed content.

    Args:
        results_path: Path to the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` yields for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly so reading does not depend on the platform locale
    with open(results_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Build a one-row-per-model summary table from the saved result files, print
# it, and store it as a timestamped CSV next to the results.
# The timestamp below needs `datetime`, which this cell's own imports do not provide.
from datetime import datetime

# Create summary table
summary_data = []

for model_name, result in all_results.items():
    # Failed models have no results file to summarise
    if "error" in result:
        continue

    # Load full results
    full_results = load_results(result['results_path'])
    metadata = full_results['metadata']

    total_results = metadata['total_results']
    total_cost = metadata['total_cost_usd']
    # Guard against a run that produced zero results (would divide by zero)
    cost_per_sample = total_cost / total_results if total_results else 0.0

    summary_data.append({
        'Model': model_name,
        'Total Samples': total_results,
        'Avg Latency (s)': f"{metadata['average_latency']:.2f}",
        'Total Cost ($)': f"{total_cost:.4f}",
        'Cost per Sample ($)': f"{cost_per_sample:.4f}",
    })

# Create DataFrame
summary_df = pd.DataFrame(summary_data)

print(f"\n{'='*80}")
print(f"EVALUATION SUMMARY")
print(f"{'='*80}\n")
print(summary_df.to_string(index=False))
print(f"\n{'='*80}\n")

# Save summary to CSV
summary_csv_path = f"{OUTPUT_DIR}/evaluation_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
summary_df.to_csv(summary_csv_path, index=False)
print(f"✓ Summary saved to: {summary_csv_path}")

## Cell 9: Download Results to Local Machine

In [None]:
from google.colab import files
import zipfile

# Bundle every result file plus the summary CSV into one timestamped ZIP under
# /content and hand it to the browser for download.
_archive_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_path = f"/content/AcousticLLMeval_Results_{_archive_stamp}.zip"

print("Creating ZIP archive...")
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Per-model result files; entries for failed models carry no "results_path"
    for model_name, result in all_results.items():
        if "results_path" not in result:
            continue
        _model_results_file = result['results_path']
        zipf.write(_model_results_file, arcname=os.path.basename(_model_results_file))

    # Summary CSV, stored flat (basename only) when it exists on disk
    if os.path.exists(summary_csv_path):
        _csv_name = os.path.basename(summary_csv_path)
        zipf.write(summary_csv_path, arcname=_csv_name)

print(f"✓ ZIP archive created: {zip_path}")
print("Downloading...")

# Trigger the browser download of the archive
files.download(zip_path)

print("\n✓ Download complete!")
print(f"Results are also saved in Google Drive: {OUTPUT_DIR}")