## 1️⃣ Setup & Installation

In [None]:
# Install required packages
# (IPython shell escape: runs pip from the notebook; -q keeps the output quiet)
!pip install -q httpx aiohttp datasets litellm tqdm nest_asyncio python-dotenv

# Enable nested asyncio for Colab
# Later cells call asyncio.get_event_loop().run_until_complete(...); applying
# nest_asyncio is what lets that work while the notebook's loop is running.
import nest_asyncio
nest_asyncio.apply()

print("‚úÖ Dependencies installed!")

In [None]:
# Configuration
import os

# =============================================================================
# CONFIGURE YOUR API KEY HERE
# =============================================================================
# Option 1: Set directly (not recommended for shared notebooks)
# AETHER_API_KEY = "am_live_your_key_here"

# Option 2: Use Colab secrets (recommended) - add the key in the left sidebar
# under "Secrets". The import lives inside the try so this cell also runs
# outside Colab, where the google.colab package is not installed.
try:
    from google.colab import userdata
    AETHER_API_KEY = userdata.get('AETHER_API_KEY')
except Exception:
    # Not running in Colab, or the secret is missing / not accessible.
    AETHER_API_KEY = None

# Option 3: Environment variable. `or` (rather than a getenv default) makes an
# empty AETHER_API_KEY fall through to AETHERMIND_API_KEY as well.
if not AETHER_API_KEY:
    AETHER_API_KEY = os.getenv('AETHER_API_KEY') or os.getenv('AETHERMIND_API_KEY')

# =============================================================================
# API CONFIGURATION
# =============================================================================
PRODUCTION_API = "https://aetheragi.onrender.com"
LOCAL_API = "http://localhost:8000"

# Choose which API to use
API_BASE = PRODUCTION_API  # Change to LOCAL_API for local testing

# =============================================================================
# BENCHMARK SETTINGS - FULL DATASET BY DEFAULT
# =============================================================================
# 0 = ALL questions (full benchmark for best results)
# Set to a number like 20 or 50 for quick testing
QUESTIONS_PER_FAMILY = 0  # Full dataset for official benchmark results

MAX_CONCURRENT_FAMILIES = 8   # Parallel benchmark families
MAX_CONCURRENT_QUESTIONS = 10 # Parallel API calls per family
TIMEOUT_SECONDS = 180  # Per-question timeout (increased for full dataset)

# Dataset sizes (for reference):
# GSM8K:       1,319 test questions
# MMLU:       14,042 test questions
# ARC:         1,172 test questions
# HellaSwag:  10,042 validation questions
# WinoGrande:  1,267 validation questions
# TruthfulQA:    817 validation questions
# TOTAL:     ~28,659 questions

print(f"üåê API Endpoint: {API_BASE}")
print(f"üîë API Key: {'‚úÖ Configured' if AETHER_API_KEY else '‚ö†Ô∏è Not set (will use unauthenticated mode)'}")
print(f"üìä Questions per family: {'ALL (full dataset)' if QUESTIONS_PER_FAMILY == 0 else QUESTIONS_PER_FAMILY}")
print(f"üîÑ Max concurrent families: {MAX_CONCURRENT_FAMILIES}")
print(f"‚ö° Max concurrent questions: {MAX_CONCURRENT_QUESTIONS}")

## 2️⃣ Benchmark Client & Utilities

In [None]:
import asyncio
import httpx
import json
import re
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
from enum import Enum
from tqdm.asyncio import tqdm
import random


class BenchmarkType(Enum):
    """Broad capability category that a benchmark family is filed under."""
    MATH_REASONING = "math_reasoning"
    KNOWLEDGE = "knowledge"
    CODING = "coding"
    LOGICAL_REASONING = "logical_reasoning"
    LANGUAGE = "language"


@dataclass
class BenchmarkFamily:
    """Configuration for a benchmark family."""
    # Human-readable name used in progress and summary output.
    name: str
    # Category this family belongs to.
    benchmark_type: BenchmarkType
    # Short free-text description.
    description: str
    # Selects the prompt's format instructions and the answer-comparison mode.
    answer_format: str  # "number", "letter", "code", "text"
    # Optional regex with one capture group used to pull the answer out of a reply.
    answer_regex: Optional[str] = None
    # Hugging Face dataset identifier (informational in the visible code).
    dataset_source: Optional[str] = None
    hf_subset: Optional[str] = None  # For datasets with subsets like MMLU


# All benchmark families to run, keyed by the same short names that the
# dataset-loader registry uses.
BENCHMARK_FAMILIES = {
    "gsm8k": BenchmarkFamily(
        name="GSM-8K",
        benchmark_type=BenchmarkType.MATH_REASONING,
        description="Grade school math word problems",
        answer_format="number",
        # Optional "####" marker followed by an integer/decimal with optional
        # thousands separators.
        answer_regex=r"(?:####\s*)?(-?\d+(?:,\d{3})*(?:\.\d+)?)",
        dataset_source="gsm8k",
        hf_subset="main",
    ),
    "mmlu": BenchmarkFamily(
        name="MMLU",
        benchmark_type=BenchmarkType.KNOWLEDGE,
        description="Massive Multitask Language Understanding",
        answer_format="letter",
        # A stand-alone option letter delimited by whitespace or punctuation.
        answer_regex=r"(?:^|\s)([A-D])(?:\s|$|\.|,)",
        dataset_source="cais/mmlu",
        hf_subset="all",
    ),
    "arc_challenge": BenchmarkFamily(
        name="ARC-Challenge",
        benchmark_type=BenchmarkType.LOGICAL_REASONING,
        description="AI2 Reasoning Challenge",
        answer_format="letter",
        answer_regex=r"(?:^|\s)([A-D])(?:\s|$|\.|,)",
        dataset_source="allenai/ai2_arc",
        hf_subset="ARC-Challenge",
    ),
    "hellaswag": BenchmarkFamily(
        name="HellaSwag",
        benchmark_type=BenchmarkType.LOGICAL_REASONING,
        description="Commonsense reasoning",
        answer_format="letter",
        answer_regex=r"(?:^|\s)([A-D])(?:\s|$|\.|,)",
        dataset_source="Rowan/hellaswag",
    ),
    "winogrande": BenchmarkFamily(
        name="WinoGrande",
        benchmark_type=BenchmarkType.LANGUAGE,
        description="Pronoun resolution",
        answer_format="number",
        # Word boundaries keep the pattern from picking a digit out of a larger
        # number (previously "12" or "2024" would yield a bogus "2").
        answer_regex=r"\b([12])\b",
        dataset_source="winogrande",
        hf_subset="winogrande_xl",
    ),
    "truthfulqa": BenchmarkFamily(
        name="TruthfulQA",
        benchmark_type=BenchmarkType.KNOWLEDGE,
        description="Questions to test truthfulness",
        answer_format="letter",
        answer_regex=r"(?:^|\s)([A-D])(?:\s|$|\.|,)",
        dataset_source="truthful_qa",
        hf_subset="multiple_choice",
    ),
}

print(f"üìö Loaded {len(BENCHMARK_FAMILIES)} benchmark families:")
for name, family in BENCHMARK_FAMILIES.items():
    print(f"   ‚Ä¢ {family.name}: {family.description}")

In [None]:
# Benchmark mode system prompt (same as local)
# Template with a single {format_instructions} placeholder, filled via
# str.format() with one of the FORMAT_INSTRUCTIONS values below.
BENCHMARK_SYSTEM_PROMPT = """You are being evaluated on a benchmark test. 

CRITICAL RULES:
1. Output ONLY your final answer - no explanations, no reasoning, no tags
2. Do NOT use any XML tags like <think>, <aether-write>, etc.
3. Do NOT explain your work - just give the answer
4. Do NOT say "I think" or "The answer is" - just output the answer itself

ANSWER FORMAT: {format_instructions}
"""

# Per-answer-format instruction text; keys correspond to the answer_format
# values used by the benchmark families ("number", "letter", "code", "text").
FORMAT_INSTRUCTIONS = {
    "number": "Output only the numerical answer (e.g., 42 or -15.5)",
    "letter": "Output only the letter (A, B, C, or D)",
    "code": "Output only Python code inside ```python``` blocks",
    "text": "Output only the answer text, no explanations",
}


class AetherBenchmarkClient:
    """Async client for calling the AetherAGI chat-completions API in benchmark mode.

    Use it as an async context manager so the underlying ``httpx.AsyncClient``
    is created and closed around the benchmark run::

        async with AetherBenchmarkClient(api_base, api_key) as client:
            result = await client.ask("2 + 2 = ?", "number")
    """

    def __init__(self, api_base: str, api_key: Optional[str] = None):
        """Store the endpoint (without trailing slash) and the optional API key."""
        self.api_base = api_base.rstrip('/')
        self.api_key = api_key
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        """Open the HTTP client with the module-level per-request timeout."""
        self._client = httpx.AsyncClient(timeout=httpx.Timeout(TIMEOUT_SECONDS))
        return self

    async def __aexit__(self, *args):
        """Close the HTTP client and drop the reference so it cannot be reused."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _build_system_prompt(self, answer_format: str) -> str:
        """Return the benchmark system prompt specialised for ``answer_format``."""
        return BENCHMARK_SYSTEM_PROMPT.format(
            format_instructions=FORMAT_INSTRUCTIONS.get(answer_format, "Output only your answer.")
        )

    def _strip_tags(self, response: str) -> str:
        """Remove paired ``<aether-*>`` and ``<think>`` blocks from ``response``."""
        if not response:
            return ""
        response = re.sub(r'<aether-[^>]*>.*?</aether-[^>]*>', '', response, flags=re.DOTALL)
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
        return response.strip()

    async def ask(self, question: str, answer_format: str = "text") -> Dict[str, Any]:
        """Ask a benchmark question via the API.

        Args:
            question: Fully formatted question text sent as the user message.
            answer_format: Key selecting the answer-format instructions.

        Returns:
            A dict with keys ``response`` (tag-stripped reply), ``raw_response``,
            ``latency_ms``, ``tokens_used`` and ``error`` (``None`` on success,
            otherwise a message). Failures are reported through ``error`` rather
            than raised, so one bad request cannot abort a whole run.
        """
        # perf_counter is monotonic, so latency is immune to wall-clock changes.
        start_time = time.perf_counter()

        def failure(raw: str, error: str) -> Dict[str, Any]:
            return {
                "response": "",
                "raw_response": raw,
                "latency_ms": (time.perf_counter() - start_time) * 1000,
                "tokens_used": 0,
                "error": error,
            }

        if self._client is None:
            # Report misuse clearly instead of an opaque NoneType attribute error.
            message = "Client not started; use 'async with AetherBenchmarkClient(...)'"
            return failure(f"Error: {message}", message)

        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["X-Aether-Key"] = self.api_key

        payload = {
            "model": "aethermind-v1",
            "user": "colab_benchmark_runner",
            "messages": [
                {"role": "system", "content": self._build_system_prompt(answer_format)},
                {"role": "user", "content": question},
            ],
            "metadata": {
                "benchmark_mode": True,
                "answer_format": answer_format,
            }
        }

        try:
            response = await self._client.post(
                f"{self.api_base}/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
            data = response.json()

            raw_response = data["choices"][0]["message"]["content"] or ""
            # "usage" may be absent or an explicit null; neither should turn a
            # valid answer into an error.
            usage = data.get("usage") or {}

            return {
                "response": self._strip_tags(raw_response),
                "raw_response": raw_response,
                "latency_ms": (time.perf_counter() - start_time) * 1000,
                "tokens_used": usage.get("total_tokens") or 0,
                "error": None,
            }
        except httpx.HTTPStatusError as e:
            return failure(f"HTTP Error: {e.response.status_code}", str(e))
        except Exception as e:
            # Boundary handler: network errors, timeouts, malformed payloads.
            return failure(f"Error: {str(e)}", str(e))

print("‚úÖ AetherBenchmarkClient ready!")

## 3️⃣ Dataset Loaders

In [None]:
from datasets import load_dataset

@dataclass
class Question:
    """A benchmark question."""
    # Unique identifier, built by the loaders as "<family>_<index>".
    id: str
    # Fully formatted prompt text (question plus any answer options).
    text: str
    # Gold answer as a string (number, option letter, or "1"/"2").
    correct_answer: str
    # Optional extra information; each instance gets its own dict.
    metadata: Dict = field(default_factory=dict)


def load_gsm8k_questions(num_samples: int) -> List[Question]:
    """Build Question objects from the GSM8K test split.

    A positive ``num_samples`` draws that many rows from a seed-42 shuffle;
    zero or a negative value uses every row in dataset order. The gold answer
    is the number after the "####" marker with thousands separators removed,
    or "0" when no such marker is found.
    """
    dataset = load_dataset("gsm8k", "main", split="test")
    if num_samples > 0:
        rows = list(dataset.shuffle(seed=42).select(range(min(num_samples, len(dataset)))))
    else:
        rows = list(dataset)
        print(f"   üì• GSM8K: Loading ALL {len(rows)} questions...")

    def gold_answer(solution_text):
        # The reference solution ends with "#### <final number>".
        found = re.search(r"####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)", solution_text)
        if found is None:
            return "0"
        return found.group(1).replace(",", "")

    return [
        Question(
            id=f"gsm8k_{idx}",
            text=row["question"],
            correct_answer=gold_answer(row["answer"]),
        )
        for idx, row in enumerate(rows)
    ]


def load_mmlu_questions(num_samples: int) -> List[Question]:
    """Build Question objects from the MMLU "all" test split.

    A positive ``num_samples`` draws that many rows from a seed-42 shuffle;
    zero or a negative value uses every row. Options are lettered from "A" in
    dataset order and the subject is kept in the question metadata.
    """
    dataset = load_dataset("cais/mmlu", "all", split="test")
    if num_samples > 0:
        rows = list(dataset.shuffle(seed=42).select(range(min(num_samples, len(dataset)))))
    else:
        rows = list(dataset)
        print(f"   üì• MMLU: Loading ALL {len(rows)} questions...")

    built = []
    for idx, row in enumerate(rows):
        option_lines = "".join(
            f"{chr(65+pos)}) {option}\n" for pos, option in enumerate(row["choices"])
        )
        prompt = f"{row['question']}\n\n" + option_lines

        # The answer is an option index when it is an int; any other value is
        # used as-is.
        answer = row["answer"]
        gold = chr(65 + answer) if isinstance(answer, int) else answer

        built.append(
            Question(
                id=f"mmlu_{idx}",
                text=prompt,
                correct_answer=gold,
                metadata={"subject": row.get("subject", "unknown")},
            )
        )
    return built


def load_arc_questions(num_samples: int) -> List[Question]:
    """Load ARC-Challenge questions. num_samples=0 loads ALL questions.

    Args:
        num_samples: Number of questions to draw from a seed-42 shuffle; zero
            or a negative value loads the whole test split in dataset order.

    Returns:
        One Question per row, with lettered options and a letter gold answer.
    """
    ds = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")
    if num_samples <= 0:
        samples = list(ds)
        print(f"   üì• ARC-Challenge: Loading ALL {len(samples)} questions...")
    else:
        samples = list(ds.shuffle(seed=42).select(range(min(num_samples, len(ds)))))

    # Some ARC items label their options "1".."4" instead of "A".."D". This
    # benchmark asks for and extracts a letter, so numeric labels (and the
    # matching answer key) are mapped onto letters; anything else is kept as-is.
    digit_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

    def as_letter(label):
        label = str(label).strip()
        return digit_to_letter.get(label, label)

    questions = []
    for i, item in enumerate(samples):
        choices = item["choices"]
        option_lines = "".join(
            f"{as_letter(label)}) {text}\n"
            for label, text in zip(choices["label"], choices["text"])
        )

        questions.append(Question(
            id=f"arc_{i}",
            text=f"{item['question']}\n\n{option_lines}",
            correct_answer=as_letter(item["answerKey"]),
        ))
    return questions


def load_hellaswag_questions(num_samples: int) -> List[Question]:
    """Build Question objects from the HellaSwag validation split.

    A positive ``num_samples`` draws that many rows from a seed-42 shuffle;
    zero or a negative value uses every row. Each candidate ending becomes a
    lettered option and the row's label is converted to the matching letter.
    """
    dataset = load_dataset("Rowan/hellaswag", split="validation")
    if num_samples > 0:
        rows = list(dataset.shuffle(seed=42).select(range(min(num_samples, len(dataset)))))
    else:
        rows = list(dataset)
        print(f"   üì• HellaSwag: Loading ALL {len(rows)} questions...")

    built = []
    for idx, row in enumerate(rows):
        pieces = [f"Complete the following:\n\n{row['ctx']}\n\n"]
        pieces.extend(
            f"{chr(65+pos)}) {ending}\n" for pos, ending in enumerate(row["endings"])
        )

        # The label is converted with int() and used as an index into the endings.
        gold_index = int(row["label"])

        built.append(
            Question(
                id=f"hellaswag_{idx}",
                text="".join(pieces),
                correct_answer=chr(65 + gold_index),
            )
        )
    return built


def load_winogrande_questions(num_samples: int) -> List[Question]:
    """Build Question objects from the WinoGrande XL validation split.

    A positive ``num_samples`` draws that many rows from a seed-42 shuffle;
    zero or a negative value uses every row. The two candidate fillers are
    shown as options 1 and 2 and the row's ``answer`` field is the gold value.
    """
    dataset = load_dataset("winogrande", "winogrande_xl", split="validation")
    if num_samples > 0:
        rows = list(dataset.shuffle(seed=42).select(range(min(num_samples, len(dataset)))))
    else:
        rows = list(dataset)
        print(f"   üì• WinoGrande: Loading ALL {len(rows)} questions...")

    return [
        Question(
            id=f"winogrande_{idx}",
            text=(
                f"{row['sentence']}\n\nWhich option fits best in the blank?\n"
                f"1) {row['option1']}\n2) {row['option2']}\n\nAnswer with 1 or 2."
            ),
            correct_answer=row["answer"],
        )
        for idx, row in enumerate(rows)
    ]


def load_truthfulqa_questions(num_samples: int) -> List[Question]:
    """Load TruthfulQA questions. num_samples=0 loads ALL questions.

    Each question shows the correct ``mc1`` choice plus up to three incorrect
    ones, in a deterministic (seeded) random order.

    Args:
        num_samples: Number of questions to draw from a seed-42 shuffle; zero
            or a negative value loads the whole validation split.

    Returns:
        One Question per row that has a correct choice; rows without one are
        skipped instead of being given a made-up gold answer.
    """
    import random  # local import keeps this loader self-contained

    ds = load_dataset("truthful_qa", "multiple_choice", split="validation")
    if num_samples <= 0:
        samples = list(ds)
        print(f"   üì• TruthfulQA: Loading ALL {len(samples)} questions...")
    else:
        samples = list(ds.shuffle(seed=42).select(range(min(num_samples, len(ds)))))

    # Options must not be shown in dataset order: showing the first four
    # choices as-is can drop the correct one (it then silently defaulted to
    # "A"), and a fixed position for the correct choice lets a constant guess
    # score perfectly.
    # NOTE(review): the Hugging Face copy appears to list the correct mc1
    # choice first in every row - confirm against the dataset card.
    rng = random.Random(42)

    questions = []
    for i, item in enumerate(samples):
        targets = item["mc1_targets"]
        correct_choices = []
        wrong_choices = []
        for choice, label in zip(targets["choices"], targets["labels"]):
            (correct_choices if label == 1 else wrong_choices).append(choice)

        if not correct_choices:
            continue  # no gold answer available for this row

        # Always keep the correct choice; cap the options at four (A-D).
        options = [(correct_choices[0], True)]
        options.extend((choice, False) for choice in wrong_choices[:3])
        rng.shuffle(options)

        formatted = f"{item['question']}\n\n"
        correct_letter = ""
        for j, (choice, is_correct) in enumerate(options):
            letter = chr(65 + j)
            formatted += f"{letter}) {choice}\n"
            if is_correct:
                correct_letter = letter

        questions.append(Question(
            id=f"truthfulqa_{i}",
            text=formatted,
            correct_answer=correct_letter,
        ))
    return questions


# Map family names to loaders
# Each loader takes the number of questions to sample (0 or less = all) and
# returns a list of Question objects.
DATASET_LOADERS = {
    "gsm8k": load_gsm8k_questions,
    "mmlu": load_mmlu_questions,
    "arc_challenge": load_arc_questions,
    "hellaswag": load_hellaswag_questions,
    "winogrande": load_winogrande_questions,
    "truthfulqa": load_truthfulqa_questions,
}

print("‚úÖ Dataset loaders ready!")
print("üìä Full dataset sizes:")
print("   ‚Ä¢ GSM8K:       1,319 questions")
print("   ‚Ä¢ MMLU:       14,042 questions") 
print("   ‚Ä¢ ARC:         1,172 questions")
print("   ‚Ä¢ HellaSwag:  10,042 questions")
print("   ‚Ä¢ WinoGrande:  1,267 questions")
print("   ‚Ä¢ TruthfulQA:    817 questions")
print("   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print("   ‚Ä¢ TOTAL:     ~28,659 questions")

## 4️⃣ Answer Checking & Scoring

In [None]:
def extract_answer(response: str, answer_format: str, answer_regex: Optional[str]) -> str:
    """Extract the answer from a model response.

    Args:
        response: Model reply (already stripped of markup by the caller).
        answer_format: "letter", "number", or any other format name.
        answer_regex: Optional pattern whose capture group holds the answer.

    Returns:
        The last regex match when the pattern matches; otherwise the first
        stand-alone A-D letter ("letter") or first number ("number"); otherwise
        the first whitespace-separated token, or "" for an empty response.
    """
    if not response:
        return ""

    response = response.strip()

    if answer_regex:
        if answer_format == "letter":
            # Try the exact-case pattern first so ordinary lowercase words
            # (the article "a" in "B. It is a fact") cannot override a real
            # option letter. Fall back to case-insensitive matching for
            # replies such as "b".
            matches = re.findall(answer_regex, response) or re.findall(
                answer_regex, response, re.IGNORECASE
            )
        else:
            matches = re.findall(answer_regex, response, re.IGNORECASE)
        if matches:
            final = matches[-1]  # Take last match (final answer)
            if isinstance(final, tuple):
                # Patterns with several groups yield tuples; use the first
                # non-empty group instead of failing on tuple.strip().
                final = next((group for group in final if group), "")
            if final:
                return final.strip()

    # Fallback extraction
    if answer_format == "letter":
        # Look for standalone letter
        match = re.search(r"\b([A-D])\b", response.upper())
        if match:
            return match.group(1)
    elif answer_format == "number":
        # Look for number
        match = re.search(r"(-?\d+(?:\.\d+)?)", response.replace(",", ""))
        if match:
            return match.group(1)

    tokens = response.split()
    return tokens[0] if tokens else ""


def check_answer(extracted: str, correct: str, answer_format: str) -> bool:
    """Check whether the extracted answer matches the correct answer.

    Args:
        extracted: Answer pulled from the model response.
        correct: Gold answer.
        answer_format: "number" enables numeric comparison; every other format
            is compared as case-insensitive text.

    Returns:
        True on a match. An empty ``extracted`` or ``correct`` never matches.
    """
    if not extracted or not correct:
        return False

    extracted = extracted.strip().upper()
    correct = correct.strip().upper()

    if answer_format == "number":
        try:
            ext_num = float(extracted.replace(",", ""))
            cor_num = float(correct.replace(",", ""))
        except ValueError:
            # Not numeric after all: fall back to exact text comparison.
            return extracted == correct
        return abs(ext_num - cor_num) < 0.01  # Fuzzy match for numbers

    return extracted == correct


@dataclass
class FamilyResult:
    """Aggregated outcome of running one benchmark family."""
    family_name: str
    total_questions: int
    correct: int
    score: float
    avg_latency_ms: float
    total_tokens: int
    errors: int
    details: List[Dict] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return a JSON-friendly summary; per-question details are left out.

        The score is rendered as a percentage string with one decimal and the
        average latency as a whole-number string.
        """
        score_pct = self.score * 100
        summary = dict(
            family=self.family_name,
            total=self.total_questions,
            correct=self.correct,
        )
        summary["score"] = f"{score_pct:.1f}%"
        summary["avg_latency_ms"] = f"{self.avg_latency_ms:.0f}"
        summary["total_tokens"] = self.total_tokens
        summary["errors"] = self.errors
        return summary

print("‚úÖ Answer checking ready!")

## 5️⃣ Concurrent Benchmark Runner

In [None]:
async def run_single_family(
    client: AetherBenchmarkClient,
    family_name: str,
    family: BenchmarkFamily,
    questions: List[Question],
    semaphore: asyncio.Semaphore,
) -> FamilyResult:
    """Run a single benchmark family, issuing its questions concurrently.

    Every question becomes its own coroutine; ``semaphore`` bounds how many
    API calls are in flight at once. Previously the questions were awaited one
    after another, so the semaphore never limited anything and each family ran
    strictly sequentially.

    Args:
        client: Open benchmark client used for the API calls.
        family_name: Short key of the family (used in progress output/results).
        family: Family configuration (answer format and extraction regex).
        questions: Questions to ask.
        semaphore: Limit on concurrent API calls.

    Returns:
        FamilyResult with totals and per-question details in the same order as
        ``questions``.
    """
    total = len(questions)
    completed = 0

    print(f"\nüöÄ Starting {family.name} ({total} questions)...")

    async def evaluate(question: Question):
        nonlocal completed
        async with semaphore:  # Limit concurrent API calls
            result = await client.ask(question.text, family.answer_format)

        if result["error"]:
            extracted = ""
            is_correct = False
        else:
            extracted = extract_answer(result["response"], family.answer_format, family.answer_regex)
            is_correct = check_answer(extracted, question.correct_answer, family.answer_format)

        # Progress indicator: counts completions, which may finish out of order.
        completed += 1
        status = "‚úì" if is_correct else "‚úó"
        print(f"   [{family_name}] {completed}/{total} {status}", end="\r")
        return result, extracted, is_correct

    # gather() returns outcomes in input order, so details stay aligned with
    # the question list regardless of completion order.
    outcomes = await asyncio.gather(*(evaluate(question) for question in questions))

    correct = 0
    total_latency = 0.0
    total_tokens = 0
    errors = 0
    details = []

    for question, (result, extracted, is_correct) in zip(questions, outcomes):
        if result["error"]:
            errors += 1
        if is_correct:
            correct += 1

        total_latency += result["latency_ms"]
        total_tokens += result["tokens_used"]

        details.append({
            "question_id": question.id,
            "correct_answer": question.correct_answer,
            "extracted": extracted,
            "is_correct": is_correct,
            "latency_ms": result["latency_ms"],
        })

    score = correct / total if questions else 0
    avg_latency = total_latency / total if questions else 0

    print(f"\n‚úÖ {family.name}: {correct}/{total} ({score*100:.1f}%)")

    return FamilyResult(
        family_name=family_name,
        total_questions=total,
        correct=correct,
        score=score,
        avg_latency_ms=avg_latency,
        total_tokens=total_tokens,
        errors=errors,
        details=details,
    )


async def run_all_benchmarks_concurrent(
    api_base: str,
    api_key: Optional[str],
    families: Dict[str, BenchmarkFamily],
    questions_per_family: int,
    max_concurrent: int = 4,
) -> Dict[str, FamilyResult]:
    """
    Load every requested family's questions, run the families together and
    print a score table.

    Args:
        api_base: API endpoint URL
        api_key: Optional API key
        families: Benchmark families to run, keyed by loader name
        questions_per_family: Number of questions per family, passed to each loader
        max_concurrent: Size of the semaphore shared by all families

    Returns:
        Dict mapping family name to FamilyResult; families that failed to load
        or raised while running are left out.
    """
    banner = "=" * 60
    print(banner)
    print("üß† AetherMind Concurrent Benchmark Runner")
    print(f"üåê API: {api_base}")
    print(f"üìä Families: {len(families)}")
    print(f"‚ùì Questions per family: {questions_per_family}")
    print(f"üîÑ Max concurrent calls: {max_concurrent}")
    print(banner)

    # Phase 1: load every dataset up front; a failed load only drops that family.
    print("\nüì• Loading datasets...")
    family_questions = {}
    for key, fam in families.items():
        if key not in DATASET_LOADERS:
            print(f"   ‚ö†Ô∏è {fam.name}: No loader available")
            continue
        try:
            loaded = DATASET_LOADERS[key](questions_per_family)
            family_questions[key] = loaded
            print(f"   ‚úÖ {fam.name}: {len(loaded)} questions loaded")
        except Exception as exc:
            print(f"   ‚ùå {fam.name}: Failed to load - {exc}")

    # One semaphore shared by all families.
    limiter = asyncio.Semaphore(max_concurrent)

    # Phase 2: one coroutine per family, gathered together.
    print("\nüèÉ Running benchmarks concurrently...")
    started = time.time()

    async with AetherBenchmarkClient(api_base, api_key) as client:
        pending = [
            run_single_family(client, key, families[key], loaded, limiter)
            for key, loaded in family_questions.items()
        ]
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

    total_time = time.time() - started

    # Phase 3: keep successful families, report the ones that raised.
    results = {}
    for key, outcome in zip(family_questions.keys(), outcomes):
        if not isinstance(outcome, Exception):
            results[key] = outcome
            continue
        print(f"‚ùå {key} failed: {outcome}")

    # Phase 4: summary table, best score first.
    print("\n" + banner)
    print("üìä BENCHMARK RESULTS SUMMARY")
    print(banner)

    total_correct = 0
    total_questions = 0
    for entry in results.values():
        total_correct += entry.correct
        total_questions += entry.total_questions
    overall_score = total_correct / total_questions if total_questions else 0

    rule = "-" * 56
    print(f"\n{'Family':<20} {'Score':<12} {'Correct':<12} {'Latency':<12}")
    print(rule)
    ranked = sorted(results.values(), key=lambda entry: entry.score, reverse=True)
    for entry in ranked:
        print(f"{entry.family_name:<20} {entry.score*100:>6.1f}%     {entry.correct:>3}/{entry.total_questions:<3}       {entry.avg_latency_ms:>6.0f}ms")

    print(rule)
    print(f"{'OVERALL':<20} {overall_score*100:>6.1f}%     {total_correct:>3}/{total_questions:<3}")
    print(f"\n‚è±Ô∏è Total time: {total_time:.1f}s")
    print(f"üìÖ Timestamp: {datetime.now(timezone.utc).isoformat()}")

    return results

print("‚úÖ Concurrent runner ready!")

## 6️⃣ Test API Connection

In [None]:
# Quick health check on the API
import httpx

async def test_api_connection():
    """Check that the configured API is reachable.

    Tries ``GET {API_BASE}/health`` first; if that does not answer 200 (or
    fails), falls back to a minimal chat-completion request.

    Returns:
        True when either probe succeeds, False otherwise. Failures are printed
        rather than raised.
    """
    print(f"üîç Testing connection to {API_BASE}...")

    async with httpx.AsyncClient(timeout=30) as client:
        try:
            # Try health endpoint first
            response = await client.get(f"{API_BASE}/health")
            if response.status_code == 200:
                print(f"‚úÖ API is healthy!")
                return True
        except Exception:
            # Best effort: the health probe may be missing or time out, so fall
            # through to the chat probe. `Exception` (not a bare except) lets
            # cancellation and KeyboardInterrupt propagate.
            pass

        try:
            # Try a simple chat completion
            headers = {"Content-Type": "application/json"}
            if AETHER_API_KEY:
                headers["X-Aether-Key"] = AETHER_API_KEY

            response = await client.post(
                f"{API_BASE}/v1/chat/completions",
                headers=headers,
                json={
                    "model": "aethermind-v1",
                    "user": "connection_test",
                    "messages": [{"role": "user", "content": "Say 'OK' and nothing else."}],
                }
            )

            if response.status_code == 200:
                data = response.json()
                # Content can be null; treat it as an empty reply rather than
                # reporting a reachable API as "Connection failed".
                reply = data["choices"][0]["message"]["content"] or ""
                print(f"‚úÖ API responded: {reply[:50]}...")
                return True
            else:
                print(f"‚ö†Ô∏è API returned status {response.status_code}")
                print(f"   Response: {response.text[:200]}")
                return False
        except Exception as e:
            print(f"‚ùå Connection failed: {e}")
            return False

# Run the test
# Drives the coroutine to completion on the current event loop; inside a
# notebook this relies on nest_asyncio.apply() from the setup cell.
api_ok = asyncio.get_event_loop().run_until_complete(test_api_connection())

# Print hints only when both connection probes failed.
if not api_ok:
    print("\nüí° Troubleshooting tips:")
    print("   1. Check if the API is running at the specified URL")
    print("   2. Verify your API key is correct")
    print("   3. The Render service may be sleeping - try again in 30s")

## 7️⃣ Run All Benchmarks! 🚀

In [None]:
# RUN ALL BENCHMARK FAMILIES CONCURRENTLY!

# Select which families to run (comment out any you want to skip)
FAMILIES_TO_RUN = {
    "gsm8k": BENCHMARK_FAMILIES["gsm8k"],
    "mmlu": BENCHMARK_FAMILIES["mmlu"],
    "arc_challenge": BENCHMARK_FAMILIES["arc_challenge"],
    "hellaswag": BENCHMARK_FAMILIES["hellaswag"],
    "winogrande": BENCHMARK_FAMILIES["winogrande"],
    "truthfulqa": BENCHMARK_FAMILIES["truthfulqa"],
}

# Run the benchmarks!
# `results` maps family name -> FamilyResult and is read by the save and
# visualization cells below.
# NOTE(review): max_concurrent sizes the semaphore that guards API calls, yet it
# is fed MAX_CONCURRENT_FAMILIES; MAX_CONCURRENT_QUESTIONS is never passed
# anywhere - confirm which limit is intended.
results = asyncio.get_event_loop().run_until_complete(
    run_all_benchmarks_concurrent(
        api_base=API_BASE,
        api_key=AETHER_API_KEY,
        families=FAMILIES_TO_RUN,
        questions_per_family=QUESTIONS_PER_FAMILY,
        max_concurrent=MAX_CONCURRENT_FAMILIES,
    )
)

## 8️⃣ Save Results

In [None]:
# Save results to JSON
import json
from datetime import datetime, timezone

# Compute the denominator once. `results` can be non-empty while every family
# holds zero questions, so guard on the question count rather than on
# `results` to avoid a ZeroDivisionError.
_total_correct = sum(r.correct for r in results.values())
_total_questions = sum(r.total_questions for r in results.values())

output = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "api_endpoint": API_BASE,
    "questions_per_family": QUESTIONS_PER_FAMILY,
    "results": {name: r.to_dict() for name, r in results.items()},
    "overall_score": _total_correct / _total_questions if _total_questions else 0,
}

# Save locally (the file name uses the local wall-clock time)
filename = f"aethermind_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"\nüìÅ Results saved to: {filename}")

# Download link for Colab
try:
    from google.colab import files
    files.download(filename)
    print("üì• Download started!")
except Exception:
    # Outside Colab the import fails; the file is still saved locally.
    print("   (Run in Colab to enable auto-download)")

## 📊 Visualization

In [None]:
# Visualize results
import matplotlib.pyplot as plt

# Horizontal bar chart of per-family scores, best score first.
if not results:
    print("No results to visualize")
else:
    families = list(results)
    names = [results[key].family_name for key in families]
    scores = [results[key].score * 100 for key in families]

    # Sort by score (stable, so ties keep their original order)
    sorted_data = sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)
    names, scores = zip(*sorted_data)

    # Color based on score: green from 70, orange from 50, red below
    colors = []
    for s in scores:
        if s >= 70:
            colors.append('#2ecc71')
        elif s >= 50:
            colors.append('#f39c12')
        else:
            colors.append('#e74c3c')

    plt.figure(figsize=(12, 6))
    bars = plt.barh(names, scores, color=colors)
    plt.xlabel('Score (%)')
    plt.title('üß† AetherMind Benchmark Results')
    plt.xlim(0, 100)

    # Add score labels just right of each bar, vertically centred
    for bar, score in zip(bars, scores):
        label_y = bar.get_y() + bar.get_height()/2
        plt.text(score + 1, label_y, f'{score:.1f}%', va='center', fontsize=10)

    plt.tight_layout()
    plt.savefig('benchmark_results.png', dpi=150)
    plt.show()

    print("\nüìà Chart saved to: benchmark_results.png")

---

## 🔧 Custom Benchmark Run

Use this cell to run specific families with custom settings:

In [None]:
# Custom single-family run
# Uncomment and modify as needed:

# custom_results = asyncio.get_event_loop().run_until_complete(
#     run_all_benchmarks_concurrent(
#         api_base=API_BASE,
#         api_key=AETHER_API_KEY,
#         families={"gsm8k": BENCHMARK_FAMILIES["gsm8k"]},  # Single family
#         questions_per_family=100,  # More questions
#         max_concurrent=2,  # Lower concurrency for stability
#     )
# )