In [1]:
import re

In [2]:
def extract_success_rate_new(log_content, first_n=50, one_hop_count=200, verbose=True):
    """Summarize pass rates from the ``[Result] (PASS|FAIL) <task path>`` lines of a log.

    Results are taken in the order they appear in ``log_content``. Besides the
    overall rate, three positional slices are reported: the first ``first_n``
    results, the first ``one_hop_count`` results ("1-hop") and everything after
    them ("2-hop"). Results are also grouped by the domain name parsed from the
    task file name.

    Args:
        log_content: Full text of the log file.
        first_n: Size of the leading slice reported as ``first_50_success_rate``.
            The key keeps its historical name even when ``first_n`` is not 50.
        one_hop_count: Number of leading results counted as 1-hop; the
            remaining results are counted as 2-hop.
        verbose: When true (default), print the overall and per-domain summary.

    Returns:
        dict with keys ``overall_success_rate``, ``domain_success_rates``,
        ``first_50_success_rate``, ``1hop_success_rate``, ``2hop_success_rate``,
        ``domains`` (domain -> list of ``(status, task_path)``),
        ``domain_counts`` and ``domain_success``. A slice rate is ``0`` when the
        log does not contain enough results to fill that slice.

    Raises:
        ValueError: If ``first_n`` or ``one_hop_count`` is negative.
    """
    if first_n < 0 or one_hop_count < 0:
        raise ValueError("first_n and one_hop_count must be non-negative")

    # Find all result lines with PASS or FAIL
    results = re.findall(r"\[Result\] \((PASS|FAIL)\) (.+?)$", log_content, re.MULTILINE)

    def _count_passes(entries):
        # Number of (status, task_path) entries whose status is PASS.
        return sum(1 for status, _ in entries if status == "PASS")

    # Count the total and successful tasks
    total_tasks = len(results)
    successful_tasks = _count_passes(results)
    first_n_passes = _count_passes(results[:first_n])
    successful_1hop_tasks = _count_passes(results[:one_hop_count])
    successful_2hop_tasks = _count_passes(results[one_hop_count:])

    # Categorize by domain (e.g., "Shopping--12.json" -> "Shopping").
    # NOTE(review): the pattern keeps only the last run of ASCII letters before
    # "--", so a name such as "Google Map--2.json" is filed under "Map".
    # Task paths that do not match are left out of the per-domain statistics.
    domains = {}
    for status, task_path in results:
        domain_match = re.search(r'([A-Za-z]+)--\d+\.json', task_path)
        if domain_match:
            domains.setdefault(domain_match.group(1), []).append((status, task_path))

    domain_counts = {domain: len(entries) for domain, entries in domains.items()}
    domain_success = {domain: _count_passes(entries) for domain, entries in domains.items()}
    # Every domain present in `domains` has at least one entry, so the division is safe.
    domain_success_rates = {
        domain: domain_success[domain] / domain_counts[domain] for domain in domains
    }

    # Calculate average success rates; a slice rate stays 0 unless the slice is full.
    average_success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0
    average_first_n_success_rate = (
        first_n_passes / first_n if first_n > 0 and total_tasks >= first_n else 0
    )
    average_1hop_success_rate = (
        successful_1hop_tasks / one_hop_count
        if one_hop_count > 0 and total_tasks >= one_hop_count
        else 0
    )
    average_2hop_success_rate = (
        successful_2hop_tasks / (total_tasks - one_hop_count)
        if total_tasks > one_hop_count
        else 0
    )

    # Print results
    if verbose:
        print(f"Average Overall Success Rate: {average_success_rate:.2%} (Tasks: {successful_tasks}/{total_tasks})")

        print("\nSuccess Rate by Domain:")
        for domain in sorted(domains.keys()):
            print(f"  {domain}: {domain_success_rates[domain]:.2%} (Tasks: {domain_success[domain]}/{domain_counts[domain]})")

    return {
        "overall_success_rate": average_success_rate,
        "domain_success_rates": domain_success_rates,
        "first_50_success_rate": average_first_n_success_rate,
        "1hop_success_rate": average_1hop_success_rate,
        "2hop_success_rate": average_2hop_success_rate,
        "domains": domains,
        "domain_counts": domain_counts,
        "domain_success": domain_success
    }

In [3]:
# Load one run's log and print its overall and per-domain pass rates.
log_path = '/home/wenyi/CoMEM-Agent/CoMEM-Agent-Inference/log_files/webvoyager/all/log_qwen2.5-vl_baseline.log'

# Decode explicitly rather than relying on the locale default; an undecodable
# byte is replaced instead of aborting the analysis.
with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
    log_content = f.read()

result = extract_success_rate_new(log_content)

Average Overall Success Rate: 10.86% (Tasks: 68/626)

Success Rate by Domain:
  Allrecipes: 15.91% (Tasks: 7/44)
  Alpha: 19.57% (Tasks: 9/46)
  Amazon: 19.51% (Tasks: 8/41)
  Apple: 2.33% (Tasks: 1/43)
  ArXiv: 25.58% (Tasks: 11/43)
  Booking: 3.57% (Tasks: 1/28)
  Coursera: 2.38% (Tasks: 1/42)
  Dictionary: 25.58% (Tasks: 11/43)
  ESPN: 6.82% (Tasks: 3/44)
  Flights: 0.00% (Tasks: 0/41)
  GitHub: 7.32% (Tasks: 3/41)
  Huggingface: 2.33% (Tasks: 1/43)
  Map: 16.67% (Tasks: 7/42)
  News: 9.52% (Tasks: 4/42)
  Search: 2.33% (Tasks: 1/43)


In [4]:
# Load one run's log and print its overall and per-domain pass rates.
log_path = '/home/wenyi/CoMEM-Agent/CoMEM-Agent-Inference/log_files/webvoyager/all/log_agent-qformer-full-sft_fullsft_baseline.log'

# Decode explicitly rather than relying on the locale default; an undecodable
# byte is replaced instead of aborting the analysis.
with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
    log_content = f.read()

result = extract_success_rate_new(log_content)

Average Overall Success Rate: 14.71% (Tasks: 64/435)

Success Rate by Domain:
  Allrecipes: 11.36% (Tasks: 5/44)
  Alpha: 20.45% (Tasks: 9/44)
  Amazon: 11.11% (Tasks: 1/9)
  Apple: 6.98% (Tasks: 3/43)
  Coursera: 19.05% (Tasks: 8/42)
  Dictionary: 34.88% (Tasks: 15/43)
  ESPN: 11.36% (Tasks: 5/44)
  GitHub: 0.00% (Tasks: 0/40)
  Map: 26.83% (Tasks: 11/41)
  News: 14.29% (Tasks: 6/42)
  Search: 2.33% (Tasks: 1/43)


In [5]:
# Load one run's log and print its overall and per-domain pass rates.
log_path = '/home/wenyi/CoMEM-Agent/CoMEM-Agent-Inference/log_files/webvoyager/all/log_qwen2.5-vl_text_action_suggestion.log'

# Decode explicitly rather than relying on the locale default; an undecodable
# byte is replaced instead of aborting the analysis.
with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
    log_content = f.read()

result = extract_success_rate_new(log_content)

Average Overall Success Rate: 14.15% (Tasks: 61/431)

Success Rate by Domain:
  Allrecipes: 13.64% (Tasks: 6/44)
  Alpha: 31.11% (Tasks: 14/45)
  Amazon: 0.00% (Tasks: 0/4)
  Apple: 6.98% (Tasks: 3/43)
  Coursera: 2.38% (Tasks: 1/42)
  Dictionary: 44.19% (Tasks: 19/43)
  ESPN: 11.63% (Tasks: 5/43)
  GitHub: 4.88% (Tasks: 2/41)
  Map: 19.51% (Tasks: 8/41)
  News: 7.14% (Tasks: 3/42)
  Search: 0.00% (Tasks: 0/43)


In [6]:
# Load one run's log and print its overall and per-domain pass rates.
log_path = '/home/wenyi/CoMEM-Agent/CoMEM-Agent-Inference/log_files/webvoyager/all/log_agent-qformer-full-sft-rl_rl_stage1_v1.log'

# Decode explicitly rather than relying on the locale default; an undecodable
# byte is replaced instead of aborting the analysis.
with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
    log_content = f.read()

result = extract_success_rate_new(log_content)

Average Overall Success Rate: 7.14% (Tasks: 15/210)

Success Rate by Domain:
  Apple: 4.65% (Tasks: 2/43)
  Coursera: 0.00% (Tasks: 0/3)
  ESPN: 7.32% (Tasks: 3/41)
  GitHub: 2.50% (Tasks: 1/40)
  Map: 21.95% (Tasks: 9/41)
  Search: 0.00% (Tasks: 0/42)
