# Preprint APIs Testing Notebook

This notebook tests all the preprint server APIs implemented in `preprint_apis.py`.

## Setup and Imports

In [2]:
import sys
import asyncio
import json
from datetime import datetime
from pathlib import Path

# Add the src directory to Python path
sys.path.append('../src')

from agent.tools.preprint_apis import (
    ArxivAPI,
    BioRxivAPI,
    ChemRxivAPI,
    SSRNApi,
    ResearchSquareAPI,
    PreprintAggregator,
    Paper
)

# One representative search phrase per research domain, looked up by key in the test cells
TEST_QUERIES = dict(
    physics='quantum computing',
    biology='CRISPR gene editing',
    medicine='COVID-19 vaccine',
    chemistry='organic synthesis',
    computer_science='machine learning',
    economics='behavioral economics',
    general='climate change',
)

print("✅ Imports successful")

✅ Imports successful


## Helper Functions

In [3]:
def print_paper_summary(paper: "Paper", index: "int | None" = None):
    """Print a formatted, multi-line summary of a paper.

    Args:
        paper: Object exposing ``title``, ``authors``, ``source``,
            ``date_published``, ``relevance_score``, ``quality_score``,
            ``doi``, ``categories`` and ``url`` attributes.
        index: Optional position, shown as an ``"N. "`` prefix before the title.

    The title is limited to 80 characters and the author and category lists to
    three entries. An ellipsis is appended only when something was actually
    cut, and a missing publication date is shown as ``N/A`` instead of raising.
    """
    prefix = f"{index}. " if index is not None else ""

    # Mark truncation only when the title really was shortened
    title = paper.title if len(paper.title) <= 80 else f"{paper.title[:80]}..."
    authors = ', '.join(paper.authors[:3]) + ('...' if len(paper.authors) > 3 else '')
    # Guard against records without a publication date
    published = paper.date_published.strftime('%Y-%m-%d') if paper.date_published else 'N/A'
    categories = ', '.join(paper.categories[:3]) if paper.categories else 'N/A'

    print(f"{prefix}{title}")
    print(f"   Authors: {authors}")
    print(f"   Source: {paper.source} | Date: {published}")
    print(f"   Relevance: {paper.relevance_score:.2f} | Quality: {paper.quality_score:.2f}")
    print(f"   DOI: {paper.doi or 'N/A'}")
    print(f"   Categories: {categories}")
    print(f"   URL: {paper.url}")
    print()

def test_api_results(papers: list, api_name: str, query: str):
    """Test and display API results"""
    print(f"\n{'='*60}")
    print(f"📚 {api_name} API Results for: '{query}'")
    print(f"{'='*60}")
    
    if not papers:
        print("❌ No papers found")
        return False
    
    print(f"✅ Found {len(papers)} papers")
    print("\nTop 3 results:")
    print("-" * 40)
    
    for i, paper in enumerate(papers[:3], 1):
        print_paper_summary(paper, i)
    
    return True

async def test_api_with_timeout(api_func, timeout: int = 30):
    """Test API function with timeout"""
    try:
        return await asyncio.wait_for(api_func(), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"⏰ Request timed out after {timeout} seconds")
        return []
    except Exception as e:
        print(f"❌ Error: {e}")
        return []

print("✅ Helper functions defined")

✅ Helper functions defined


## 1. ArXiv API Testing

Testing the arXiv API with a field-prefixed search query (`all:"deep learning"`), then re-fetching the first result by its ID.

In [18]:
async def test_arxiv_api():
    """Search arXiv with a field-prefixed query, then re-fetch the first hit by ID.

    Returns:
        bool: the result of ``test_api_results`` for the search. The follow-up
        ``get_paper`` lookup is reported but does not affect the return value.
    """
    # arXiv search_query syntax with the `all:` field prefix and a quoted phrase
    query = 'all:"deep learning"'

    async with ArxivAPI() as api:
        papers = await test_api_with_timeout(
            lambda: api.search(query, max_results=10)
        )

        success = test_api_results(papers, "ArXiv", query)

        # Test getting a specific paper
        if papers:
            paper_id = papers[0].id
            print(f"\n🔍 Testing get_paper for ID: {paper_id}")
            # Same timeout/error guard as the search: a failing lookup is
            # reported here instead of aborting the cell. The helper returns
            # [] on failure, which is falsy and takes the "Failed" branch.
            specific_paper = await test_api_with_timeout(
                lambda: api.get_paper(paper_id)
            )
            if specific_paper:
                print("✅ Successfully retrieved specific paper")
                print_paper_summary(specific_paper)
            else:
                print("❌ Failed to retrieve specific paper")

        return success

# Run the test at notebook top level (a bare `await` outside a function is not
# valid in a plain Python script). The flag is kept in a notebook-level
# variable because the final summary and save cells read it by name.
arxiv_success = await test_arxiv_api()
print(f"\nArXiv API Test: {'✅ PASSED' if arxiv_success else '❌ FAILED'}")


📚 ArXiv API Results for: 'all:"deep learning"'
✅ Found 10 papers

Top 3 results:
----------------------------------------
1. One Model To Learn Them All...
   Authors: Lukasz Kaiser, Aidan N. Gomez, Noam Shazeer...
   Source: arxiv | Date: 2017-06-16
   Relevance: 0.11 | Quality: 0.40
   DOI: N/A
   Categories: cs.LG, stat.ML
   URL: http://arxiv.org/abs/1706.05137v1

2. DCoM: Active Learning for All Learners...
   Authors: Inbal Mishal, Daphna Weinshall
   Source: arxiv | Date: 2024-07-01
   Relevance: 0.22 | Quality: 0.53
   DOI: N/A
   Categories: cs.LG
   URL: http://arxiv.org/abs/2407.01804v2

3. Learning One Representation to Optimize All Rewards...
   Authors: Ahmed Touati, Yann Ollivier
   Source: arxiv | Date: 2021-03-14
   Relevance: 0.16 | Quality: 0.55
   DOI: N/A
   Categories: cs.LG, cs.AI, math.OC
   URL: http://arxiv.org/abs/2103.07945v3


🔍 Testing get_paper for ID: 1706.05137v1
✅ Successfully retrieved specific paper
One Model To Learn Them All...
   Authors: Lukasz 

## 2. BioRxiv API Testing

Testing the bioRxiv API with biology queries.

In [5]:
async def test_biorxiv_api():
    """Search bioRxiv for the biology test query and report what came back.

    Returns:
        bool: the value returned by ``test_api_results`` for the search.
    """
    search_term = TEST_QUERIES['biology']

    async with BioRxivAPI(server='biorxiv') as client:
        # 45 s rather than the helper's default 30 s: BioRxiv can be slower
        found = await test_api_with_timeout(
            lambda: client.search(search_term, max_results=5),
            timeout=45,
        )
        return test_api_results(found, "BioRxiv", search_term)

# Run the bioRxiv test via top-level await and keep its pass/fail flag at
# notebook level; the final summary and save cells read it by name.
biorxiv_success = await test_biorxiv_api()
print(f"\nBioRxiv API Test: {'✅ PASSED' if biorxiv_success else '❌ FAILED'}")

❌ Error: '>' not supported between instances of 'str' and 'int'

📚 BioRxiv API Results for: 'CRISPR gene editing'
❌ No papers found

BioRxiv API Test: ❌ FAILED


## 3. MedRxiv API Testing

Testing the medRxiv API with medical queries.

In [None]:
async def test_medrxiv_api():
    """Search medRxiv for the medicine test query and report what came back.

    medRxiv is reached through ``BioRxivAPI`` with ``server='medrxiv'``.

    Returns:
        bool: the value returned by ``test_api_results`` for the search.
    """
    search_term = TEST_QUERIES['medicine']

    async with BioRxivAPI(server='medrxiv') as client:

        def run_search():
            # Deferred so the helper can start the call under its own timeout
            return client.search(search_term, max_results=5)

        found = await test_api_with_timeout(run_search, timeout=45)
        outcome = test_api_results(found, "MedRxiv", search_term)

    return outcome

# Run the medRxiv test via top-level await and keep its pass/fail flag at
# notebook level; the final summary and save cells read it by name.
medrxiv_success = await test_medrxiv_api()
print(f"\nMedRxiv API Test: {'✅ PASSED' if medrxiv_success else '❌ FAILED'}")

## 4. ChemRxiv API Testing

Testing the ChemRxiv API with chemistry queries.

In [None]:
async def test_chemrxiv_api():
    """Search ChemRxiv for the chemistry test query, then re-fetch the first hit by ID.

    Returns:
        bool: the result of ``test_api_results`` for the search. The follow-up
        ``get_paper`` lookup is reported but does not affect the return value.
    """
    query = TEST_QUERIES['chemistry']

    async with ChemRxivAPI() as api:
        papers = await test_api_with_timeout(
            lambda: api.search(query, max_results=5),
            timeout=30
        )

        success = test_api_results(papers, "ChemRxiv", query)

        # Test getting a specific paper
        if papers:
            paper_id = papers[0].id
            print(f"\n🔍 Testing get_paper for ID: {paper_id}")
            # Same timeout/error guard as the search: a failing lookup is
            # reported here instead of aborting the cell. The helper returns
            # [] on failure, which is falsy and takes the "Failed" branch.
            specific_paper = await test_api_with_timeout(
                lambda: api.get_paper(paper_id),
                timeout=30
            )
            if specific_paper:
                print("✅ Successfully retrieved specific paper")
                print_paper_summary(specific_paper)
            else:
                print("❌ Failed to retrieve specific paper")

        return success

# Run the ChemRxiv test via top-level await and keep its pass/fail flag at
# notebook level; the final summary and save cells read it by name.
chemrxiv_success = await test_chemrxiv_api()
print(f"\nChemRxiv API Test: {'✅ PASSED' if chemrxiv_success else '❌ FAILED'}")

## 5. SSRN API Testing

Testing the SSRN API with economics queries.

In [None]:
async def test_ssrn_api():
    """Search SSRN for the economics test query and report what came back.

    Returns:
        bool: the value returned by ``test_api_results`` for the search.
    """
    search_term = TEST_QUERIES['economics']

    async with SSRNApi() as client:
        found = await test_api_with_timeout(
            lambda: client.search(search_term, max_results=5),
            timeout=30,
        )
        outcome = test_api_results(found, "SSRN", search_term)

    return outcome

# Run the SSRN test via top-level await and keep its pass/fail flag at
# notebook level; the final summary and save cells read it by name.
ssrn_success = await test_ssrn_api()
print(f"\nSSRN API Test: {'✅ PASSED' if ssrn_success else '❌ FAILED'}")

## 6. Research Square API Testing

Testing the Research Square API with general scientific queries.

In [None]:
async def test_researchsquare_api():
    """Search Research Square for the general test query, then re-fetch the first hit by ID.

    Returns:
        bool: the result of ``test_api_results`` for the search. The follow-up
        ``get_paper`` lookup is reported but does not affect the return value.
    """
    query = TEST_QUERIES['general']

    async with ResearchSquareAPI() as api:
        papers = await test_api_with_timeout(
            lambda: api.search(query, max_results=5),
            timeout=30
        )

        success = test_api_results(papers, "Research Square", query)

        # Test getting a specific paper
        if papers:
            paper_id = papers[0].id
            print(f"\n🔍 Testing get_paper for ID: {paper_id}")
            # Same timeout/error guard as the search: a failing lookup is
            # reported here instead of aborting the cell. The helper returns
            # [] on failure, which is falsy and takes the "Failed" branch.
            specific_paper = await test_api_with_timeout(
                lambda: api.get_paper(paper_id),
                timeout=30
            )
            if specific_paper:
                print("✅ Successfully retrieved specific paper")
                print_paper_summary(specific_paper)
            else:
                print("❌ Failed to retrieve specific paper")

        return success

# Run the Research Square test via top-level await and keep its pass/fail flag
# at notebook level; the final summary and save cells read it by name.
rs_success = await test_researchsquare_api()
print(f"\nResearch Square API Test: {'✅ PASSED' if rs_success else '❌ FAILED'}")

## 7. Preprint Aggregator Testing

Testing the main aggregator that combines results from multiple APIs.

In [None]:
async def test_aggregator():
    """Exercise ``PreprintAggregator``: field detection, then a combined search.

    Returns:
        bool: ``True`` if the aggregated search returned at least one paper,
        ``False`` otherwise. A timeout or error is reported by
        ``test_api_with_timeout`` and arrives here as an empty list.
    """
    aggregator = PreprintAggregator()

    # Test field detection. The TEST_QUERIES key is printed next to the
    # detected value so the two can be compared at a glance.
    print("🧠 Testing field detection:")
    for field, query in TEST_QUERIES.items():
        detected = aggregator.detect_field(query)
        print(f"   [{field}] '{query}' → {detected}")

    print("\n" + "="*60)

    # Test aggregated search
    query = "machine learning for drug discovery"
    print(f"🔍 Testing aggregated search for: '{query}'")

    papers = await test_api_with_timeout(
        lambda: aggregator.search_all(
            query=query,
            max_results_per_server=5
        ),
        timeout=60
    )

    if not papers:
        print("❌ Aggregated search failed")
        return False

    print(f"\n✅ Aggregated search found {len(papers)} unique papers")
    print("\nTop 5 ranked results:")
    print("-" * 50)

    for i, paper in enumerate(papers[:5], 1):
        ranking_score = paper.metadata.get('ranking_score', 0)
        # Append an ellipsis only when the title was actually shortened
        title = paper.title if len(paper.title) <= 70 else f"{paper.title[:70]}..."
        authors = ', '.join(paper.authors[:2]) + ('...' if len(paper.authors) > 2 else '')
        print(f"\n{i}. [{paper.source}] {title}")
        print(f"   Authors: {authors}")
        print(f"   Date: {paper.date_published.strftime('%Y-%m-%d')}")
        print(f"   Scores - Relevance: {paper.relevance_score:.2f}, Quality: {paper.quality_score:.2f}, Ranking: {ranking_score:.2f}")
        print(f"   URL: {paper.url}")

    return True

# Run the aggregator test via top-level await and keep its pass/fail flag at
# notebook level; the final summary and save cells read it by name.
aggregator_success = await test_aggregator()
print(f"\nAggregator Test: {'✅ PASSED' if aggregator_success else '❌ FAILED'}")

## 8. Performance and Reliability Testing

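Timing one five-result search against each of the arXiv, bioRxiv, and ChemRxiv APIs, and recording paper counts and average relevance and quality scores.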

In [None]:
async def test_performance():
    """Time one five-result search per API and print a summary table.

    Returns:
        dict: maps API name to a result dict. Successful entries hold
        ``success``, ``papers_found``, ``duration_seconds``, ``avg_relevance``
        and ``avg_quality``. Failed entries hold ``success``, ``error``,
        ``error_type`` and ``duration_seconds``.
    """
    print("⚡ Performance Testing")
    print("=" * 30)

    apis_to_test = [
        ('ArXiv', ArxivAPI(), 'quantum computing'),
        ('BioRxiv', BioRxivAPI(server='biorxiv'), 'gene therapy'),
        ('ChemRxiv', ChemRxivAPI(), 'catalysis')
    ]

    results = {}

    for api_name, api, query in apis_to_test:
        print(f"\n🔍 Testing {api_name}...")
        start_time = datetime.now()

        try:
            async with api:
                papers = await asyncio.wait_for(
                    api.search(query, max_results=5),
                    timeout=30
                )

                duration = (datetime.now() - start_time).total_seconds()
                paper_count = len(papers)

                results[api_name] = {
                    'success': True,
                    'papers_found': paper_count,
                    'duration_seconds': duration,
                    # Averages fall back to 0 when no papers were returned
                    'avg_relevance': sum(p.relevance_score for p in papers) / paper_count if papers else 0,
                    'avg_quality': sum(p.quality_score for p in papers) / paper_count if papers else 0
                }

                print(f"   ✅ Success: {paper_count} papers in {duration:.2f}s")

        except Exception as e:
            duration = (datetime.now() - start_time).total_seconds()
            error_type = type(e).__name__

            results[api_name] = {
                'success': False,
                'error': str(e),
                'error_type': error_type,
                'duration_seconds': duration
            }

            # A timeout raised by asyncio.wait_for has an empty message, so
            # always lead with the exception type.
            detail = f"{error_type}: {e}" if str(e) else error_type
            print(f"   ❌ Failed: {detail}")

    # Summary
    print("\n📊 Performance Summary:")
    print("-" * 40)

    for api_name, result in results.items():
        if result['success']:
            print(f"{api_name:15} | {result['papers_found']:2d} papers | {result['duration_seconds']:5.2f}s | Rel: {result['avg_relevance']:.2f} | Qual: {result['avg_quality']:.2f}")
        else:
            print(f"{api_name:15} | FAILED after {result['duration_seconds']:.2f}s ({result['error_type']})")

    return results

# Run performance test. perf_results is optional for the summary and save
# cells: both check for it in globals() before using it.
perf_results = await test_performance()

## 9. Edge Case Testing

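Testing the arXiv API with edge cases and unusual queries: empty, single-character, special-character, very long, non-English, numeric-only, and highly specific.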

In [None]:
async def test_edge_cases():
    """Send a set of unusual queries to arXiv and report how each one fares.

    Each query is searched with a 15-second limit. Any exception, including a
    timeout, is printed with its type and the loop moves on to the next query.
    Nothing is returned.
    """
    print("🧪 Edge Case Testing")
    print("=" * 25)

    long_query = 'machine learning artificial intelligence deep neural networks ' * 10
    edge_cases = {
        'empty_query': '',
        'single_char': 'a',
        'special_chars': '!@#$%^&*()',
        'very_long': long_query,
        'non_english': 'машинное обучение',
        'numbers_only': '12345',
        'very_specific': 'CRISPR-Cas9 mediated knockout of BRCA1 in HEK293T cells',
    }

    # Only the ArXiv client is exercised here
    async with ArxivAPI() as api:
        for label, text in edge_cases.items():
            # Show at most 50 characters of the query, marking any cut
            preview = text[:50] + ('...' if len(text) > 50 else '')
            print(f"\n🔍 Testing: {label}")
            print(f"   Query: '{preview}'")

            try:
                hits = await asyncio.wait_for(
                    api.search(text, max_results=3),
                    timeout=15,
                )
                print(f"   ✅ Success: {len(hits)} papers found")
            except Exception as err:
                print(f"   ❌ Error: {type(err).__name__}: {err}")

    print("\n✅ Edge case testing completed")

# Run edge case tests. The function only prints and has no return statement,
# so no result is kept.
await test_edge_cases()

## 10. Final Test Summary

Summary of all test results.

In [None]:
def print_test_summary():
    """Print the status of every API test plus an overall verdict.

    Reads the ``*_success`` flags that the earlier test cells assign at
    notebook level. A flag that does not exist (its cell was skipped, or
    raised before the assignment) is shown as "NOT RUN" and counted as not
    passed, instead of aborting the summary with a ``NameError``.
    ``perf_results`` is optional and handled the same way.
    """
    print("\n" + "="*60)
    print("📋 FINAL TEST SUMMARY")
    print("="*60)

    namespace = globals()

    # Individual API tests: display label -> notebook-level flag name
    flag_names = [
        ('ArXiv API', 'arxiv_success'),
        ('BioRxiv API', 'biorxiv_success'),
        ('MedRxiv API', 'medrxiv_success'),
        ('ChemRxiv API', 'chemrxiv_success'),
        ('SSRN API', 'ssrn_success'),
        ('Research Square API', 'rs_success'),
        ('Aggregator', 'aggregator_success')
    ]
    # None marks a flag that was never assigned
    api_tests = [(label, namespace.get(flag)) for label, flag in flag_names]

    passed = sum(1 for _, success in api_tests if success)
    total = len(api_tests)

    print(f"\n🧪 API Tests: {passed}/{total} passed")
    for test_name, success in api_tests:
        if success is None:
            status = "⚠️ NOT RUN"
        else:
            status = "✅ PASS" if success else "❌ FAIL"
        print(f"   {test_name:20} | {status}")

    # Performance summary
    perf = namespace.get('perf_results')
    if perf is not None:
        successful_apis = sum(1 for result in perf.values() if result['success'])
        print(f"\n⚡ Performance Tests: {successful_apis}/{len(perf)} APIs responded")

    # Overall assessment
    if passed >= total * 0.7:  # 70% pass rate
        print(f"\n🎉 OVERALL: GOOD - {passed}/{total} APIs working properly")
    elif passed >= total * 0.5:  # 50% pass rate
        print(f"\n⚠️  OVERALL: PARTIAL - {passed}/{total} APIs working, some issues detected")
    else:
        print(f"\n🚨 OVERALL: POOR - Only {passed}/{total} APIs working, significant issues")

    print("\n💡 Next Steps:")
    print("   - Fix any failing APIs")
    print("   - Monitor rate limits during heavy usage")
    print("   - Consider implementing caching for frequently accessed papers")
    print("   - Add retry logic for transient failures")

# Print final summary; it reads the *_success flags (and, if present,
# perf_results) assigned by the test cells above.
print_test_summary()

## 11. Save Test Results

Save test results to a JSON file for analysis.

In [None]:
def save_test_results(output_dir: str = "."):
    """Write the collected test outcomes to a timestamped JSON file.

    Args:
        output_dir: Directory to write into. Defaults to the current working
            directory, in which case the returned value is the bare filename.

    Returns:
        str: path of the file that was written.

    The ``*_success`` flags and ``perf_results`` are looked up in the notebook
    namespace. A flag that was never assigned (its cell was skipped or failed)
    is stored as ``null`` and counted as not passed instead of raising
    ``NameError``.
    """
    namespace = globals()

    # JSON key -> notebook-level flag name; the summary counts derive from this
    flag_names = {
        'arxiv': 'arxiv_success',
        'biorxiv': 'biorxiv_success',
        'medrxiv': 'medrxiv_success',
        'chemrxiv': 'chemrxiv_success',
        'ssrn': 'ssrn_success',
        'research_square': 'rs_success',
        'aggregator': 'aggregator_success'
    }
    api_tests = {key: namespace.get(flag) for key, flag in flag_names.items()}

    total_tests = len(api_tests)
    passed_tests = sum(1 for flag in api_tests.values() if flag)

    # One clock reading so the recorded timestamp and the filename agree
    now = datetime.now()

    test_results = {
        'test_timestamp': now.isoformat(),
        'api_tests': api_tests,
        'performance_results': namespace.get('perf_results', {}),
        'test_queries': TEST_QUERIES,
        'summary': {
            'total_tests': total_tests,
            'passed_tests': passed_tests,
            'success_rate': passed_tests / total_tests
        }
    }

    output_file = str(Path(output_dir) / f"api_test_results_{now.strftime('%Y%m%d_%H%M%S')}.json")

    with open(output_file, 'w', encoding='utf-8') as f:
        # default=str serialises any value json cannot encode natively
        json.dump(test_results, f, indent=2, default=str)

    print(f"\n💾 Test results saved to: {output_file}")
    return output_file

# Save results; the path returned by save_test_results() is echoed in the
# closing message.
results_file = save_test_results()
print(f"\n🏁 Testing complete! Results saved to {results_file}")

In [None]:
import urllib.request
import xmltodict

# Direct request to the arXiv query endpoint, bypassing the ArxivAPI wrapper,
# to inspect the structure of the XML response.
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1'

# The context manager closes the HTTP response once it has been read, and the
# timeout keeps an unresponsive server from hanging the kernel.
with urllib.request.urlopen(url, timeout=30) as data:
    parsed_data = xmltodict.parse(data.read())
print(parsed_data)


<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aelectron%26id_list%3D%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>
  <id>http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8</id>
  <updated>2025-08-22T00:00:00-04:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">228634</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/cond-mat/0102536v1</id>
    <updated>2001-02-28T20:12:09Z</updated>
    <published>2001-02-28T20:12:09Z</published>
    <title>Impact of Electron-Electron Cusp on Configur