In [1]:
import requests
import json
import os

In [2]:
def test_bern_api(text, url="http://bern2.korea.ac.kr/plain", timeout=30):
    """
    Send text to the BERN2 plain-text endpoint and return the parsed JSON reply.

    Args:
        text (str): The text to send to the BERN2 API.
        url (str): Endpoint that receives the POST request. Defaults to the
            public BERN2 server used throughout this notebook.
        timeout (float or tuple): Seconds to wait for the server, forwarded to
            ``requests.post``. Without it a stalled server would block the
            notebook cell indefinitely.

    Returns:
        dict or None: The decoded JSON body if the request succeeded and the
        body was valid JSON, None otherwise (the error is printed).
    """
    headers = {"Content-Type": "application/json"}
    payload = {"text": text}

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx replies into exceptions
        return response.json()
    except requests.exceptions.RequestException as e:
        # Connection problems, timeouts and bad status codes all land here.
        print(f"Error during API request: {e}")
    except ValueError as e:
        # Older requests releases raise a plain ValueError when the body is
        # not JSON (e.g. an HTML error page served with status 200).
        print(f"Error decoding API response as JSON: {e}")
    return None

In [3]:
def test_multiple_captions():
    """Run a handful of representative figure captions through BERN2.

    For each caption the function prints the caption itself followed by either
    the recognised entities (mention and type), a note that nothing was found,
    or a failure message when no usable response came back.
    """
    captions = (
        "Patients with HER2-positive breast cancer responded well to trastuzumab.",
        "Fig. 1: H&E staining revealed lung adenocarcinoma with KRAS mutation.",
        "The mice developed metastases in the liver after 4 weeks of treatment with doxorubicin.",
        "IL-6 and TNF-alpha levels were elevated in patients with severe COVID-19.",
        "Immunohistochemistry showed positive staining for p53 in 80% of the tumor cells.",
    )

    for number, caption in enumerate(captions, start=1):
        print(f"\n=== Test Case {number} ===")
        print(f"Caption: {caption}")
        response = test_bern_api(caption)

        # No response at all, or a response without the expected key.
        if not response or "annotations" not in response:
            print("API request failed")
            continue

        annotations = response["annotations"]
        if not annotations:
            print("No entities found")
            continue

        print(f"Found {len(annotations)} entities:")
        for annotation in annotations:
            print(f"  {annotation['mention']} ({annotation['obj']})")

In [4]:
def test_empty_and_edge_cases():
    """Probe BERN2 with unusual inputs and print a short status for each.

    Inputs cover an empty string, text without biomedical terms, a very long
    string, clinical shorthand, and text containing special characters.
    """
    inputs = [
        "",  # Empty string
        "No biomedical terms here.",  # No entities
        "A" * 1000,  # Very long input
        "HER2+ NSCLC pt w/ mets to liver tx w/ TKI",  # Medical abbreviations
        "Figure shows %-change in CD4+ T-cells after 3μg/kg dose",  # Special characters
    ]

    for position, text in enumerate(inputs, start=1):
        print(f"\n=== Edge Case {position} ===")

        # Show at most 50 characters so the long input does not flood the output.
        preview = text[:50]
        ellipsis = '...' if len(text) > 50 else ''
        print(f"Text: {preview}{ellipsis}")

        response = test_bern_api(text)
        if not response:
            print("Status: Request failed")
            continue

        has_annotations = 'annotations' in response
        print(f"Status: {'Success' if has_annotations else 'Failed'}")
        if has_annotations:
            print(f"Entities found: {len(response['annotations'])}")

In [5]:
if __name__ == "__main__":
    # Smoke test: annotate one caption, dump the raw reply, then list each entity.
    test_caption = "The tumor showed increased expression of the EGFR protein rna dna."
    results = test_bern_api(test_caption)

    if results and "annotations" in results:
        print("BERN2 API Response:")
        print(json.dumps(results, indent=4))

        if results["annotations"]:
            print("\nExtracted Entities:")
            for entity in results["annotations"]:
                print(f"  Mention: {entity['mention']}")
                print(f"  Type: {entity['obj']}")

                # The `id` field is a list that frequently holds exactly one
                # identifier (see the response printed above), so indexing
                # position 1 would silently skip it. Report every identifier.
                identifiers = entity.get('id') or []
                if isinstance(identifiers, str):
                    identifiers = [identifiers]
                if identifiers:
                    print(f"  Identifier: {', '.join(map(str, identifiers))}")

                print(f"  Span: {entity['span']['begin']}-{entity['span']['end']}")
                print("-" * 20)
        else:
            print("\nNo entities found in the caption.")
    else:
        print("Failed to get a valid response from the BERN2 API.")

BERN2 API Response:
{
    "annotations": [
        {
            "id": [
                "mesh:D009369"
            ],
            "is_neural_normalized": false,
            "mention": "tumor",
            "obj": "disease",
            "prob": 0.999995231628418,
            "span": {
                "begin": 4,
                "end": 9
            }
        },
        {
            "id": [
                "NCBIGene:100507500"
            ],
            "is_neural_normalized": true,
            "mention": "EGFR protein rna",
            "obj": "gene",
            "prob": 0.830147922039032,
            "span": {
                "begin": 45,
                "end": 61
            }
        },
        {
            "id": [
                "CUI-less"
            ],
            "is_neural_normalized": false,
            "mention": "rna dna",
            "obj": "DNA",
            "prob": 0.6136531829833984,
            "span": {
                "begin": 58,
                "end": 65
          

In [None]:
import requests
from typing import List, Optional
import os
import yaml
from src.utils.logger import get_logger
from src.config.config_loader import load_config
from src.models.FigureData import Entity
logger = get_logger()

class PubTatorClient:
    """Small client for the PubTator3 ``export/pubtator`` endpoint.

    It downloads the tab-separated PubTator text for one document and turns
    every annotation row into an ``Entity``.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Set the endpoint URL and pick up an optional NCBI API key.

        Args:
            api_key: NCBI API key. When omitted, the ``NCBI_API_KEY``
                environment variable is used. Credentials must not be written
                into the notebook; a key that was ever committed should be
                treated as leaked and rotated.
        """
        self.base_url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/pubtator"
        # NOTE(review): the key is only stored here; fetch_entities does not
        # attach it to the request.
        self.api_key = api_key or os.environ.get("NCBI_API_KEY")
        if self.api_key:
            logger.info("PubTator client initialized with NCBI API key")
        else:
            logger.warning("No NCBI API key found - rate limiting will be strict")

    def fetch_entities(self, pmid_or_pmcid: str) -> List[Entity]:
        """
        Fetch entities from PubTator for a given PMID or PMCID.

        Args:
            pmid_or_pmcid: Document identifier. A leading ``PMC`` is removed
                before the value is sent as the ``pmids`` query parameter.

        Returns:
            One ``Entity`` per annotation row that could be parsed. An empty
            list is returned when the request fails or the body is empty.
        """
        original_id = pmid_or_pmcid
        if pmid_or_pmcid.startswith("PMC"):
            # NOTE(review): the digits of a PMCID are sent as a PMID here.
            # Those are presumably different identifier spaces, so this may
            # return annotations for an unrelated article - confirm, and
            # convert PMCID -> PMID (e.g. via the NCBI ID Converter) first.
            pmid_or_pmcid = pmid_or_pmcid.replace("PMC", "")
            logger.info(f"Modified PMC ID to numeric format: {pmid_or_pmcid}")

        # PubTator endpoint expects comma-separated IDs
        url = f"{self.base_url}?pmids={pmid_or_pmcid}"
        logger.info(f"Fetching entities from PubTator URL: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"PubTator request failed: {e}")
            return []

        pubtator_text = response.text
        logger.info(f"PubTator response length: {len(pubtator_text)} characters")

        if not pubtator_text.strip():
            logger.warning(f"Empty response from PubTator for ID: {original_id}")
            return []

        entities = []
        for line in pubtator_text.strip().split("\n"):
            logger.debug(f"Processing line: {line}")

            # Title and abstract rows look like "<id>|t|..." / "<id>|a|...".
            # Only that exact shape is skipped: testing for a bare "|" would
            # also discard annotation rows whose mention or identifier column
            # happens to contain a pipe.
            header = line.split("|", 2)
            if len(header) == 3 and header[0].strip().isdigit() and header[1] in ("t", "a"):
                continue

            # Skip comment lines
            if line.startswith("#"):
                continue

            parts = line.split("\t")
            if len(parts) < 6:
                logger.debug(f"Skipping line with insufficient parts: {line}")
                continue  # skip malformed lines

            try:
                # Columns used: 1-2 = character offsets, 3 = mention, 4 = type.
                start, end = int(parts[1]), int(parts[2])
                mention = parts[3]
                entity_type = parts[4]

                logger.debug(f"Found entity: {mention} ({entity_type})")
                entities.append(Entity(text=mention, type=entity_type, start=start, end=end))
            except ValueError:
                logger.warning(f"Skipping line with invalid integers: {line}")
                continue
            except Exception as e:
                # Best effort: one bad row must not abort the whole document.
                logger.warning(f"Error processing entity line: {e}")
                continue

        logger.info(f"Retrieved {len(entities)} entities from PubTator")
        return entities

In [2]:
if __name__ == "__main__":
    # Demo run: fetch the annotations for one article and list them.
    sample_id = "PMC8719639"
    client = PubTatorClient()
    entities = client.fetch_entities(sample_id)

    # One line per entity: mention, type and character offsets.
    print(f"\nEntities found in {sample_id}:")
    for entity in entities:
        print(f"- {entity.text} [{entity.type}] at {entity.start}-{entity.end}")

[2025-05-20 09:09:53,974] INFO in 1939632837: PubTator client initialized with NCBI API key
[2025-05-20 09:09:53,976] INFO in 1939632837: Modified PMC ID to numeric format: 8719639
[2025-05-20 09:09:53,977] INFO in 1939632837: Fetching entities from PubTator URL: https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/pubtator?pmids=8719639
[2025-05-20 09:09:55,957] INFO in 1939632837: PubTator response length: 854 characters
[2025-05-20 09:09:55,958] INFO in 1939632837: Retrieved 4 entities from PubTator



Entities found in PMC8719639:
- cat [Species] at 108-111
- Fluoro-Gold [Chemical] at 173-184
- Evans Blue [Chemical] at 194-204
- EB [Chemical] at 206-208
