In [None]:
import requests
import xml.etree.ElementTree as ET

In [102]:
def fetch_pmc_full_paper(pmcid, timeout=30):
    """Download one article record from PubMed Central via NCBI EFetch.

    Args:
        pmcid: PMC identifier passed as the EFetch ``id`` parameter
            (the notebook uses values such as ``'PMC9693805'``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever on a stalled connection.

    Returns:
        The response body as a string. The rest of the notebook feeds it to
        ``ET.fromstring``, so it is requested as XML.

    Raises:
        Exception: if the server answers with any status other than 200.
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

    params = {
        'db': 'pmc',
        'id': pmcid,
        # The payload is parsed as XML downstream; 'xml' is the retmode EFetch
        # documents for full-text records of the pmc database.
        'retmode': 'xml',
    }

    response = requests.get(base_url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch full text for {pmcid}: {response.status_code}")
    return response.text

In [103]:
def get_full_text(element):
    """Return the concatenated text content of ``element`` and all descendants.

    Text is collected in document order: the element's own leading text, then
    for every child its full text followed by the child's tail. The tail of
    ``element`` itself is not included.
    """
    def _fragments(node):
        # Leading text that precedes the first child of this node.
        lead = node.text
        if lead:
            yield lead
        for sub in node:
            # Everything inside the child, then whatever follows its end tag.
            yield from _fragments(sub)
            trailing = sub.tail
            if trailing:
                yield trailing

    return ''.join(_fragments(element))

def extract_text_from_pmc_xml(xml_text):
    """Turn an article XML document into plain text with bold section headings.

    The first ``<body>`` element found anywhere in the document is walked
    recursively. Direct ``<p>`` children of the body or of a ``<sec>`` become
    paragraphs, and each ``<sec>`` contributes its ``<title>`` as a
    ``**heading**`` line. Other child elements are ignored.

    Args:
        xml_text: XML document as a string.

    Returns:
        The extracted text with blocks separated by blank lines, or ``""``
        (after printing a notice) when the document has no ``<body>``.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_text`` is not well-formed.
    """
    root = ET.fromstring(xml_text)
    body = root.find('.//body')
    if body is None:
        print("No body found in XML.")
        return ""

    def process_section(section, level=1):
        """Render one container; ``level`` 1 means a top-level section."""
        output = []

        title_elem = section.find('title')
        if title_elem is not None:
            title_text = get_full_text(title_elem).strip()
            if title_text:
                # Only top-level section headings are upper-cased; deeper
                # levels keep the original casing.
                heading = title_text.upper() if level == 1 else title_text
                output.append(f"\n**{heading}**")

        for elem in section:
            if elem.tag == 'p':
                para_text = get_full_text(elem).strip()
                if para_text:
                    output.append(para_text)
            elif elem.tag == 'sec':
                nested = process_section(elem, level + 1)
                # Skip sections that produced nothing so the join below does
                # not emit runs of empty lines.
                if nested:
                    output.append(nested)

        return "\n\n".join(output)

    # <body> is only the container: start it at level 0 so that its direct
    # <sec> children are level 1 and actually receive the level-1 formatting.
    return process_section(body, level=0)

In [115]:
# Demo: fetch a single article by a hard-coded PMC ID and print its extracted
# body text. The names bound here live at notebook scope.
pmcid = 'PMC9693805'
xml_text = fetch_pmc_full_paper(pmcid)
# Debugging aid, left disabled: dumps the raw response before extraction.
# print(xml_text)
full_text = extract_text_from_pmc_xml(xml_text)
print(full_text)

Effective growth-promoting treatment with pituitary-derived human growth hormone (hGH) was first reported more than 60 years ago (1). In 1985, Creutzfeldt-Jakob disease was linked to the use of pituitary-derived hGH leading to its discontinuation, which, shortly thereafter, was followed by the first full regulatory approval worldwide of recombinant hGH (rhGH) by the US Food and Drug Administration and by the European Medicines Agency in the same year (2). Initially approved only for pediatric GH deficiency (GHD), rhGH treatment currently is approved, depending on the country/region, for 8 indications in children (GHD, Prader-Willi syndrome [PWS], children born small for gestational age [SGA], Turner syndrome [TS], Noonan syndrome, idiopathic short stature [ISS], chronic renal failure [CRF], and short stature homeobox-containing gene deficiency [SHOX-D]); and 3 in adults (GHD, short bowel syndrome, and HIV wasting syndrome). Various registries/postmarketing surveillance databases were d

In [112]:
def fetch_pmc_papers(query: str, top_k: int = 5, timeout: float = 30.0):
    """Search PubMed and collect PMC IDs of the first matching papers that have one.

    PubMed is searched page by page (50 IDs per ESearch call). Each page is
    resolved through ESummary in chunks of 10 IDs, and every record that
    carries a ``pmc`` article ID is collected until ``top_k`` IDs are found or
    the search returns no more IDs. A one-line summary is printed at the end.

    Args:
        query: Search term sent to ESearch as ``term``.
        top_k: Maximum number of PMC IDs to return.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` waits forever on a stalled connection.

    Returns:
        A list of PMC ID strings, at most ``top_k`` long (possibly shorter or
        empty when the search runs out of results).

    Raises:
        Exception: if ESearch or ESummary answers with a non-200 status.
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    esummary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

    batch_size = 50      # PubMed IDs requested per ESearch page
    summary_chunk = 10   # PubMed IDs sent per ESummary request

    pmc_ids = []
    total_checked = 0
    retstart = 0

    while len(pmc_ids) < top_k:
        search_root = _request_eutils_xml(
            search_url,
            {
                'db': 'pubmed',
                'term': query,
                'retstart': retstart,
                'retmax': batch_size,
                'retmode': 'xml',
            },
            "Failed to search PubMed",
            timeout,
        )
        # Ignore <Id> elements without text; a None here would break the join below.
        id_list = [id_elem.text for id_elem in search_root.findall(".//Id") if id_elem.text]

        if not id_list:
            print("No more papers to search.")
            break

        # ESummary maps each PubMed ID to its article IDs, including the PMC one.
        for start in range(0, len(id_list), summary_chunk):
            summary_root = _request_eutils_xml(
                esummary_url,
                {
                    'db': 'pubmed',
                    'id': ','.join(id_list[start:start + summary_chunk]),
                    'retmode': 'xml',
                },
                "Failed to fetch esummary",
                timeout,
            )

            for docsum in summary_root.findall('.//DocSum'):
                total_checked += 1
                pmc_id = _extract_pmc_id(docsum)
                if pmc_id:
                    pmc_ids.append(pmc_id)
                if len(pmc_ids) >= top_k:
                    break
            if len(pmc_ids) >= top_k:
                break

        retstart += batch_size

    print(f"Searched {total_checked} papers. Found {len(pmc_ids)} papers with PMC IDs.")
    return pmc_ids


def _request_eutils_xml(url, params, failure_message, timeout):
    """GET ``url`` and return the parsed XML root; raise Exception on a non-200 status."""
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"{failure_message}: {response.status_code}")
    return ET.fromstring(response.text)


def _extract_pmc_id(docsum):
    """Return the text of the ``pmc`` entry under a DocSum's ``ArticleIds`` item, or None."""
    article_ids = docsum.find('.//Item[@Name="ArticleIds"]')
    if article_ids is None:
        return None
    for item in article_ids.findall('Item'):
        if item.attrib.get('Name') == 'pmc':
            return item.text
    return None

In [120]:
# End-to-end run: search PubMed for "Pfizer", then download and print the
# extracted body text of each result that has a PMC ID (at most 5).
papers = fetch_pmc_papers("Pfizer", top_k=5)
for paper in papers:
    print(f"PMC ID: {paper}")
    # NOTE(review): pmcid, xml_text and full_text are notebook-level names that
    # the single-article demo cell also assigns; this loop rebinds them.
    pmcid = paper
    # No try/except here: an exception raised by the fetch (e.g. a non-200
    # response) stops the loop at that paper.
    xml_text = fetch_pmc_full_paper(pmcid)
    full_text = extract_text_from_pmc_xml(xml_text)
    print(full_text)
    # Visual separator between papers in the printed output.
    print("-------------------------------------------------")