In [2]:
import pandas as pd
import importlib
import functions.pubmed_api
importlib.reload(functions.pubmed_api)
from functions.pubmed_api import get_pmcids, translate_pmid_to_pmcid, get_full_xml, extract_article_data

In [3]:
# PubMed query: articles indexed under both MeSH terms.
search_term = '(avian influenza[MeSH Terms]) AND (disease outbreak[MeSH Terms])'

# Second argument to get_pmcids; presumably the maximum number of hits to
# return -- TODO confirm against functions.pubmed_api.
RESULT_LIMIT = 5

# NOTE: despite the helper's name, the result is treated as a list of PubMed
# IDs (PMIDs) here; they are translated to PMC IDs in a later cell.
pubmed_ids = get_pmcids(search_term, RESULT_LIMIT)
pubmed_ids

['39535188', '28915920', '39053575', '18455796', '16566867']

In [4]:
# Translate every PubMed ID into its PMC counterpart, preserving order so that
# pmc_ids[i] corresponds to pubmed_ids[i].
pmc_ids = [translate_pmid_to_pmcid(pubmed_id) for pubmed_id in pubmed_ids]

pmc_ids

['PMC11633217', 'PMC5603032', 'PMC11485258', 'PMC7133821', 'PMC7089356']

In [5]:
# Columns every article record is expected to expose, in display order.
article_columns = ['pmid', 'pmcid', 'title', 'abstract', 'full_text', 'authors']

# Fetch and parse each article once, tagging its rows with the PMC ID they
# were retrieved with. Tagging per frame (instead of assigning the whole
# pmc_ids list after concatenation) keeps IDs aligned with their rows even if
# a parse yields zero or several rows.
article_frames = [
    extract_article_data(get_full_xml(pmc_id)).assign(pmcid=pmc_id)
    for pmc_id in pmc_ids
]

if article_frames:
    # Single concat instead of growing a DataFrame inside the loop: avoids
    # quadratic copying and the pandas FutureWarning raised when
    # concatenating onto an empty frame.
    df_articles_raw = pd.concat(article_frames, ignore_index=True)
    # Expected columns first (created as NaN if a parse did not supply them),
    # followed by any additional columns the parser returned.
    extra_columns = [col for col in df_articles_raw.columns if col not in article_columns]
    df_articles_raw = df_articles_raw.reindex(columns=article_columns + extra_columns)
else:
    # No IDs to fetch: keep the downstream schema intact with an empty frame.
    df_articles_raw = pd.DataFrame(columns=article_columns)

# Drop articles that contain neither an abstract nor a full text
# (a row is removed only when BOTH fields are missing). The index is left
# un-reset on purpose so the gaps show which articles were dropped.
df_full_texts = df_articles_raw.dropna(subset=['abstract', 'full_text'], how='all')
df_full_texts

Unnamed: 0,pmid,pmcid,title,abstract,full_text,authors
0,39535188,PMC11633217,Avian influenza A (H5N1) virus in dairy cattle...,ABSTRACTSince the emergence of highly pathogen...,INTRODUCTION Influenza viruses are classified ...,https://orcid.org/0000-0002-2878-5714 Mostafa ...
1,28915920,PMC5603032,Current situation of H9N2 subtype avian influe...,"In China, H9N2 subtype avian influenza outbrea...",Introduction Avian influenza (AI) is initially...,"Gu Min gumin@yzu.edu.cn 1 2 3, Xu Lijun xuliju..."
2,39053575,PMC11485258,Spillover of highly pathogenic avian influenza...,The highly pathogenic avian influenza (HPAI) H...,Main The HPAI virus H5Nx goose/Guangdong linea...,http://orcid.org/0000-0003-1643-8560 Caserta L...
4,16566867,PMC7089356,Avian influenza,The current epidemic of H5N1 highly pathogenic...,,"Zeitlin Gary A. MD, Maslow Melanie J. MD, FACP..."


In [6]:
# Persist the collected articles as a tab-separated file, overwriting any
# previous export. The DataFrame index is written as the first, unnamed column
# (pandas default).
df_full_texts.to_csv('full_text.tsv', sep='\t')