In [3]:
import sys
import os

sys.path.append(os.path.abspath(".."))

from final_code.pubmed_esearch import get_pmcids
from final_code.pubmed_esearch import create_df_pmcids
import pandas as pd
import requests
import time
import xml.etree.ElementTree as ET

In [9]:
search_terms = ["Avian influenza outbreak", "EHEC outbreak"]

df = create_df_pmcids(search_terms=search_terms, no_of_results=50)
df["PMCID"] = "PMC" + df["PMCID"].astype(str)
df.head()

Unnamed: 0,PMCID,search_term
0,PMC11934876,Avian influenza outbreak
1,PMC11934309,Avian influenza outbreak
2,PMC11934220,Avian influenza outbreak
3,PMC11932954,Avian influenza outbreak
4,PMC11932466,Avian influenza outbreak


In [23]:
def check_if_pmcid_is_available(pmcid):
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc", 
        "id": pmcid, 
        "retmode": "xml"
    }
    response = requests.get(search_url, params=params)
    time.sleep(0.5)
    
    if response.status_code == 200:
        try:
            xml_content = response.content.decode('utf-8').strip()
            
            if xml_content.startswith("<article"):
                return True
            
            root = ET.fromstring(xml_content)
            return root.find("article") is not None
        except ET.ParseError:
            return False
    return False

In [21]:
# available: PMC11719993
# not available: 

a = check_if_pmcid_is_available("PMC11719993")
print(a)

TypeError: startswith first arg must be bytes or a tuple of bytes, not str

In [24]:
df["has_result"] = df["PMCID"].apply(check_if_pmcid_is_available)

df

Unnamed: 0,PMCID,search_term,has_result
0,PMC11934876,Avian influenza outbreak,True
1,PMC11934309,Avian influenza outbreak,True
2,PMC11934220,Avian influenza outbreak,True
3,PMC11932954,Avian influenza outbreak,True
4,PMC11932466,Avian influenza outbreak,True
...,...,...,...
95,PMC11834125,EHEC outbreak,True
96,PMC11733599,EHEC outbreak,True
97,PMC11764226,EHEC outbreak,True
98,PMC11758462,EHEC outbreak,True


In [25]:
df.to_csv("check_for_results.csv")

In [15]:
def get_call_link(pmcid):
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc", 
        "id": "open access[filter]+" + pmcid, 
        "retmode": "xml"
    }
    response = requests.get(search_url, params=params)
    time.sleep(0.5)

    return response.url

# u = get_call_link("PMC11719993")
u = get_call_link("PMC11719993")
print(u)
   

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=open+access%5Bfilter%5D%2BPMC11719993&retmode=xml


In [None]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

def get_full_xml(pmcid):
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc", 
        "id": pmcid, 
        "retmode": "xml"
    }
    response = requests.get(search_url, params=params)
    time.sleep(0.5)

    if response.status_code == 200:
        return response.content
    else:
        return "error"
    

def extract_article_data(xml):
    soup = BeautifulSoup(xml, features="xml")  # Verwende den XML-Parser von lxml

    data = pd.DataFrame(columns=['pmid', 'title', 'abstract', 'full_text', 'authors'])

    pmid = soup.select_one('[pub-id-type="pmid"]').text.strip() if soup.select_one('[pub-id-type="pmid"]') else "N/A"
    title = soup.select_one("article-title").text.strip() if soup.select_one("article-title") else "N/A"
    abstract = soup.select_one("abstract").text.strip() if soup.select_one("abstract") else "N/A"

    full_text = "\n".join(
        sec.get_text(strip=True, separator=" ") for sec in soup.select("body sec")
    )
    authors = ", ".join(
        a.get_text(strip=True, separator=" ") for a in soup.select('[contrib-type="author"]')
    )

    data.append(
        {"pmid": pmid, "title": title, "abstract": abstract, "full_text": full_text, "authors": authors}
    )

    return data


df = extract_article_data(get_full_xml("PMC11719993"))
print(df)


NameError: name 'pd' is not defined