In [60]:
import requests
import pandas as pd
from collections import Counter
from typing import List, Tuple, Dict, Any
from tqdm import tqdm

# ---------- 1. Fetch articles from Europe PMC ----------


In [67]:
def fetch_epmc_articles(query: str,
                        from_year: int = 2024,
                        to_year: int = 2025,
                        max_results: int = 2000) -> pd.DataFrame:
    
    """Fetches articles from Europe PMC.
    https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API/search_articles_get

    Parameters
    ----------
    query : str
        Search query
    from_year : int
        From year, default 2024
    to_year : int
        To year, default 2025
    max_results : int
        Maximum number of results to fetch, default 2000

    Returns
    -------
    pd.DataFrame
        DataFrame of articles with columns:
        ['id', 'source', 'pmid', 'pmcid', 'doi', 'title', 'abstract', 'pubYear', 'primary_url']
    """

    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" # RestFul API endpoint
    all_rows = []                                                   # Accumulated results
    page_size = 1000                                                # Max allowed per page. WebService limits extraction to 1000 per request. Do not increase beyond 1000.
    page = 0                                                        # To store page number  
    

    pbar = tqdm(total=max_results, desc="Fetching articles", unit="articles")

    while len(all_rows) < max_results:                              # Loop until we reach max_results, default 2000. Increasing this may lead to timeouts.  
        params = {
            "query": f"{query} AND PUB_YEAR:[{from_year} TO {to_year}]",
            "format": "json",
            "pageSize": page_size,
            "page": page,
            "resultType": "core",                                   # core: returns full metadata for a given publication ID; including abstract, full text links, and MeSH terms
        }
        
        response = requests.get(url, params=params, timeout=30)
        if not response.ok:
            print(f"Request failed on page {page}: {response.status_code}")
            break
            
        results = response.json()                                  # Parse JSON response
        #print(type(results))
        articles = results.get("resultList", {}).get("result", []) # Extract articles
        #print(f"Articles type: {type(articles)}")
        #print(f"Number of articles fetched: {len(articles)}")
        #print(articles[0])  # Print the first article to inspect its structure
        
        if not articles:  # No more results
            break
            
        # Process current page
        for art in articles:
            if len(all_rows) >= max_results:
                break

            primary_url = ""
            
            # 1) First fullTextUrl if available (highest priority)
            ft_list = art.get("fullTextUrlList", {})
            if ft_list and ft_list.get("fullTextUrl"):
                first_ft_url = ft_list["fullTextUrl"][0].get("url")
                if first_ft_url:
                    primary_url = first_ft_url
            
            # 2) Fallback to canonical links if no full text
            if not primary_url:
                pmcid = (art.get("pmcid") or "").strip()
                pmid = (art.get("pmid") or "").strip()
                doi = (art.get("doi") or "").strip()
                
                if pmcid:
                    core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
                    primary_url = f"https://europepmc.org/article/PMC/{core}"
                elif pmid:
                    primary_url = f"https://europepmc.org/abstract/MED/{pmid}"
                elif doi:
                    primary_url = f"https://doi.org/{doi}"
            # -------------------------------------

            all_rows.append({
                "id": art.get("id", ""),
                "source": art.get("source", ""),
                "pmid": art.get("pmid", ""),
                "pmcid": art.get("pmcid", ""),
                "doi": art.get("doi", ""),
                "title": art.get("title", ""),
                "abstract": art.get("abstractText", art.get("abstract", "")),
                "pubYear": art.get("pubYear", ""),
                "primary_url": primary_url,  
            })
        
        page += 1
        pbar.update(len(articles))
        pbar.set_postfix({"page": page, "total": len(all_rows)})
    
    pbar.close()  # Clean up progress bar
    
    df = pd.DataFrame(all_rows[:max_results])  # Trim to requested max
    return df

df_articles = fetch_epmc_articles("obesity targets", 2023, 2025,50)

Fetching articles: 1000articles [00:09, 106.88articles/s, page=1, total=50]


In [74]:
def fetch_epmc_articles(query: str,
                        from_year: int = 2024,
                        to_year: int = 2025,
                        max_results: int = 2000) -> pd.DataFrame:
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    all_rows = []
    page_size = 1000  # Max allowed per page
    page = 0
    
    while len(all_rows) < max_results:
        params = {
            "query": f"{query} AND PUB_YEAR:[{from_year} TO {to_year}]",
            "format": "json",
            "pageSize": page_size,
            "page": page,
            "resultType": "core",
        }
        
        response = requests.get(url, params=params, timeout=30)
        if not response.ok:
            print(f"Request failed on page {page}: {response.status_code}")
            break
            
        results = response.json()
        articles = results.get("resultList", {}).get("result", [])
        
        if not articles:  # No more results
            break
            
        # Process current page
        for art in articles:
            if len(all_rows) >= max_results:
                break

            urls = []

            # 1) direct fullTextUrlList if present
            ft_list = art.get("fullTextUrlList", {})
            if ft_list:
                for u in ft_list.get("fullTextUrl", []):
                    url_val = u.get("url")
                    if url_val:
                        urls.append(url_val)

            # 2) canonical Europe PMC / DOI links from IDs
            pmcid = (art.get("pmcid") or "").strip()
            pmid  = (art.get("pmid") or "").strip()
            doi   = (art.get("doi") or "").strip()

            if pmcid:
                core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
                urls.append(f"https://europepmc.org/article/PMC/{core}")
            if pmid:
                urls.append(f"https://europepmc.org/abstract/MED/{pmid}")
            if doi:
                urls.append(f"https://doi.org/{doi}")

            # de‑duplicate while preserving order
            seen = set()
            urls = [u for u in urls if not (u in seen or seen.add(u))]
            # -------------------------------------

            all_rows.append({
                "id": art.get("id", ""),
                "source": art.get("source", ""),
                "pmid": art.get("pmid", ""),
                "pmcid": art.get("pmcid", ""),
                "doi": art.get("doi", ""),
                "title": art.get("title", ""),
                "abstract": art.get("abstractText", art.get("abstract", "")),
                "pubYear": art.get("pubYear", ""),
                "urls": urls,          # <- NEW COLUMN
                "primary_url": urls[0] if urls else "",  # convenient single link
            })
        
        page += 1
        print(f"Fetched page {page} ({len(articles)} articles, total: {len(all_rows)})")
    
    df = pd.DataFrame(all_rows[:max_results])  # Trim to requested max
    return df

df_articles = fetch_epmc_articles("obesity targets", 2023, 2025,50)

Fetched page 1 (1000 articles, total: 50)


In [86]:
ANN_URL = "https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds"


def get_gene_annotations_for_articles(article_ids: List[str],
                                      chunk_size: int = 8) -> Dict[str, List[Dict[str, Any]]]:
    """
    Call annotationsByArticleIds in small chunks to avoid 414 and API limits. [web:74]
    Returns mapping articleId -> list of gene/protein annotations.
    """
    print("Fetching gene annotations for articles...")
    out: Dict[str, List[Dict[str, Any]]] = {}

    # Convert range to list for tqdm
    chunks = list(range(0, len(article_ids), chunk_size))
    for start in tqdm(chunks, desc="Processing article ID chunks"):
        chunk = article_ids[start:start + chunk_size]
        params = {
            "articleIds": ",".join(chunk),
            "type": "Gene_Proteins",
            "section": "Abstract",
            "provider": "Europe PMC",   
            "format": "JSON",
        }
   #     print(f"Annotations API Request params: {params}")
        r = requests.get(ANN_URL, params=params, timeout=60)
        if not r.ok:
            tqdm.write(f"Annotations API error {r.status_code} for chunk starting at {start}: {r.url}")
            continue
   #     print(f"Annotations API Request text: {r.text}")

        data = r.json()  # list of {"articleId": "...", "annotations": [...]}
        if isinstance(data, dict):
            data = data.get("annotationsByArticle", [])

        for entry in data:
            source = entry.get("source")
            ext_id = entry.get("extId")
            if source and ext_id:
                 aid = f"{source}:{ext_id}"
            else:
                aid = ext_id or source  # fallback if one is missing
            anns = entry.get("annotations", [])
            gene_anns = [a for a in anns if a.get("type", "").lower().startswith("gene_proteins")]
            out[aid] = gene_anns
            print(f"Article ID: {aid}, Gene Annotations: {(gene_anns)}")
    #print(f"Fetched gene annotations for {len(out)} articles.")
    print(f"Sample articleId and annotations: {list(out.items())}")
    return out


In [87]:
def build_article_id_token(row: pd.Series) -> str:
    """
    Build a Europe PMC annotations API ID of the form 'SOURCE:ext_id'. [web:88][web:124]

    Priority:
      1) MED:PMID   for PubMed records
      2) PMC:PMCID  for full-text PubMed Central (remove leading 'PMC' if present)
      3) source:id  as a generic fallback (e.g. PPR:xxxx, AGR:xxxx).
    """
    pmid = (row.get("pmid") or "").strip()
    pmcid = (row.get("pmcid") or "").strip()
    source = (row.get("source") or "").strip()
    eid = (row.get("id") or "").strip()

    # PubMed
    if pmid:
        return f"MED:{pmid}"

    # PubMed Central (pmcid often like 'PMC1234567')
    if pmcid:
        core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
        return f"PMC:{core}"

    # Other sources (preprints, Agricola, etc.) [web:70][web:59]
    if source and eid:
        return f"{source}:{eid}"

    return ""


In [113]:
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any, Set
from urllib.parse import urlparse

def _extract_uniprot_accession(uri: str) -> str:
    path = urlparse(uri).path.strip("/")
    parts = path.split("/")
    return parts[1] if len(parts) > 1 else parts[0]

def build_top_targets_from_epmc(df_articles: pd.DataFrame,
                                top_k: int = 100
                                ) -> Tuple[List[Tuple[str, int]], Dict[str, Any]]:
    """
    Function to transform the article and annotation data from Europe PMC into a list of top targets 
    based on frequency of mentions across articles and number of articles mentioning each target. 
        
    Parameters
    ----------
    df_articles : pd.DataFrame
        DataFrame of articles with columns: ['id', 'source', 'pmid', 'pmcid', 'doi', 'title', 'abstract', 'pubYear', 'primary_url']
    top_k : int
        Number of top targets to return based on frequency. Default is 100.
    Returns
    -------
    Tuple[List[Tuple[str, int]], Dict[str, Any]]
        - top_targets : List of top_k targets as (key, frequency) tuples sorted by frequency descending.
        - target_info : Dictionary mapping target key to metadata including name, accession, uniprot_url, frequency, n_articles, and list of articleIdTokens.

    """
    print("Building articleIdTokens...")
    df = df_articles.copy()                                                               # Copy to avoid modifying original DataFrame
    df["articleIdToken"] = df.apply(build_article_id_token, axis=1)                       # Build articleIdTokens to send to Annotations API
    df = df[df["articleIdToken"] != ""]                                                   # Filter out rows with empty articleIdTokens
    print(f"Filtered to {len(df)} articles with valid articleIdTokens from original {len(df_articles)}.")
    tokens = df["articleIdToken"].tolist()                                                # Extract list of articleIdTokens  
    print("Fetching gene/protein annotations from Europe PMC...")
    freq = Counter()                                                                      # Counter to track target frequencies
    ann_map = get_gene_annotations_for_articles(tokens, chunk_size=8)                     # Fetch gene/protein annotations for articles   
    if ann_map:
        n_anns = sum(len(anns) for anns in ann_map.values())
        print(f"Total gene/protein annotations fetched: {n_anns}")
    else:
        print("No annotations returned for any article.")

    # Extract gene/protein targets and calculate frequency
    print("First pass: counting target frequencies...") 
    for aid, anns in tqdm(ann_map.items(),desc="Counting annotations"):                  # Loop through each article
        for ann in anns:                                                                 # Loop through each annotation
            tags = ann.get("tags") or []                                                 # Extract tags 
            if not tags:
                continue
            for tag in tags:                                                             # Loop through each gene / protein tag 
                name = (tag.get("name") or "").strip()                                   # Extract name if available
                uri = (tag.get("uri") or "").strip()                                     # Extract URI if available
                if not uri and not name:
                    continue
                acc = _extract_uniprot_accession(uri) if uri else ""                     # Extract Uniprot accession if URI available 
                key = acc.lower() if acc else name.lower()                               # Use accession as key if available, else name (case insensitive)
                freq[key] += 1                                                           # Increment frequency counter for this target  

    top_targets = freq.most_common(top_k)                                                # Get top_k targets by frequency
    
    # Build set of keys of top k targets for quick lookup
    top_keys: Set[str] = {k for k, _ in top_targets}

    # Build metadata for top targets
    print("Second pass: building target metadata...")
    target_info: Dict[str, Dict[str, Any]] = {}                                          # Mapping from target key to metadata  
    for aid, anns in tqdm(ann_map.items(),desc="Building target metadata"):
        for ann in anns:
            tags = ann.get("tags") or []                                                 # Extract tags
            if not tags:                                                                 # Skip if no tags  
                continue
            for tag in tags:                                                            # Loop through each gene / protein tag
                name = (tag.get("name") or "").strip()                                  # Extract name if available
                uri = (tag.get("uri") or "").strip()                                    # Extract URI if available
                if not uri and not name:
                    continue
                acc = _extract_uniprot_accession(uri) if uri else ""                    # Extract Uniprot accession if URI available
                key = acc.lower() if acc else name.lower()                              # Use accession as key if available, else name (case insensitive)   

                if key not in top_keys:                                                # Skip if not in top k targets (to limit computation and memory)
                    continue

                if key not in target_info:                                             # Initialize metadata for this target if not already present
                    target_info[key] = {
                        "key": key,
                        "frequency": 0,
                        "articles": {}  # {aid: primary_url}
                        "name": name,
                        "accession": acc,
                        "uniprot_url": uri
                }

                articles = target_info[key]["articles"]                                 # Get existing articles dict already stored for this target 
                if aid not in articles:                                                 # Initialize list for this article if not already present
                    primary_url = id_to_primary.get(aid, "")                            # Lookup once per article
                    articles[aid] = primary_url                                         # Store primary_url for this articleIdToken

                target_info[key]["frequency"] += 1                                      # Increment frequency for this target

    # 4) Convert article dicts to sorted lists for serialization
    for key, info in target_info.items():
        info["n_articles"] = len(info["articles"])
        info["article_links"] = info["articles"]

    return top_targets, target_info                                                                # Extract first tag    
            
                

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2127031668.py, line 89)

In [114]:
# Build mapping from articleIdToken -> URLs / primary_url
df_with_tokens = df_articles.copy()
df_with_tokens["articleIdToken"] = df_with_tokens.apply(build_article_id_token, axis=1)

id_to_primary = (
    df_with_tokens
    .set_index("articleIdToken")["primary_url"]
    .to_dict()
)


In [115]:
# Example usage:

print(f"Fetched {len(df_articles)} articles from Europe PMC.")
print(df_articles.head())
top_targets, target_info = build_top_targets_from_epmc(df_articles, top_k=500)

rows = []
for key, count in top_targets:
    info = target_info.get(key, {})
    article_tokens = info.get("articles", [])

      # Extract representative metadata from FIRST article's FIRST tag
    first_article_tags = list(article_tokens.values())[0] if article_tokens else []
    first_tag = first_article_tags[0] if first_article_tags else {}
    
    article_links = []
    for aid in article_tokens.keys():  # Use .keys() for article IDs
        primary = id_to_primary.get(aid, "")
        if primary:
            article_links.append({
                "articleIdToken": aid,
                "primary_url": primary,
            })
    
    rows.append({
        "name": first_tag.get("name", key),           # ✅ From first tag
        "accession": first_tag.get("accession"),      # ✅ From first tag
        "uniprot_url": first_tag.get("uri"),          # ✅ Use "uri" not "uniprot_url"
        "frequency": info.get("frequency", count),
        "n_articles": info.get("n_articles", 0),
        "articles": article_tokens,
        "article_links": article_links,
    })


Fetched 50 articles from Europe PMC.
         id source      pmid pmcid                           doi  \
0  41366037    MED  41366037          10.1038/s41598-025-31533-w   
1  41379482    MED  41379482                   10.2337/dc25-1093   
2  41398384    MED  41398384          10.1038/s41366-025-01979-z   
3  41387344    MED  41387344                   10.1111/obr.70059   
4  41396202    MED  41396202        10.1097/cm9.0000000000003919   

                                               title  \
0  Network pharmacology identifies AKR1C3 and KYN...   
1  Midchildhood Obesity After Exposure to Gestati...   
2  Genome-wide meta-analysis with 2,206,440 indiv...   
3  Circulating Proteins Link Obesity With Cardiac...   
4  Obesity and emerging intervention strategies: ...   

                                            abstract pubYear  \
0  This study aimed to identify and validate key ...    2025   
1  <h4>Objective</h4>To assess how treatment type...    2025   
2  <h4>Background</h4>The

Processing article ID chunks:   0%|          | 0/7 [00:00<?, ?it/s]

Processing article ID chunks:  14%|█▍        | 1/7 [00:01<00:06,  1.04s/it]

Article ID: MED:41366037, Gene Annotations: [{'prefix': 'The genes', 'exact': 'AKR1C3', 'postfix': 'and KYNU were', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-e7624b3b574e9c52e73c2c5f6ea68537', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'genes AKR1C3 and', 'exact': 'KYNU', 'postfix': 'were identified as', 'tags': [{'name': 'KYNU', 'uri': 'https://www.uniprot.org/uniprotkb/Q16719/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-f40a7f4b564c801a6e5c22e8242d5725', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'elevated expression of', 'exact': 'AKR1C3', 'postfix': 'and KYNU in', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#eur

Processing article ID chunks:  29%|██▊       | 2/7 [00:02<00:06,  1.28s/it]

Article ID: MED:41032481, Gene Annotations: [{'prefix': 'also assessed.<h4>Results</h4>', 'exact': 'Phosphoinositide-3-kinase regulatory subunit 1', 'postfix': 'regulatory subunit 1', 'tags': [{'name': 'DDB_G0283385', 'uri': 'https://www.uniprot.org/uniprotkb/P18160/entry'}], 'id': 'http://europepmc.org/article/MED/41032481#europepmc-8e82f6cfb0b9c39a4a6aa062663eea22', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'regulatory subunit 1 (', 'exact': 'PIK3R1', 'postfix': '), a key mediator', 'tags': [{'name': 'PIK3R1', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R685/entry'}], 'id': 'http://europepmc.org/article/MED/41032481#europepmc-ce99230990fadedb80630ddd8ecba2c2', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'mediator in the', 'exact': 'phosphoinositide 3-kinase', 'postfix': '3-kinase (PI3K)/ Protein', 'tags': [{'name': 'SPI3K-1', 'u

Processing article ID chunks:  43%|████▎     | 3/7 [00:03<00:04,  1.11s/it]

Article ID: MED:41344603, Gene Annotations: [{'prefix': 'the presence of', 'exact': 'GLP-1 receptor', 'postfix': 'receptor agonists with', 'tags': [{'name': 'Glpr', 'uri': 'https://www.uniprot.org/uniprotkb/P32301/entry'}], 'id': 'http://europepmc.org/article/MED/41344603#europepmc-df4ffefb62b7e99b617cd3f9531a7eb3', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': '', 'exact': 'Amylin', 'postfix': ', a neuroendocrine hormone', 'tags': [{'name': 'IAPP', 'uri': 'https://www.uniprot.org/uniprotkb/Q28605/entry'}], 'id': 'http://europepmc.org/article/MED/41344603#europepmc-244bf19d318b9f0b082d3b6bffa83044', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'hormone co-released with', 'exact': 'insulin', 'postfix': ', controls hunger, gastric', 'tags': [{'name': 'LEG', 'uri': 'https://www.uniprot.org/uniprotkb/Q9FRT8/entry'}], 'id': 'http://europepm

Processing article ID chunks:  57%|█████▋    | 4/7 [00:04<00:03,  1.10s/it]

Article ID: MED:41228448, Gene Annotations: [{'prefix': 'lipid metabolism, and', 'exact': 'insulin', 'postfix': 'signaling.', 'tags': [{'name': 'LEG', 'uri': 'https://www.uniprot.org/uniprotkb/Q9FRT8/entry'}], 'id': 'http://europepmc.org/article/MED/41228448#europepmc-5d79181871f85068c82eaf467ea0349a', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}]
Article ID: MED:40884426, Gene Annotations: [{'prefix': 'Network analysis highlighted', 'exact': 'JUN', 'postfix': ', TOP2A, APOE, and', 'tags': [{'name': 'Transcription factor AP-1 subunit Jun', 'uri': 'https://www.uniprot.org/uniprotkb/P54864/entry'}], 'id': 'http://europepmc.org/article/MED/40884426#europepmc-78492fed7f8478cfa717d8197bc61376', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'analysis highlighted JUN,', 'exact': 'TOP2A', 'postfix': ', APOE, and LEP', 'tags': [{'name': 'TOP2A', 'uri': 'ht

Processing article ID chunks:  71%|███████▏  | 5/7 [00:05<00:02,  1.02s/it]

Article ID: MED:41306340, Gene Annotations: [{'prefix': 'mass by suppressing', 'exact': 'PPARγ', 'postfix': 'and C/EBPα expression,', 'tags': [{'name': 'PPAR', 'uri': 'https://www.uniprot.org/uniprotkb/Q07869/entry'}], 'id': 'http://europepmc.org/article/MED/41306340#europepmc-89c5c162fba38faa7efd2f5ddadf3ff7', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'suppressing PPARγ and', 'exact': 'C/EBPα', 'postfix': 'expression, and increased', 'tags': [{'name': 'stk3', 'uri': 'https://www.uniprot.org/uniprotkb/Q6IP06/entry'}], 'id': 'http://europepmc.org/article/MED/41306340#europepmc-3f4b25bf5d66238edb50a654cfb7a2af', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'through upregulation of', 'exact': 'UCP1', 'postfix': 'and PGC-1α.', 'tags': [{'name': 'SLC25A7', 'uri': 'https://www.uniprot.org/uniprotkb/Q18P97/entry'}], 'id': 'http://europepm

Processing article ID chunks:  86%|████████▌ | 6/7 [00:07<00:01,  1.34s/it]

Article ID: MED:41046314, Gene Annotations: []
Article ID: MED:41216852, Gene Annotations: []
Article ID: MED:41240330, Gene Annotations: [{'prefix': 'monosaccharide composition of', 'exact': 'CYP', 'postfix': 'was determined using', 'tags': [{'name': 'CYP', 'uri': 'https://www.uniprot.org/uniprotkb/Q26551/entry'}], 'id': 'http://europepmc.org/article/MED/41240330#europepmc-96a7c867e7fe52581054baf1438399b0', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'results showed that', 'exact': 'CYP', 'postfix': 'was composed of', 'tags': [{'name': 'CYP', 'uri': 'https://www.uniprot.org/uniprotkb/Q26551/entry'}], 'id': 'http://europepmc.org/article/MED/41240330#europepmc-fff321eac986bd78ea5b575359ca332f', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'anti-obesity effects of', 'exact': 'CYP', 'postfix': 'were evaluated using', 'tags': [{'name': '

Processing article ID chunks: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]


Article ID: MED:40868627, Gene Annotations: []
Article ID: PPR:PPR1136007, Gene Annotations: []
Sample articleId and annotations: [('MED:41366037', [{'prefix': 'The genes', 'exact': 'AKR1C3', 'postfix': 'and KYNU were', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-e7624b3b574e9c52e73c2c5f6ea68537', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'genes AKR1C3 and', 'exact': 'KYNU', 'postfix': 'were identified as', 'tags': [{'name': 'KYNU', 'uri': 'https://www.uniprot.org/uniprotkb/Q16719/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-f40a7f4b564c801a6e5c22e8242d5725', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'elevated expression of', 'exact': 'AKR1C3', 'postfix': 'and KYNU in', 'tags': [{'name': 'AKR1C3', 'uri': 

Counting annotations: 100%|██████████| 50/50 [00:00<00:00, 8466.16it/s]


top_targets sample: [('q9i8a9', 10), ('p23968', 10), ('q91285', 10), ('q9frt8', 8), ('p70031', 8), ('q26551', 8), ('p54646', 7), ('q28605', 7), ('q99808', 7), ('q96eb6', 6), ('o54968', 6), ('q5r7c9', 5), ('p32301', 5), ('q19s50', 4), ('q07869', 4), ('q91573', 4), ('q25bc2', 3), ('q5r685', 3), ('a0a0k0mjn3', 3), ('q4h0t5', 3), ('c8vn86', 3), ('p42336', 3), ('p32214', 3), ('p17165', 3), ('p35637', 3), ('p06762', 3), ('p57790', 3), ('o62685', 2), ('p54676', 2), ('p31750', 2), ('q9bea1', 2), ('q29524', 2), ('p28321', 2), ('q15848', 2), ('q8hyl6', 2), ('q0jkv1', 2), ('q01687', 2), ('q5rd31', 2), ('a1ry32', 2), ('p54864', 2), ('p17405', 2), ('q9ntg7', 2), ('o15034', 2), ('q6yfq2', 2), ('q8ng75', 2), ('p81626', 2), ('o14788', 2), ('q9ese2', 2), ('q16719', 1), ('p43220', 1), ('q5pxe2', 1), ('q0z8i9', 1), ('q8k299', 1), ('q6zmj2', 1), ('p12725', 1), ('q5r536', 1), ('p06759', 1), ('p27917', 1), ('q6dce3', 1), ('p46368', 1), ('p32872', 1), ('q9y337', 1), ('q6wn34', 1), ('o02466', 1), ('q9nya3', 1

Building target metadata: 100%|██████████| 50/50 [00:00<00:00, 10016.01it/s]


In [116]:
print(pd.DataFrame(rows).head(200))

          name accession                                     uniprot_url  \
0        hif1a    Q9I8A9  https://www.uniprot.org/uniprotkb/Q9I8A9/entry   
1      YHR026W    P23968  https://www.uniprot.org/uniprotkb/P23968/entry   
2         PFR1    Q91285  https://www.uniprot.org/uniprotkb/Q91285/entry   
3          LEG    Q9FRT8  https://www.uniprot.org/uniprotkb/Q9FRT8/entry   
4        cckar    P70031  https://www.uniprot.org/uniprotkb/P70031/entry   
..         ...       ...                                             ...   
155    C40H1.4    Q03574  https://www.uniprot.org/uniprotkb/Q03574/entry   
156      PVRL1    Q9GL76  https://www.uniprot.org/uniprotkb/Q9GL76/entry   
157     COL6A2    P12110  https://www.uniprot.org/uniprotkb/P12110/entry   
158      MADH3    P84024  https://www.uniprot.org/uniprotkb/P84024/entry   
159  F11F19.25    Q9SJ66  https://www.uniprot.org/uniprotkb/Q9SJ66/entry   

     frequency  n_articles                                           articles  \
0     

In [117]:
import pandas as pd
import json

# rows already built above
df_rows = pd.DataFrame(rows)

# If you have list/dict columns (e.g. articles, article_links), stringify them
for col in ["articles", "article_links"]:
    if col in df_rows.columns:
        df_rows[col] = df_rows[col].apply(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)

# Write to CSV
df_rows.to_csv("epmc_top_targets.csv", index=False)
