In [1]:
import requests
import pandas as pd
from collections import Counter
from typing import List, Tuple, Dict, Any
from tqdm import tqdm

# ---------- 1. Fetch articles from Europe PMC ----------


In [15]:
def fetch_epmc_articles(query: str,
                        from_year: int = 2024,
                        to_year: int = 2025,
                        max_results: int = 2000) -> pd.DataFrame:
    
    """Fetches articles from Europe PMC.
    https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API/search_articles_get

    Parameters
    ----------
    query : str
        Search query
    from_year : int
        From year, default 2024
    to_year : int
        To year, default 2025
    max_results : int
        Maximum number of results to fetch, default 2000

    Returns
    -------
    pd.DataFrame
        DataFrame of articles with columns:
        ['id', 'source', 'pmid', 'pmcid', 'doi', 'title', 'abstract', 'pubYear', 'primary_url']
    """

    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" # RestFul API endpoint
    all_rows = []                                                   # Accumulated results
    page_size = 10                                                # Max allowed per page. WebService limits extraction to 1000 per request. Do not increase beyond 1000.
    page = 0                                                        # To store page number  
    

    pbar = tqdm(total=max_results, desc="Fetching articles", unit="articles")

    while len(all_rows) < max_results:                              # Loop until we reach max_results, default 2000. Increasing this may lead to timeouts.  
        params = {
            "query": f"{query} AND PUB_YEAR:[{from_year} TO {to_year}]",
            "format": "json",
            "pageSize": page_size,
            "page": page,
            "resultType": "core",                                   # core: returns full metadata for a given publication ID; including abstract, full text links, and MeSH terms
        }
        
        response = requests.get(url, params=params, timeout=30)
        if not response.ok:
            print(f"Request failed on page {page}: {response.status_code}")
            break
            
        results = response.json()                                  # Parse JSON response
        #print(type(results))
        articles = results.get("resultList", {}).get("result", []) # Extract articles
        #print(f"Articles type: {type(articles)}")
        #print(f"Number of articles fetched: {len(articles)}")
        #print(articles[0])  # Print the first article to inspect its structure
        
        if not articles:  # No more results
            break
            
        # Process current page
        for art in articles:
            if len(all_rows) >= max_results:
                break

            primary_url = ""
            
            # 1) First fullTextUrl if available (highest priority)
            ft_list = art.get("fullTextUrlList", {})
            if ft_list and ft_list.get("fullTextUrl"):
                first_ft_url = ft_list["fullTextUrl"][0].get("url")
                if first_ft_url:
                    primary_url = first_ft_url
            
            # 2) Fallback to canonical links if no full text
            if not primary_url:
                pmcid = (art.get("pmcid") or "").strip()
                pmid = (art.get("pmid") or "").strip()
                doi = (art.get("doi") or "").strip()
                
                if pmcid:
                    core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
                    primary_url = f"https://europepmc.org/article/PMC/{core}"
                elif pmid:
                    primary_url = f"https://europepmc.org/abstract/MED/{pmid}"
                elif doi:
                    primary_url = f"https://doi.org/{doi}"
            # -------------------------------------

            all_rows.append({
                "id": art.get("id", ""),
                "source": art.get("source", ""),
                "pmid": art.get("pmid", ""),
                "pmcid": art.get("pmcid", ""),
                "doi": art.get("doi", ""),
                "title": art.get("title", ""),
                "abstract": art.get("abstractText", art.get("abstract", "")),
                "pubYear": art.get("pubYear", ""),
                "primary_url": primary_url,  
            })
        
        page += 1
        pbar.update(len(articles))
        pbar.set_postfix({"page": page, "total": len(all_rows)})
    
    pbar.close()  # Clean up progress bar
    
    df = pd.DataFrame(all_rows[:max_results])  # Trim to requested max
    return df

df_articles = fetch_epmc_articles("obesity AND targets", 2023, 2025,10)

Fetching articles:   0%|          | 0/10 [00:00<?, ?articles/s]

Fetching articles: 100%|██████████| 10/10 [00:01<00:00,  5.29articles/s, page=1, total=10]


In [None]:
def fetch_epmc_articles(query: str,
                        from_year: int = 2024,
                        to_year: int = 2025,
                        max_results: int = 2000) -> pd.DataFrame:
    
    """Fetches articles from Europe PMC.
    https://europepmc.org/RestfulWebService#!/Europe32PMC32Articles32RESTful32API/search_articles_get

    Parameters
    ----------
    query : str
        Search query
    from_year : int
        From year, default 2024
    to_year : int
        To year, default 2025
    max_results : int
        Maximum number of results to fetch, default 2000

    Returns
    -------
    pd.DataFrame
        DataFrame of articles with columns:
        ['id', 'source', 'pmid', 'pmcid', 'doi', 'title', 'abstract', 'pubYear', 'primary_url']
        e.g. 
          id          source    pmid      pmcid  doi                            title                     abstract       pubYear  primary_url
          41366037    MED       41366037         10.1038/s41598-025-31533-w     Network pharmacology...   This study...  2025     https://doi.org/10.1038/s41598-025-31533-w
    """

    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" # RestFul API endpoint
    all_rows = []                                                   # Accumulated results
    page_size = 1000                                                # Max allowed per page. WebService limits extraction to 1000 per request. Do not increase beyond 1000.
    page = 0                                                        # To store page number  
    
    pbar = tqdm(total=max_results, desc="Fetching articles", unit="articles")
    
    while len(all_rows) < max_results:                              # Loop until we reach max_results, default 2000. Increasing this may lead to timeouts.  
        params = {
            "query": f"{query} AND PUB_YEAR:[{from_year} TO {to_year}]",
            "format": "json",
            "pageSize": page_size,
            "page": page,
            "resultType": "core",                                   # core: returns full metadata for a given publication ID; including abstract, full text links, and MeSH terms
        }
        
        response = requests.get(url, params=params, timeout=30)
        if not response.ok:
            print(f"Request failed on page {page}: {response.status_code}")
            break
            
        results = response.json()                                  # Parse JSON response into a dictionary 
        articles = results.get("resultList", {}).get("result", []) # Extract articles and return as a list of dictionaries
        
        if not articles:  # No more results
            break
            
        # Process current page
        for art in articles:                                        # Loop through each article dictionary
            if len(all_rows) >= max_results:                        # Check if we've reached max_results
                break
        
            # Extract primary URL of the article
            primary_url = ""
            
            # First preference : fullTextUrl if available 
            ft_list = art.get("fullTextUrlList", {})                # Get fullTextUrlList dictionary
            if ft_list and ft_list.get("fullTextUrl"):              # If fullTextUrl key exists
                first_ft_url = ft_list["fullTextUrl"][0].get("url") # Extract URL
                if first_ft_url:                                    # If URL is not empty  
                    primary_url = first_ft_url                      # Set as primary URL       
            
            # Fallback : canonical links if no full text
            if not primary_url:
                pmcid = (art.get("pmcid") or "").strip()            # Extract PMCID if available
                pmid = (art.get("pmid") or "").strip()              # Extract PMID if available
                doi = (art.get("doi") or "").strip()                # Extract DOI if available
                
                # PMC123456 → "123456"  → https://europepmc.org/article/PMC/123456
                # MEDABC123 → "ABC123"  → https://europepmc.org/article/MED/ABC123
                # 123456    → "123456"  → https://europepmc.org/article/PMC/123456

                if pmcid:
                    core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
                    primary_url = f"https://europepmc.org/article/PMC/{core}"
                elif pmid:
                    primary_url = f"https://europepmc.org/abstract/MED/{pmid}"
                elif doi:
                    primary_url = f"https://doi.org/{doi}"
            # Collect all relevant fields into a row : extract needed fields from article dictionary with "" as default if key not present 
            all_rows.append({
                "id": art.get("id", ""),
                "source": art.get("source", ""),
                "pmid": art.get("pmid", ""),
                "pmcid": art.get("pmcid", ""),
                "doi": art.get("doi", ""),
                "title": art.get("title", ""),
                "abstract": art.get("abstractText", art.get("abstract", "")),
                "pubYear": art.get("pubYear", ""),
                "primary_url": primary_url,  
            })
        
        page += 1
        pbar.update(len(articles))
        pbar.set_postfix({"page": page, "total": len(all_rows)})
    
    pbar.close()  # Clean up progress bar
    
    df = pd.DataFrame(all_rows[:max_results])  # Trim to requested max
    return df

df_articles = fetch_epmc_articles("obesity AND targets", 2023, 2025,50)

Fetched page 1 (1000 articles, total: 50)


In [86]:
ANN_URL = "https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds"


def get_gene_annotations_for_articles(article_ids: List[str],
                                      chunk_size: int = 8) -> Dict[str, List[Dict[str, Any]]]:
    """
    Call annotationsByArticleIds in small chunks to avoid 414 and API limits. [web:74]
    Returns mapping articleId -> list of gene/protein annotations.
    """
    print("Fetching gene annotations for articles...")
    out: Dict[str, List[Dict[str, Any]]] = {}

    # Convert range to list for tqdm
    chunks = list(range(0, len(article_ids), chunk_size))
    for start in tqdm(chunks, desc="Processing article ID chunks"):
        chunk = article_ids[start:start + chunk_size]
        params = {
            "articleIds": ",".join(chunk),
            "type": "Gene_Proteins",
            "section": "Abstract",
            "provider": "Europe PMC",   
            "format": "JSON",
        }
   #     print(f"Annotations API Request params: {params}")
        r = requests.get(ANN_URL, params=params, timeout=60)
        if not r.ok:
            tqdm.write(f"Annotations API error {r.status_code} for chunk starting at {start}: {r.url}")
            continue
   #     print(f"Annotations API Request text: {r.text}")

        data = r.json()  # list of {"articleId": "...", "annotations": [...]}
        if isinstance(data, dict):
            data = data.get("annotationsByArticle", [])

        for entry in data:
            source = entry.get("source")
            ext_id = entry.get("extId")
            if source and ext_id:
                 aid = f"{source}:{ext_id}"
            else:
                aid = ext_id or source  # fallback if one is missing
            anns = entry.get("annotations", [])
            gene_anns = [a for a in anns if a.get("type", "").lower().startswith("gene_proteins")]
            out[aid] = gene_anns
            print(f"Article ID: {aid}, Gene Annotations: {(gene_anns)}")
    #print(f"Fetched gene annotations for {len(out)} articles.")
    print(f"Sample articleId and annotations: {list(out.items())}")
    return out


In [87]:
def build_article_id_token(row: pd.Series) -> str:
    """
    Build a Europe PMC annotations API ID of the form 'SOURCE:ext_id'. [web:88][web:124]

    Priority:
      1) MED:PMID   for PubMed records
      2) PMC:PMCID  for full-text PubMed Central (remove leading 'PMC' if present)
      3) source:id  as a generic fallback (e.g. PPR:xxxx, AGR:xxxx).
    """
    pmid = (row.get("pmid") or "").strip()
    pmcid = (row.get("pmcid") or "").strip()
    source = (row.get("source") or "").strip()
    eid = (row.get("id") or "").strip()

    # PubMed
    if pmid:
        return f"MED:{pmid}"

    # PubMed Central (pmcid often like 'PMC1234567')
    if pmcid:
        core = pmcid.replace("PMC", "") if pmcid.upper().startswith("PMC") else pmcid
        return f"PMC:{core}"

    # Other sources (preprints, Agricola, etc.) [web:70][web:59]
    if source and eid:
        return f"{source}:{eid}"

    return ""


In [113]:
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any, Set
from urllib.parse import urlparse

def _extract_uniprot_accession(uri: str) -> str:
    path = urlparse(uri).path.strip("/")
    parts = path.split("/")
    return parts[1] if len(parts) > 1 else parts[0]

def build_top_targets_from_epmc(df_articles: pd.DataFrame,
                                top_k: int = 100
                                ) -> Tuple[List[Tuple[str, int]], Dict[str, Any]]:
    """
    Function to transform the article and annotation data from Europe PMC into a list of top targets 
    based on frequency of mentions across articles and number of articles mentioning each target. 
        
    Parameters
    ----------
    df_articles : pd.DataFrame
        DataFrame of articles with columns: ['id', 'source', 'pmid', 'pmcid', 'doi', 'title', 'abstract', 'pubYear', 'primary_url']
    top_k : int
        Number of top targets to return based on frequency. Default is 100.
    Returns
    -------
    Tuple[List[Tuple[str, int]], Dict[str, Any]]
        - top_targets : List of top_k targets as (key, frequency) tuples sorted by frequency descending.
        - target_info : Dictionary mapping target key to metadata including name, accession, uniprot_url, frequency, n_articles, and list of articleIdTokens.

    """
    print("Building articleIdTokens...")
    df = df_articles.copy()                                                               # Copy to avoid modifying original DataFrame
    df["articleIdToken"] = df.apply(build_article_id_token, axis=1)                       # Build articleIdTokens to send to Annotations API
    df = df[df["articleIdToken"] != ""]                                                   # Filter out rows with empty articleIdTokens
    print(f"Filtered to {len(df)} articles with valid articleIdTokens from original {len(df_articles)}.")
    tokens = df["articleIdToken"].tolist()                                                # Extract list of articleIdTokens  
    print("Fetching gene/protein annotations from Europe PMC...")
    freq = Counter()                                                                      # Counter to track target frequencies
    ann_map = get_gene_annotations_for_articles(tokens, chunk_size=8)                     # Fetch gene/protein annotations for articles   
    if ann_map:
        n_anns = sum(len(anns) for anns in ann_map.values())
        print(f"Total gene/protein annotations fetched: {n_anns}")
    else:
        print("No annotations returned for any article.")

    # Extract gene/protein targets and calculate frequency
    print("First pass: counting target frequencies...") 
    for aid, anns in tqdm(ann_map.items(),desc="Counting annotations"):                  # Loop through each article
        for ann in anns:                                                                 # Loop through each annotation
            tags = ann.get("tags") or []                                                 # Extract tags 
            if not tags:
                continue
            for tag in tags:                                                             # Loop through each gene / protein tag 
                name = (tag.get("name") or "").strip()                                   # Extract name if available
                uri = (tag.get("uri") or "").strip()                                     # Extract URI if available
                if not uri and not name:
                    continue
                acc = _extract_uniprot_accession(uri) if uri else ""                     # Extract Uniprot accession if URI available 
                key = acc.lower() if acc else name.lower()                               # Use accession as key if available, else name (case insensitive)
                freq[key] += 1                                                           # Increment frequency counter for this target  

    top_targets = freq.most_common(top_k)                                                # Get top_k targets by frequency
    
    # Build set of keys of top k targets for quick lookup
    top_keys: Set[str] = {k for k, _ in top_targets}

    # Build metadata for top targets
    print("Second pass: building target metadata...")
    target_info: Dict[str, Dict[str, Any]] = {}                                          # Mapping from target key to metadata  
    for aid, anns in tqdm(ann_map.items(),desc="Building target metadata"):
        for ann in anns:
            tags = ann.get("tags") or []                                                 # Extract tags
            if not tags:                                                                 # Skip if no tags  
                continue
            for tag in tags:                                                            # Loop through each gene / protein tag
                name = (tag.get("name") or "").strip()                                  # Extract name if available
                uri = (tag.get("uri") or "").strip()                                    # Extract URI if available
                if not uri and not name:
                    continue
                acc = _extract_uniprot_accession(uri) if uri else ""                    # Extract Uniprot accession if URI available
                key = acc.lower() if acc else name.lower()                              # Use accession as key if available, else name (case insensitive)   

                if key not in top_keys:                                                # Skip if not in top k targets (to limit computation and memory)
                    continue

                if key not in target_info:                                             # Initialize metadata for this target if not already present
                    target_info[key] = {
                        "key": key,
                        "frequency": 0,
                        "articles": {}  # {aid: primary_url}
                        "name": name,
                        "accession": acc,
                        "uniprot_url": uri
                }

                articles = target_info[key]["articles"]                                 # Get existing articles dict already stored for this target 
                if aid not in articles:                                                 # Initialize list for this article if not already present
                    primary_url = id_to_primary.get(aid, "")                            # Lookup once per article
                    articles[aid] = primary_url                                         # Store primary_url for this articleIdToken

                target_info[key]["frequency"] += 1                                      # Increment frequency for this target

    # 4) Convert article dicts to sorted lists for serialization
    for key, info in target_info.items():
        info["n_articles"] = len(info["articles"])
        info["article_links"] = info["articles"]

    return top_targets, target_info                                                                # Extract first tag    
            
                

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2127031668.py, line 89)

In [114]:
# Build mapping from articleIdToken -> URLs / primary_url
df_with_tokens = df_articles.copy()
df_with_tokens["articleIdToken"] = df_with_tokens.apply(build_article_id_token, axis=1)

id_to_primary = (
    df_with_tokens
    .set_index("articleIdToken")["primary_url"]
    .to_dict()
)


In [115]:
# Example usage:

print(f"Fetched {len(df_articles)} articles from Europe PMC.")
print(df_articles.head())
top_targets, target_info = build_top_targets_from_epmc(df_articles, top_k=500)

rows = []
for key, count in top_targets:
    info = target_info.get(key, {})
    article_tokens = info.get("articles", [])

      # Extract representative metadata from FIRST article's FIRST tag
    first_article_tags = list(article_tokens.values())[0] if article_tokens else []
    first_tag = first_article_tags[0] if first_article_tags else {}
    
    article_links = []
    for aid in article_tokens.keys():  # Use .keys() for article IDs
        primary = id_to_primary.get(aid, "")
        if primary:
            article_links.append({
                "articleIdToken": aid,
                "primary_url": primary,
            })
    
    rows.append({
        "name": first_tag.get("name", key),           # ✅ From first tag
        "accession": first_tag.get("accession"),      # ✅ From first tag
        "uniprot_url": first_tag.get("uri"),          # ✅ Use "uri" not "uniprot_url"
        "frequency": info.get("frequency", count),
        "n_articles": info.get("n_articles", 0),
        "articles": article_tokens,
        "article_links": article_links,
    })


Fetched 50 articles from Europe PMC.
         id source      pmid pmcid                           doi  \
0  41366037    MED  41366037          10.1038/s41598-025-31533-w   
1  41379482    MED  41379482                   10.2337/dc25-1093   
2  41398384    MED  41398384          10.1038/s41366-025-01979-z   
3  41387344    MED  41387344                   10.1111/obr.70059   
4  41396202    MED  41396202        10.1097/cm9.0000000000003919   

                                               title  \
0  Network pharmacology identifies AKR1C3 and KYN...   
1  Midchildhood Obesity After Exposure to Gestati...   
2  Genome-wide meta-analysis with 2,206,440 indiv...   
3  Circulating Proteins Link Obesity With Cardiac...   
4  Obesity and emerging intervention strategies: ...   

                                            abstract pubYear  \
0  This study aimed to identify and validate key ...    2025   
1  <h4>Objective</h4>To assess how treatment type...    2025   
2  <h4>Background</h4>The

Processing article ID chunks:   0%|          | 0/7 [00:00<?, ?it/s]

Processing article ID chunks:  14%|█▍        | 1/7 [00:01<00:06,  1.04s/it]

Article ID: MED:41366037, Gene Annotations: [{'prefix': 'The genes', 'exact': 'AKR1C3', 'postfix': 'and KYNU were', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-e7624b3b574e9c52e73c2c5f6ea68537', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'genes AKR1C3 and', 'exact': 'KYNU', 'postfix': 'were identified as', 'tags': [{'name': 'KYNU', 'uri': 'https://www.uniprot.org/uniprotkb/Q16719/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-f40a7f4b564c801a6e5c22e8242d5725', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'elevated expression of', 'exact': 'AKR1C3', 'postfix': 'and KYNU in', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#eur

Processing article ID chunks:  29%|██▊       | 2/7 [00:02<00:06,  1.28s/it]

Article ID: MED:41032481, Gene Annotations: [{'prefix': 'also assessed.<h4>Results</h4>', 'exact': 'Phosphoinositide-3-kinase regulatory subunit 1', 'postfix': 'regulatory subunit 1', 'tags': [{'name': 'DDB_G0283385', 'uri': 'https://www.uniprot.org/uniprotkb/P18160/entry'}], 'id': 'http://europepmc.org/article/MED/41032481#europepmc-8e82f6cfb0b9c39a4a6aa062663eea22', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'regulatory subunit 1 (', 'exact': 'PIK3R1', 'postfix': '), a key mediator', 'tags': [{'name': 'PIK3R1', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R685/entry'}], 'id': 'http://europepmc.org/article/MED/41032481#europepmc-ce99230990fadedb80630ddd8ecba2c2', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'mediator in the', 'exact': 'phosphoinositide 3-kinase', 'postfix': '3-kinase (PI3K)/ Protein', 'tags': [{'name': 'SPI3K-1', 'u

Processing article ID chunks:  43%|████▎     | 3/7 [00:03<00:04,  1.11s/it]

Article ID: MED:41344603, Gene Annotations: [{'prefix': 'the presence of', 'exact': 'GLP-1 receptor', 'postfix': 'receptor agonists with', 'tags': [{'name': 'Glpr', 'uri': 'https://www.uniprot.org/uniprotkb/P32301/entry'}], 'id': 'http://europepmc.org/article/MED/41344603#europepmc-df4ffefb62b7e99b617cd3f9531a7eb3', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': '', 'exact': 'Amylin', 'postfix': ', a neuroendocrine hormone', 'tags': [{'name': 'IAPP', 'uri': 'https://www.uniprot.org/uniprotkb/Q28605/entry'}], 'id': 'http://europepmc.org/article/MED/41344603#europepmc-244bf19d318b9f0b082d3b6bffa83044', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'hormone co-released with', 'exact': 'insulin', 'postfix': ', controls hunger, gastric', 'tags': [{'name': 'LEG', 'uri': 'https://www.uniprot.org/uniprotkb/Q9FRT8/entry'}], 'id': 'http://europepm

Processing article ID chunks:  57%|█████▋    | 4/7 [00:04<00:03,  1.10s/it]

Article ID: MED:41228448, Gene Annotations: [{'prefix': 'lipid metabolism, and', 'exact': 'insulin', 'postfix': 'signaling.', 'tags': [{'name': 'LEG', 'uri': 'https://www.uniprot.org/uniprotkb/Q9FRT8/entry'}], 'id': 'http://europepmc.org/article/MED/41228448#europepmc-5d79181871f85068c82eaf467ea0349a', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}]
Article ID: MED:40884426, Gene Annotations: [{'prefix': 'Network analysis highlighted', 'exact': 'JUN', 'postfix': ', TOP2A, APOE, and', 'tags': [{'name': 'Transcription factor AP-1 subunit Jun', 'uri': 'https://www.uniprot.org/uniprotkb/P54864/entry'}], 'id': 'http://europepmc.org/article/MED/40884426#europepmc-78492fed7f8478cfa717d8197bc61376', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'analysis highlighted JUN,', 'exact': 'TOP2A', 'postfix': ', APOE, and LEP', 'tags': [{'name': 'TOP2A', 'uri': 'ht

Processing article ID chunks:  71%|███████▏  | 5/7 [00:05<00:02,  1.02s/it]

Article ID: MED:41306340, Gene Annotations: [{'prefix': 'mass by suppressing', 'exact': 'PPARγ', 'postfix': 'and C/EBPα expression,', 'tags': [{'name': 'PPAR', 'uri': 'https://www.uniprot.org/uniprotkb/Q07869/entry'}], 'id': 'http://europepmc.org/article/MED/41306340#europepmc-89c5c162fba38faa7efd2f5ddadf3ff7', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'suppressing PPARγ and', 'exact': 'C/EBPα', 'postfix': 'expression, and increased', 'tags': [{'name': 'stk3', 'uri': 'https://www.uniprot.org/uniprotkb/Q6IP06/entry'}], 'id': 'http://europepmc.org/article/MED/41306340#europepmc-3f4b25bf5d66238edb50a654cfb7a2af', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'through upregulation of', 'exact': 'UCP1', 'postfix': 'and PGC-1α.', 'tags': [{'name': 'SLC25A7', 'uri': 'https://www.uniprot.org/uniprotkb/Q18P97/entry'}], 'id': 'http://europepm

Processing article ID chunks:  86%|████████▌ | 6/7 [00:07<00:01,  1.34s/it]

Article ID: MED:41046314, Gene Annotations: []
Article ID: MED:41216852, Gene Annotations: []
Article ID: MED:41240330, Gene Annotations: [{'prefix': 'monosaccharide composition of', 'exact': 'CYP', 'postfix': 'was determined using', 'tags': [{'name': 'CYP', 'uri': 'https://www.uniprot.org/uniprotkb/Q26551/entry'}], 'id': 'http://europepmc.org/article/MED/41240330#europepmc-96a7c867e7fe52581054baf1438399b0', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'results showed that', 'exact': 'CYP', 'postfix': 'was composed of', 'tags': [{'name': 'CYP', 'uri': 'https://www.uniprot.org/uniprotkb/Q26551/entry'}], 'id': 'http://europepmc.org/article/MED/41240330#europepmc-fff321eac986bd78ea5b575359ca332f', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'anti-obesity effects of', 'exact': 'CYP', 'postfix': 'were evaluated using', 'tags': [{'name': '

Processing article ID chunks: 100%|██████████| 7/7 [00:08<00:00,  1.22s/it]


Article ID: MED:40868627, Gene Annotations: []
Article ID: PPR:PPR1136007, Gene Annotations: []
Sample articleId and annotations: [('MED:41366037', [{'prefix': 'The genes', 'exact': 'AKR1C3', 'postfix': 'and KYNU were', 'tags': [{'name': 'AKR1C3', 'uri': 'https://www.uniprot.org/uniprotkb/Q5R7C9/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-e7624b3b574e9c52e73c2c5f6ea68537', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'genes AKR1C3 and', 'exact': 'KYNU', 'postfix': 'were identified as', 'tags': [{'name': 'KYNU', 'uri': 'https://www.uniprot.org/uniprotkb/Q16719/entry'}], 'id': 'http://europepmc.org/article/MED/41366037#europepmc-f40a7f4b564c801a6e5c22e8242d5725', 'type': 'Gene_Proteins', 'section': 'Abstract (http://purl.org/dc/terms/abstract)', 'provider': 'Europe PMC'}, {'prefix': 'elevated expression of', 'exact': 'AKR1C3', 'postfix': 'and KYNU in', 'tags': [{'name': 'AKR1C3', 'uri': 

Counting annotations: 100%|██████████| 50/50 [00:00<00:00, 8466.16it/s]


top_targets sample: [('q9i8a9', 10), ('p23968', 10), ('q91285', 10), ('q9frt8', 8), ('p70031', 8), ('q26551', 8), ('p54646', 7), ('q28605', 7), ('q99808', 7), ('q96eb6', 6), ('o54968', 6), ('q5r7c9', 5), ('p32301', 5), ('q19s50', 4), ('q07869', 4), ('q91573', 4), ('q25bc2', 3), ('q5r685', 3), ('a0a0k0mjn3', 3), ('q4h0t5', 3), ('c8vn86', 3), ('p42336', 3), ('p32214', 3), ('p17165', 3), ('p35637', 3), ('p06762', 3), ('p57790', 3), ('o62685', 2), ('p54676', 2), ('p31750', 2), ('q9bea1', 2), ('q29524', 2), ('p28321', 2), ('q15848', 2), ('q8hyl6', 2), ('q0jkv1', 2), ('q01687', 2), ('q5rd31', 2), ('a1ry32', 2), ('p54864', 2), ('p17405', 2), ('q9ntg7', 2), ('o15034', 2), ('q6yfq2', 2), ('q8ng75', 2), ('p81626', 2), ('o14788', 2), ('q9ese2', 2), ('q16719', 1), ('p43220', 1), ('q5pxe2', 1), ('q0z8i9', 1), ('q8k299', 1), ('q6zmj2', 1), ('p12725', 1), ('q5r536', 1), ('p06759', 1), ('p27917', 1), ('q6dce3', 1), ('p46368', 1), ('p32872', 1), ('q9y337', 1), ('q6wn34', 1), ('o02466', 1), ('q9nya3', 1

Building target metadata: 100%|██████████| 50/50 [00:00<00:00, 10016.01it/s]


In [116]:
print(pd.DataFrame(rows).head(200))

          name accession                                     uniprot_url  \
0        hif1a    Q9I8A9  https://www.uniprot.org/uniprotkb/Q9I8A9/entry   
1      YHR026W    P23968  https://www.uniprot.org/uniprotkb/P23968/entry   
2         PFR1    Q91285  https://www.uniprot.org/uniprotkb/Q91285/entry   
3          LEG    Q9FRT8  https://www.uniprot.org/uniprotkb/Q9FRT8/entry   
4        cckar    P70031  https://www.uniprot.org/uniprotkb/P70031/entry   
..         ...       ...                                             ...   
155    C40H1.4    Q03574  https://www.uniprot.org/uniprotkb/Q03574/entry   
156      PVRL1    Q9GL76  https://www.uniprot.org/uniprotkb/Q9GL76/entry   
157     COL6A2    P12110  https://www.uniprot.org/uniprotkb/P12110/entry   
158      MADH3    P84024  https://www.uniprot.org/uniprotkb/P84024/entry   
159  F11F19.25    Q9SJ66  https://www.uniprot.org/uniprotkb/Q9SJ66/entry   

     frequency  n_articles                                           articles  \
0     

In [117]:
import pandas as pd
import json

# rows already built above
df_rows = pd.DataFrame(rows)

# If you have list/dict columns (e.g. articles, article_links), stringify them
for col in ["articles", "article_links"]:
    if col in df_rows.columns:
        df_rows[col] = df_rows[col].apply(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)

# Write to CSV
df_rows.to_csv("epmc_top_targets.csv", index=False)


### Integration of Google Engines


In [7]:
!pip install google-generativeai langextract requests beautifulsoup4 gilda pandas streamlit playwright
!playwright install  # Optional for JS sites


Collecting google-generativeai

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aext-shared 4.1.0 requires anaconda-cloud-auth>=0.7.1, which is not installed.
aext-share-notebook-server 4.1.0 requires anaconda-cloud-auth>=0.7.1, which is not installed.
aext-toolbox 4.1.0 requires anaconda-cloud-auth>=0.7.1, which is not installed.
aiobotocore 2.12.3 requires botocore<1.34.70,>=1.34.41, but you have botocore 1.42.15 which is incompatible.



  Downloading google_generativeai-0.8.6-py3-none-any.whl.metadata (3.9 kB)
Collecting langextract
  Downloading langextract-1.1.1-py3-none-any.whl.metadata (20 kB)
Collecting gilda
  Using cached gilda-1.4.1-py3-none-any.whl.metadata (9.2 kB)
Collecting playwright
  Downloading playwright-1.57.0-py3-none-win_amd64.whl.metadata (3.5 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.187.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.45.0-py2.py3-none-any.whl.metadata (6.8 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativ

In [28]:
import os
import re
from typing import List, Dict, Any
import requests
from bs4 import BeautifulSoup
import pandas as pd
import streamlit as st
import google.generativeai as genai
import langextract as lx
import gilda
from urllib.parse import urlparse

os.environ["GEMINI_API_KEY"]="AIzaSyAGFMhZ3REso3ZcOW3meeSJvlO7uo6CgZM"  #os.environ["GEMINI_API_KEY"] = st.secrets.get("GEMINI_API_KEY", "")

genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = "gemini-2.0-flash" #"gemini-1.5-pro"  # Or "gemini-1.5-flash" for speed

def fetch_web_content_from_df(df: pd.DataFrame, url_col: str = "primary_url") -> tuple:
    """Fetch content from DataFrame URLs, return contents + index mapping."""
    contents = []
    valid_indices = []
    
    for idx, row in df.iterrows():
        url = row[url_col]
        if pd.isna(url) or not isinstance(url, str):
            contents.append(f"Invalid URL at index {idx}")
            continue
            
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            # Clean: remove scripts, nav, footers, etc.
            for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
                tag.decompose()
            text = soup.get_text(separator=' ', strip=True)
            contents.append(text[:40])  # Gemini context limit
            valid_indices.append(idx)
        except Exception as e:
            contents.append(f"Error fetching {url}: {str(e)}")
            valid_indices.append(idx)
    
    return contents, valid_indices

def extract_bentities(contents: List[str], user_prompt: str) -> List[Dict]:
    """Extract biomedical entities using LangExtract + Gemini."""
    prompt = f"""
    {user_prompt}
    Focus on: genes/proteins (HGNC/UniProt), diseases, drugs/targets, evidence sentences.
    Output exact spans with context. Use biomedical terms only.
    """
    examples = [lx.data.ExampleData(  # Few-shot
        text="GLP1R mutations cause obesity via impaired signaling [PMC123].",
        extractions=[
            lx.data.Extraction(extraction_class="gene", extraction_text="GLP1R",
                             attributes={"type": "receptor", "evidence": "obesity signaling"}),
            lx.data.Extraction(extraction_class="disease", extraction_text="obesity",
                             attributes={"context": "impaired signaling"})
        ]
    )]
    
    results = []
    for i, text in enumerate(contents):
        result = lx.extract(
            text_or_documents=text,
            prompt_description=prompt,
            examples=examples,
            model_id=model
        )
        for ext in result.documents[0].extractions:
            results.append({
                'url_index': i,
                'entity': ext.extraction_text,
                'class': ext.extraction_class,
                'attributes': ext.attributes,
                'span_start': ext.span_start,
                'span_end': ext.span_end
            })
    return results[web:68]

def normalize_entities(entities: List[Dict]) -> List[Dict]:
    """Normalize with Gilda (your preferred tool)."""
    normalized = []
    for e in entities:
        if e['class'] in ['gene', 'protein']:
            grounds = gilda.ground(e['entity'])
            if grounds:
                best = max(grounds, key=lambda x: x.score)
                e['hgnc_id'] = getattr(best.entry, 'id', None)
                e['canonical_name'] = getattr(best.entry, 'name', e['entity'])
        normalized.append(e)
    return normalized[memory:49]
'''
# Streamlit UI (fits your PSP/TargetScraper style)
def main():
    st.title("🧬 BioLink Extractor")
    st.markdown("Gemini-powered biomedical extraction from weblinks")
    
    urls = st.text_area("URLs (one per line)", height=100).splitlines()
    prompt = st.text_area("Extraction prompt", 
                         value="Extract drug targets, diseases, and evidence.")
    
    if st.button("🚀 Extract", type="primary"):
        with st.spinner("Fetching & extracting..."):
            contents = fetch_web_content([u.strip() for u in urls if u.strip()])
            raw = extract_bentities(contents, prompt)
            df = pd.DataFrame(normalize_entities(raw))
            
            st.dataframe(df, use_container_width=True)
            st.download_button("📥 CSV", df.to_csv(index=False), "biomedical_entities.csv")
            
            if not df.empty:
                st.success(f"Extracted {len(df)} entities from {len(contents)} pages!")

if __name__ == "__main__":
    main()
'''


'\n# Streamlit UI (fits your PSP/TargetScraper style)\ndef main():\n    st.title("🧬 BioLink Extractor")\n    st.markdown("Gemini-powered biomedical extraction from weblinks")\n    \n    urls = st.text_area("URLs (one per line)", height=100).splitlines()\n    prompt = st.text_area("Extraction prompt", \n                         value="Extract drug targets, diseases, and evidence.")\n    \n    if st.button("🚀 Extract", type="primary"):\n        with st.spinner("Fetching & extracting..."):\n            contents = fetch_web_content([u.strip() for u in urls if u.strip()])\n            raw = extract_bentities(contents, prompt)\n            df = pd.DataFrame(normalize_entities(raw))\n            \n            st.dataframe(df, use_container_width=True)\n            st.download_button("📥 CSV", df.to_csv(index=False), "biomedical_entities.csv")\n            \n            if not df.empty:\n                st.success(f"Extracted {len(df)} entities from {len(contents)} pages!")\n\nif __name__ == "_

In [29]:
contents, indices = fetch_web_content_from_df(df_articles)
entities = extract_bentities(contents, "Any mouse knock out models tested?")
norm_entities = normalize_entities(entities)

[94m[1mLangExtract[0m: model=[92mgemini-2.0-flash[0m, current=[92m40[0m chars, processed=[92m0[0m chars:  [00:00]INFO: [2025-12-23 14:03:08] google_genai.models - AFC is enabled with max remote calls: 10.
INFO: [2025-12-23 14:03:08] httpx - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 429 Too Many Requests"
[94m[1mLangExtract[0m: model=[92mgemini-2.0-flash[0m, current=[92m40[0m chars, processed=[92m0[0m chars:  [00:00]


InferenceRuntimeError: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash\nPlease retry in 53.151898896s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '53s'}]}}

### Test over a downloaded LLM model

In [None]:
import json
import pandas as pd
from typing import Dict, Any

QUESTIONS = {
    "disease_linkage": [
        "Does the target show genetic, expression, or pathway associations with the disease?",
        "Are there contradictory findings or context-specific effects?"
    ],
    "validation_strength": [
        "Has the target-disease relationship been validated across study types?",
        "What are the effect sizes, p-values, and model outcomes?"
    ],
    "druggability_safety": [
        "Is the target druggable with suitable binding pockets or precedents?",
        "What are off-target effects, tolerance risks, or toxicity red flags?"
    ],
    "novelty_prioritization": [
        "How novel is the target with limited prior validation?",
        "What preclinical gaps exist?"
    ]
}

In [None]:
def build_prompt(title: str, abstract: str, full_text: str = None) -> str:
    context = f"Title: {title}\nAbstract: {abstract}"
    if full_text:
        context += f"\n\nFull Text Excerpt:\n{full_text[:3500]}"
    
    prompt = f"""You are an expert drug discovery researcher. Analyze this biomedical article for target discovery and prioritization evidence.

Article:
{context}

Answer these questions precisely. For each category, provide:
- Answer: Yes/No/Partial + brief explanation (1-2 sentences)
- Evidence: Key quotes or data supporting the answer
- Confidence: Low/Medium/High

Also list all gene/protein targets mentioned.

Questions:
"""
    for category, qs in QUESTIONS.items():
        prompt += f"\n## {category.replace('_',' ').title()}\n"
        for q in qs:
            prompt += f"- {q}\n"

    prompt += """
Output ONLY valid JSON in this exact schema:
{
  "overall_targets": ["gene1", "gene2"],
  "disease_linkage": {"answer": "...", "evidence": ["..."], "confidence": "High"},
  "validation_strength": {"answer": "...", "evidence": ["..."], "confidence": "Medium"},
  "druggability_safety": {"answer": "...", "evidence": ["..."], "confidence": "Low"},
  "novelty_prioritization": {"answer": "...", "evidence": ["..."], "confidence": "Medium"},
  "summary_score": "High/Medium/Low priority target"
}
Do not include any text before or after the JSON.
"""
    return prompt


def llm_analyze_article_local(title: str, abstract: str, full_text: str = None) -> Dict[str, Any]:
    prompt = build_prompt(title, abstract, full_text)
    
    out = llm(
        prompt,
        max_tokens=1024,
        temperature=0.1,
        stop=None,
        echo=False
    )
    text = out["choices"][0]["text"].strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # optional: try to salvage JSON with a heuristic, or return raw
        return {"error": "Failed to parse JSON", "raw": text}


In [None]:
def process_articles_local(df: pd.DataFrame, batch_size: int = 3) -> pd.DataFrame:
    results = []
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        print(f"Batch {i//batch_size + 1} with {len(batch)} articles")
        for _, row in batch.iterrows():
            analysis = llm_analyze_article_local(
                title=row["title"],
                abstract=row["abstract"],
                full_text=row.get("full_text", None)
            )
            analysis["title"] = row["title"]
            analysis["pmcid"] = row.get("pmcid", "")
            results.append(analysis)
    res_df = pd.DataFrame(results)

    # optional: flatten
    for key in QUESTIONS.keys():
        if key in res_df.columns:
            res_df[f"{key}_answer"] = res_df[key].apply(
                lambda x: x.get("answer", "") if isinstance(x, dict) else ""
            )
            res_df[f"{key}_confidence"] = res_df[key].apply(
                lambda x: x.get("confidence", "") if isinstance(x, dict) else ""
            )
    return res_df


if __name__ == "__main__":
    df_articles = pd.read_csv("your_epmc_articles.csv")  # title, abstract, pmcid, full_text?
    analyzed_df = process_articles_local(df_articles.head(10))  # start small on CPU
    analyzed_df.to_json("target_analysis_local.json", orient="records", indent=2)
    analyzed_df.to_csv("target_analysis_local.csv", index=False)


In [1]:
!pip install requests pandas langextract gilda beautifulsoup4

Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 108.9 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/60.8 kB 146.3 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/60.8 kB 219.0 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 231.1 kB/s eta 0:00:00
Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.6 MB 330.3 kB/s eta 0:00:39
   --

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.6 which is incompatible.
numba 0.59.1 requires numpy<1.27,>=1.22, but you have numpy 2.2.6 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 2.2.6 which is incompatible.
streamlit 1.32.0 requires numpy<2,>=1.19.3, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.


In [2]:
import os
import json
from typing import Dict, Any, List, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
import langextract as lx
import gilda

# ---------- CONFIG: Mistral API ----------
os.environ["MISTRAL_API_KEY"] = "Bpem6ZnjrB8PLFbzhBZFbqDzNtlqigI5"   # TODO: replace with your key or use env vars
MISTRAL_API_KEY = os.environ["MISTRAL_API_KEY"]

MISTRAL_BASE_URL = "https://api.mistral.ai/v1/chat/completions"
MISTRAL_MODEL_ID = "mistral-small-latest"   # or another Mistral chat model



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\aksha\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\aksha\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\aksha\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\aksha\anaconda3\Lib\site-pack

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\aksha\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\aksha\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\aksha\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\aksha\anaconda3\Lib\site-pack

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [None]:
def mistral_chat(prompt: str,
                 max_tokens: int = 1024,
                 temperature: float = 0.1) -> str:
    """Call Mistral API with a simple chat completion request."""
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": MISTRAL_MODEL_ID,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    resp = requests.post(MISTRAL_BASE_URL, headers=headers, json=payload, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    return data["choices"][0]["message"]["content"]
