In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
from Bio import Entrez
from google.colab import userdata

Entrez.email = "alabir.fuad@gmail.com"
Entrez.api_key = userdata.get('ENTREZ_API')

def fetch_pubmed_drug_targets(disease, max_papers=1000):
    terms = [
        f'("{disease}"[MeSH Terms] OR "{disease}")',
        '("drug target" OR "therapeutic target" OR "molecular target" OR biomarker OR pathway OR mechanism)',
        '("protein" OR "gene" OR "enzyme" OR "receptor" OR "kinase" OR "transporter")',
        '("small molecule" OR inhibitor OR agonist OR antagonist OR modulator)'
    ]
    q = " AND ".join(terms)
    print(q)
    res = Entrez.esearch(db="pubmed", term=q, retmax=max_papers)
    ids = Entrez.read(res)["IdList"]
    return ids

ids = fetch_pubmed_drug_targets("Alzheimer's disease")
print(len(ids))

("Alzheimer's disease"[MeSH Terms] OR "Alzheimer's disease") AND ("drug target" OR "therapeutic target" OR "molecular target" OR biomarker OR pathway OR mechanism) AND ("protein" OR "gene" OR "enzyme" OR "receptor" OR "kinase" OR "transporter") AND ("small molecule" OR inhibitor OR agonist OR antagonist OR modulator)
1000


In [None]:
import requests
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
import time

class PubMedRetriever:
    """
    A class to retrieve PubMed abstracts using NCBI's E-utilities API.
    """

    def __init__(self, email: str = None, tool: str = "pubmed_retriever"):
        """
        Initialize the PubMed retriever.

        Args:
            email: Your email address (recommended by NCBI)
            tool: Name of your tool/script
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.email = email
        self.tool = tool

    def get_abstract(self, pmid: str) -> Optional[Dict]:
        """
        Retrieve abstract and metadata for a given PubMed ID.

        Args:
            pmid: PubMed ID as string

        Returns:
            Dictionary containing article information or None if not found
        """
        try:
            # Construct the efetch URL
            url = f"{self.base_url}efetch.fcgi"
            params = {
                'db': 'pubmed',
                'id': pmid,
                'retmode': 'xml',
                'rettype': 'abstract'
            }

            if self.email:
                params['email'] = self.email
            if self.tool:
                params['tool'] = self.tool

            # Make the request
            response = requests.get(url, params=params)
            response.raise_for_status()

            # Parse XML response
            root = ET.fromstring(response.content)

            # Extract article information
            article_info = self._parse_article(root, pmid)
            return article_info

        except requests.RequestException as e:
            print(f"Error fetching PMID {pmid}: {e}")
            return None
        except ET.ParseError as e:
            print(f"Error parsing XML for PMID {pmid}: {e}")
            return None

    def get_multiple_abstracts(self, pmids: List[str], delay: float = 0.34) -> Dict[str, Dict]:
        """
        Retrieve abstracts for multiple PMIDs.

        Args:
            pmids: List of PubMed IDs as strings
            delay: Delay between requests in seconds (NCBI recommends max 3 requests/second)

        Returns:
            Dictionary mapping PMIDs to article information
        """
        results = {}

        for pmid in pmids:
            print(f"Fetching PMID: {pmid}")
            result = self.get_abstract(pmid)
            if result:
                results[pmid] = result
            else:
                print(f"Failed to retrieve PMID: {pmid}")

            # Be respectful to NCBI servers
            time.sleep(delay)

        return results

    def _parse_article(self, root: ET.Element, pmid: str) -> Dict:
        """
        Parse XML response and extract article information.

        Args:
            root: XML root element
            pmid: PubMed ID

        Returns:
            Dictionary containing parsed article information
        """
        article_info = {
            'pmid': pmid,
            'title': '',
            'abstract': '',
            'authors': [],
            'journal': '',
            'publication_date': '',
            'doi': '',
            'pmcid': ''
        }

        # Find the article element
        article = root.find('.//Article')
        if article is None:
            return article_info

        # Extract title
        title_elem = article.find('.//ArticleTitle')
        if title_elem is not None:
            article_info['title'] = ''.join(title_elem.itertext()).strip()

        # Extract abstract
        abstract_elem = article.find('.//Abstract/AbstractText')
        if abstract_elem is not None:
            article_info['abstract'] = ''.join(abstract_elem.itertext()).strip()
        else:
            # Handle structured abstracts
            abstract_parts = article.findall('.//Abstract/AbstractText')
            if abstract_parts:
                abstract_text = []
                for part in abstract_parts:
                    label = part.get('Label', '')
                    text = ''.join(part.itertext()).strip()
                    if label:
                        abstract_text.append(f"{label}: {text}")
                    else:
                        abstract_text.append(text)
                article_info['abstract'] = ' '.join(abstract_text)

        # Extract authors
        authors = article.findall('.//Author')
        for author in authors:
            lastname = author.find('LastName')
            forename = author.find('ForeName')
            if lastname is not None:
                name = lastname.text
                if forename is not None:
                    name = f"{forename.text} {name}"
                article_info['authors'].append(name)

        # Extract journal information
        journal = article.find('.//Journal/Title')
        if journal is not None:
            article_info['journal'] = journal.text

        # Extract publication date
        pub_date = article.find('.//PubDate')
        if pub_date is not None:
            year = pub_date.find('Year')
            month = pub_date.find('Month')
            day = pub_date.find('Day')

            date_parts = []
            if year is not None:
                date_parts.append(year.text)
            if month is not None:
                date_parts.append(month.text)
            if day is not None:
                date_parts.append(day.text)

            article_info['publication_date'] = ' '.join(date_parts)

        # Extract DOI and PMCID
        article_ids = root.findall('.//ArticleId')
        for aid in article_ids:
            id_type = aid.get('IdType')
            if id_type == 'doi':
                article_info['doi'] = aid.text
            elif id_type == 'pmc':
                article_info['pmcid'] = aid.text

        return article_info

In [None]:
retriever = PubMedRetriever()
results = retriever.get_multiple_abstracts(ids[:5])

Fetching PMID: 40946177
Fetching PMID: 40945382
Fetching PMID: 40945030
Fetching PMID: 40943644
Fetching PMID: 40942007


In [None]:
results[ids[0]]["abstract"]

"Maintenance of glutamate homeostasis is essential for synaptic plasticity and cognition. Disrupted glutamate-glutamine cycling causes chronic excitotoxicity, a key driver of cognitive deficits in Alzheimer's disease (AD), though regulatory mechanisms remain unclear. Pigment epithelium-derived factor (PEDF), a neuroprotective protein declining with age, is demonstrated here to play a novel role in synaptic glutamate clearance. Analysis of peripheral blood samples from 19 patients with AD and 75 non-dementia control subjects revealed lower levels of PEDF in patients, and loss of PEDF correlates with cognitive decline. PEDF-deficient mice exhibit defective learning and memory, and higher susceptibility to AD. Furthermore, PEDF deficiency impaired synaptic plasticity and dendritic spine morphology. Mechanistically, PEDF inhibits ubiquitin-proteasome-dependent degradation of astrocytic glutamate transporter-1 (GLT-1) and normally guarantees elimination of synaptic glutamate by modulating t

In [None]:
from google import genai

GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
client = genai.Client(api_key=GEMINI_API_KEY)

In [None]:
import json

def parse_json(json_string):
    try:
        json_string = response.text.strip().strip('```json').strip('```')
        drug_targets = json.loads(json_string)
        return drug_targets
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic string: {response.text}")
        return

In [None]:
import time, random

delay = 60
max_retries = 3
all_targets = {}

for id in ids:
    result = retriever.get_abstract(id)

    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=f'''
                From the following abstract, extract all **protein-coding genes or proteins** that are explicitly described as potential or validated **drug targets**, particularly for small-molecule therapeutics (e.g., inhibitors, agonists, antagonists, modulators).

                For each identified target, return an object with:
                - "name": the official gene/protein symbol (HGNC if possible)
                - "confidence": a float between 0 and 1 indicating how certain you are that this is a small-molecule druggable target in the given context.

                If there is not explicit mention of drug targets, return: {{"drug_target": []}}.

                Output only valid JSON in the exact form:
                {{"drug_target": [{{"name": "GENE", "confidence": 0.95}}, ...]}}.

                Abstract: {result["abstract"]}
                '''
            )
            drug_targets = parse_json(response.text)
            all_targets[id] = drug_targets
            print(id, "\t", drug_targets)
            break
        except Exception as e:
            print(f"Error fetching/parsing for ID {id}: {e}")
            if attempt < max_retries - 1:
                wait = delay + random.uniform(0, 1)
                print(f"Retrying in {wait:.1f}s...")
                time.sleep(wait)
            else:
                print(f"Failed after {max_retries} attempts. Skipping ID {id}.")

40946177 	 {'drug_target': [{'name': 'PEDF', 'confidence': 0.95}, {'name': 'GLT-1', 'confidence': 0.9}]}
40945382 	 {'drug_target': [{'name': 'RYR1', 'confidence': 0.95}, {'name': 'RYR3', 'confidence': 0.95}]}
40945030 	 {'drug_target': []}
40943644 	 {'drug_target': [{'name': 'AChE', 'confidence': 1.0}]}
40942007 	 {'drug_target': []}
40940766 	 {'drug_target': [{'name': 'tau', 'confidence': 0.95}, {'name': 'amyloid beta', 'confidence': 0.8}]}
40940748 	 {'drug_target': [{'name': 'HDAC', 'confidence': 1.0}]}
40940223 	 {'drug_target': [{'name': 'ERBB4', 'confidence': 1.0}, {'name': 'TLR4', 'confidence': 0.9}]}
40939527 	 {'drug_target': [{'name': 'Abl', 'confidence': 0.95}]}
40938528 	 {'drug_target': [{'name': 'GLP1R', 'confidence': 0.95}, {'name': 'GIPR', 'confidence': 0.95}]}
40938483 	 {'drug_target': [{'name': 'AChE', 'confidence': 0.95}, {'name': 'BChE', 'confidence': 0.95}]}
40938450 	 {'drug_target': [{'name': 'GST-38', 'confidence': 0.8}, {'name': 'GST-1', 'confidence': 0.8}]

In [None]:
len(all_targets.keys())

994

In [None]:
import json

with open('all_targets.json', 'w') as f:
    json.dump(all_targets, f)

In [None]:
from collections import defaultdict
import pandas as pd

def aggregate_targets(all_targets, top_n=10):
    agg = defaultdict(lambda: {"count": 0, "confidence_sum": 0.0})

    for _, data in all_targets.items():
        for t in data.get("drug_target", []):
            name = t["name"].strip().upper()
            conf = float(t["confidence"])
            agg[name]["count"] += 1
            agg[name]["confidence_sum"] += conf

    rows = []
    for name, stats in agg.items():
        avg_conf = stats["confidence_sum"] / stats["count"]
        score = stats["count"] * avg_conf
        rows.append({"name": name, "count": stats["count"], "avg_confidence": avg_conf, "score": score})

    df = pd.DataFrame(rows)
    df = df[df["count"] >= 5]
    df = df.sort_values(by="avg_confidence", ascending=False).reset_index(drop=True)
    return df.head(top_n)

top_targets = aggregate_targets(all_targets)
top_targets

Unnamed: 0,name,count,avg_confidence,score
0,MAO-B,7,0.964286,6.75
1,SGLT2,8,0.95625,7.65
2,BCHE,17,0.951765,16.18
3,CGAS,8,0.95,7.6
4,GLP-1 RECEPTOR,6,0.95,5.7
5,SEH,7,0.95,6.65
6,GLP1R,10,0.95,9.5
7,DYRK1A,7,0.941429,6.59
8,ACHE,42,0.936429,39.33
9,HDAC,7,0.935714,6.55


In [None]:
top_targets.to_csv("top_targets.csv", index=False)

In [None]:
from google import genai
from google.genai import types

# Configure the client
client = genai.Client(api_key=GEMINI_API_KEY)

# Define the grounding tool
grounding_tool = types.Tool(
    google_search=types.GoogleSearch()
)

# Configure generation settings
config = types.GenerateContentConfig(
    tools=[grounding_tool]
)

In [None]:
delay = 60
max_retries = 3
target_eval = {}

for target in top_targets["name"].values:
    for attempt in range(max_retries):
        try:
            # Make the request
            response = client.models.generate_content(
                model="gemini-2.5-pro",
                contents=f'''
                You are an expert biomedical research assistant specializing in drug discovery for Alzheimer’s Disease (AD).
                You have access to recent scientific literature and reasoning capabilities.

                TASK:
                Validate whether the following protein/gene is a promising **small-molecule drug target** for Alzheimer's Disease.

                Target: {target}

                INSTRUCTIONS:
                1. Use your search capabilities to find **recent (2020–2025)** peer-reviewed studies, reviews, or authoritative sources mentioning this target in the context of Alzheimer's Disease and small-molecule drug discovery.
                - Prefer experimental evidence (e.g., inhibition, activation, modulation, animal models, human studies).
                - Consider clinical trial data or high-quality preclinical studies.
                - Discard vague, speculative mentions without strong evidence.
                2. Evaluate the evidence considering:
                - **Confidence Score (0–1):** How likely this target is genuinely druggable by small molecules in AD (explicit experimental/clinical support, reproducibility).
                - **Novelty Score (0–1):** How recent and original the target is in the AD drug discovery landscape.
                    - 1 = very novel/emerging,
                    - 0 = heavily studied / saturated.
                - **Evidence Score (0–1):** Quality and robustness of evidence.
                    - 1 = strong experimental/clinical validation,
                    - 0 = anecdotal or speculative.
                3. Provide a **short reasoning trace** (2–3 sentences max) summarizing your judgment.

                OUTPUT FORMAT (JSON only, no explanations outside JSON):
                {{
                "target": {target},
                "confidence_score": 0.0-1.0,
                "novelty_score": 0.0-1.0,
                "evidence_score": 0.0-1.0,
                "reasoning": "Brief justification with supporting evidence."
                }}
                ''',
                config=config,
            )

            target_info = parse_json(response.text)
            target_eval[target] = target_info
            print(id, "\t", target_info)
            break
        except Exception as e:
            print(f"Error fetching/parsing for ID {id}: {e}")
            if attempt < max_retries - 1:
                wait = delay + random.uniform(0, 1)
                print(f"Retrying in {wait:.1f}s...")
                time.sleep(wait)
            else:
                print(f"Failed after {max_retries} attempts. Skipping ID {id}.")

39893485 	 {'target': 'MAO-B', 'confidence_score': 0.8, 'novelty_score': 0.1, 'evidence_score': 0.9, 'reasoning': "MAO-B is a well-established target with a long history in neurodegeneration, resulting in a low novelty score. However, its role in Alzheimer's-related oxidative stress is well-supported, and it is highly druggable with small molecules. [2, 4, 14] Confidence and evidence are high due to extensive recent preclinical development of novel inhibitors and an ongoing Phase 2a clinical trial for KDS2010, a selective MAO-B inhibitor, in patients with early Alzheimer's disease. [5, 8, 9]"}
39893485 	 {'target': 'SGLT2', 'confidence_score': 0.8, 'novelty_score': 0.9, 'evidence_score': 0.7, 'reasoning': "SGLT2 is a highly promising drug repurposing target, with strong evidence from multiple large cohort studies and meta-analyses demonstrating that its inhibition via existing small-molecule drugs is associated with a significantly lower risk of dementia and Alzheimer's Disease. [5, 9,

In [None]:
target_eval_df = pd.DataFrame.from_dict(target_eval, orient='index')
target_eval_df.index.name = 'target'
target_eval_df = target_eval_df.sort_values(by="novelty_score", ascending=False).reset_index(drop=True)

In [None]:
# Set display options to show the full content of the 'reasoning' column
pd.set_option('display.max_colwidth', None)

# Display the 'reasoning' column
display(target_eval_df)

Unnamed: 0,target,confidence_score,novelty_score,evidence_score,reasoning
0,SGLT2,0.8,0.9,0.7,"SGLT2 is a highly promising drug repurposing target, with strong evidence from multiple large cohort studies and meta-analyses demonstrating that its inhibition via existing small-molecule drugs is associated with a significantly lower risk of dementia and Alzheimer's Disease. [5, 9, 10, 12] Preclinical studies in animal models provide mechanistic support, showing SGLT2 inhibitors can reduce amyloid-beta deposition, tau pathology, and neuroinflammation. [1, 2, 11] While a recent randomized controlled trial in early AD did not meet its primary endpoint, the target's druggability is confirmed, and the wealth of supportive human observational data justifies the high confidence and novelty. [3, 6]"
1,CGAS,0.7,0.9,0.8,"CGAS is an emerging drug target with strong preclinical evidence. Its inhibition is supported by recent studies demonstrating that genetic deletion or administration of small-molecule inhibitors targeting the cGAS-STING pathway can ameliorate both amyloid-β and tau pathologies, reduce neuroinflammation, and rescue cognitive deficits in various mouse models of Alzheimer's Disease. [3, 6, 16] The target is novel and gaining significant attention, though the lack of clinical trial data and some conflicting preclinical results temper confidence. [4, 7]"
2,SEH,0.9,0.8,0.9,"Soluble Epoxide Hydrolase (sEH) is upregulated in Alzheimer's Disease (AD) patient brains and animal models. [1, 2, 3] Preclinical studies (2020-2025) robustly demonstrate that small-molecule sEH inhibitors cross the blood-brain barrier, reduce neuroinflammation, decrease amyloid and tau pathology, and reverse cognitive deficits in multiple AD rodent models, thus providing strong evidence and high confidence in this novel therapeutic target. [2, 10, 11, 18]"
3,HDAC,0.8,0.6,0.7,"HDACs are a well-validated target class with approved small-molecule drugs for other indications. Recent preclinical studies show that isoform-selective HDAC inhibitors, particularly for HDAC6 and the novel target HDAC11, can modulate core Alzheimer's pathologies including amyloid and tau accumulation, and neuroinflammation, leading to cognitive improvements in animal models. [1, 2, 5, 11, 16] While the general target class is not new, the focus on specific isoforms to improve efficacy and reduce side effects provides a novel therapeutic strategy, though clinical data for Alzheimer's disease is still limited."
4,GLP-1 RECEPTOR,0.8,0.6,0.9,"The GLP-1 receptor is a highly promising drug target for Alzheimer's Disease (AD), supported by robust preclinical evidence and compelling clinical data. While most current agonists are peptides, the development of oral small molecules is underway, confirming druggability. [14] Large-scale Phase 3 clinical trials with the GLP-1 agonist semaglutide in early AD are ongoing, with results expected in late 2025, representing a significant investment in this target. [12, 13] Evidence from real-world patient data indicates that treatment with GLP-1 receptor agonists is associated with a significantly reduced risk of AD. [11, 12]"
5,DYRK1A,0.9,0.6,0.9,"DYRK1A is a kinase strongly implicated in Alzheimer's Disease (AD) as it phosphorylates both amyloid precursor protein (APP) and tau, contributing to the formation of plaques and tangles. [1, 3, 8] Extensive preclinical evidence from animal models demonstrates that small-molecule inhibitors can reduce both core AD pathologies and reverse cognitive deficits. [3, 6] The target's druggability is high, with multiple small-molecule inhibitors developed and at least one, SM07883, having entered Phase 1 clinical trials, providing strong validation for its therapeutic potential. [10, 12]"
6,GLP1R,0.8,0.3,0.9,"GLP1R is a highly validated target supported by extensive preclinical evidence showing that its agonists reduce neuroinflammation, Aβ deposition, and tau hyperphosphorylation in Alzheimer's Disease (AD) models. [2, 6, 8] This is strongly corroborated by late-stage clinical trials, including pivotal Phase 3 studies for oral semaglutide and positive Phase 2b results for liraglutide in AD patients. [3, 10, 17] While current lead drugs are peptides, the demonstrated clinical potential and active development of oral formulations make it a highly promising target for small-molecule approaches. [14]"
7,BCHE,0.6,0.2,0.8,"BCHE (butyrylcholinesterase) is a clinically validated target, as the approved dual AChE/BCHE inhibitor rivastigmine is used for Alzheimer's Disease (AD) treatment. [2, 6] Recent preclinical studies (2022-2025) provide strong evidence, with novel selective BCHE inhibitors demonstrating significant cognitive improvements in transgenic AD mouse models and showing neuroprotective effects. [1, 4, 16, 17] The rationale is further strengthened by findings that BCHE levels increase in the AD brain and are associated with amyloid plaques, while BCHE knockout in mice reduces plaque pathology. [5, 16, 19]"
8,MAO-B,0.8,0.1,0.9,"MAO-B is a well-established target with a long history in neurodegeneration, resulting in a low novelty score. However, its role in Alzheimer's-related oxidative stress is well-supported, and it is highly druggable with small molecules. [2, 4, 14] Confidence and evidence are high due to extensive recent preclinical development of novel inhibitors and an ongoing Phase 2a clinical trial for KDS2010, a selective MAO-B inhibitor, in patients with early Alzheimer's disease. [5, 8, 9]"
9,ACHE,1.0,0.1,1.0,"ACHE is a clinically validated target with multiple FDA-approved small-molecule inhibitors (e.g., donepezil, rivastigmine, galantamine) that are standard symptomatic treatments for Alzheimer's Disease. [1, 4, 9, 13] Its druggability is therefore certain, but it is not a novel target. [11] Current research (2020-2025) focuses on developing multi-target-directed ligands that combine ACHE inhibition with other disease-modifying mechanisms, such as inhibiting amyloid aggregation or monoamine oxidase, to move beyond purely symptomatic relief. [5, 6, 10, 14]"


In [None]:
target_eval_df.to_csv("target_eval.csv", index=False)

In [None]:
target_eval_df["target"]

Unnamed: 0,target
0,SGLT2
1,CGAS
2,SEH
3,HDAC
4,GLP-1 RECEPTOR
5,DYRK1A
6,GLP1R
7,BCHE
8,MAO-B
9,ACHE
