In [85]:
import faiss
from langchain_ollama import ChatOllama,OllamaEmbeddings
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
import faiss
from langchain.agents.middleware import dynamic_prompt, ModelRequest,wrap_model_call,ModelResponse,wrap_tool_call
import xml.etree.ElementTree as ET
import pyttsx3
from langchain_core.output_parsers import PydanticOutputParser,StrOutputParser
from typing import List
from langchain.agents import create_agent
from langchain.agents.middleware import SummarizationMiddleware
from langgraph.checkpoint.memory import InMemorySaver
from langchain_core.runnables import RunnableConfig
import re
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import numpy as np
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate
from langchain_core.prompts import PromptTemplate
import spacy
import pickle
import requests
from langchain_core.documents import Document
from bs4 import BeautifulSoup
import re
from scispacy.linking import EntityLinker
import scispacy
import re
from langchain.messages import SystemMessage,HumanMessage,AIMessage,ToolMessage
from langchain.tools import tool
from langchain_community.docstore.in_memory import InMemoryDocstore
import pyobo
import time
from collections import defaultdict
from typing import List, Optional, Literal
import re
from typing import Optional


In [3]:
# spaCy pipeline: load the 'en_ner_jnlpba_md' model, switch off its tagger and
# parser components, then append the `sentencizer` component.
nlp = spacy.load("en_ner_jnlpba_md")
nlp.disable_pipes("tagger", "parser")
nlp.add_pipe("sentencizer")

# Entity linker obtained from pyobo for the "hgnc" resource.
# (The previously commented-out alternative requested "uniprot" with the same options.)
linker = pyobo.get_scispacy_entity_linker(
    "hgnc",
    filter_for_definitions=False,
    resolve_abbreviations=True,
)


In [4]:
def pubmed_query(
    include_topics: List[str],
    operator: Literal["AND", "OR"] = "AND",
    exclude_topics: Optional[List[str]] = None,
    publication_type: Optional[Literal["review", "article"]] = None,
    organism: Optional[str] = None,
    journal: Optional[str] = None,
    start_year: Optional[int] = None,
    end_year: Optional[int] = None,
    free_full_text: bool = True
) -> str:
    """
    Build a PubMed search string from structured filters.

    Args:
        include_topics: Phrases searched in [Title/Abstract]; each is quoted.
        operator: Boolean operator placed between the include topics
            ("AND" requires every topic, "OR" accepts any of them).
        exclude_topics: Phrases that must not appear in [Title/Abstract].
        publication_type: "review" or "article"; any other value adds no clause.
        organism: "human", "mouse" or "rat" map to their MeSH plural form;
            any other value is used verbatim as a quoted MeSH term.
        journal: Journal name, matched with the [Journal] tag.
        start_year: First publication year (inclusive). Without `end_year`
            the range is open-ended towards the future.
        end_year: Last publication year (inclusive). Without `start_year`
            the range is open-ended towards the past.
        free_full_text: When true, restrict to the "free full text" filter.

    Returns:
        The individual clauses joined with " AND ".

    Raises:
        ValueError: If `operator` is neither "AND" nor "OR".
    """
    joiner = str(operator).strip().upper()
    if joiner not in ("AND", "OR"):
        raise ValueError(f'operator must be "AND" or "OR", got {operator!r}')

    clauses = []

    # Automatic free full text
    if free_full_text:
        clauses.append("free full text[filter]")

    # Include topics, combined with the requested operator
    if include_topics:
        topic_clause = f" {joiner} ".join(
            f'"{t}"[Title/Abstract]' for t in include_topics
        )
        if len(include_topics) > 1:
            topic_clause = f'({topic_clause})'
        clauses.append(topic_clause)

    # Publication type (unknown values are skipped so no empty clause is emitted)
    if publication_type:
        pt_map = {"review": "review[Publication Type]", "article": "journal article[Publication Type]"}
        pt_clause = pt_map.get(publication_type.lower())
        if pt_clause:
            clauses.append(pt_clause)

    # Organism
    if organism:
        org_map = {"human": "humans[MeSH Terms]", "mouse": "mice[MeSH Terms]", "rat": "rats[MeSH Terms]"}
        clauses.append(org_map.get(organism.lower(), f'"{organism}"[MeSH Terms]'))

    # Journal
    if journal:
        clauses.append(f'"{journal}"[Journal]')

    # Date range: a missing bound is replaced by a sentinel year so that a lone
    # start_year means "from that year on" and a lone end_year "up to that year".
    if start_year or end_year:
        lower_bound = start_year or 1000
        upper_bound = end_year or 3000
        clauses.append(
            f'("{lower_bound}"[Date - Publication] : "{upper_bound}"[Date - Publication])'
        )

    # Exclude topics
    if exclude_topics:
        for t in exclude_topics:
            clauses.append(f'NOT "{t}"[Title/Abstract]')

    # Combine clauses with AND
    return " AND ".join(clauses)


In [5]:
# Either topic may match, so the topics are combined with OR. The operator is
# passed explicitly instead of relying on the function's default.
query = pubmed_query(
    include_topics=["AKT signaling", "wnt signalling"],
    operator="OR",
    publication_type="review",
    organism="Human",
    start_year=2024,
    end_year=2025,
)
query

'free full text[filter] AND ("AKT signaling"[Title/Abstract] OR "wnt signalling"[Title/Abstract]) AND review[Publication Type] AND humans[MeSH Terms] AND ("2024"[Date - Publication] : "2025"[Date - Publication])'

In [6]:
def fetch_ncbi_data(query, batch_size=10, timeout=30, pause=0.34):
    """
    Search PubMed for `query` and collect identifiers of the matching articles.

    The PubMed IDs returned by ESearch are fetched from EFetch in batches and
    the XML records are scanned for DOI ids, PMC ids and journal titles.

    Args:
        query: PubMed search string.
        batch_size: Number of PubMed IDs sent per EFetch request.
        timeout: Timeout in seconds applied to every HTTP request.
        pause: Seconds to wait between consecutive EFetch requests, to stay
            within NCBI's request-rate guidance; 0 disables the wait.

    Returns:
        A tuple `(dois, journal, pmcids)`: DOI links, journal titles and PMC
        article links. The lists are filled independently of each other, so
        entries at the same index need not belong to the same article.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        requests.HTTPError: If NCBI answers with an HTTP error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    base_url_esearch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    base_url_efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    base_doi = 'https://doi.org/'
    base_pmc = 'https://www.ncbi.nlm.nih.gov/pmc/articles/'

    # NOTE(review): 'email' is still the placeholder address; replace it with a real contact.
    params_esearch = {"db": "pubmed", "term": query, "retmode": "json", "retstart": 0, "retmax": 10000,
                      'email': 'your_email@example.com', 'datetype': 'pdat'}
    response_esearch = requests.get(base_url_esearch, params=params_esearch, timeout=timeout)
    response_esearch.raise_for_status()
    pubmed_ids = response_esearch.json()['esearchresult']['idlist']

    dois = []
    journal = []
    pmcids = []
    for offset in range(0, len(pubmed_ids), batch_size):
        if offset and pause:
            time.sleep(pause)
        batch = pubmed_ids[offset:offset + batch_size]
        param_efetch = {'retmode': 'xml', "db": "pubmed", "id": ','.join(batch), "retstart": 0, "retmax": 1000}
        response_efetch = requests.get(base_url_efetch, params=param_efetch, timeout=timeout)
        response_efetch.raise_for_status()
        # Parse the raw bytes so the XML parser applies the encoding declared
        # in the document rather than a guessed text encoding.
        tree = ET.fromstring(response_efetch.content)

        for article_id in tree.findall('PubmedArticle/PubmedData/ArticleIdList/ArticleId'):
            if not article_id.text:
                continue  # an empty element cannot be turned into a link
            id_type = article_id.attrib.get('IdType')
            if id_type == 'doi':
                dois.append(base_doi + article_id.text)
            elif id_type == 'pmc':
                pmcids.append(base_pmc + article_id.text)

        for title in tree.findall('PubmedArticle/MedlineCitation/Article/Journal/Title'):
            journal.append(title.text)

    return dois, journal, pmcids
            

In [7]:
# Run the search and collect DOI links, journal titles and PMC article links.
# NOTE: fetch_ncbi_data fills the three lists independently, so entries at the
# same index need not belong to the same article.
doi,jornal,pmcid=fetch_ncbi_data(query)

In [8]:
def load_doi_page_raw(url: str):
    """Load `url` through WebBaseLoader with a browser-style User-Agent and return the first document."""
    browser_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
    page_loader = WebBaseLoader(
        web_paths=[url],
        header_template={"User-Agent": browser_agent},
    )
    return page_loader.load()[0]

def extract_main_article_text(html_text: str) -> str:
    """
    Strip non-content markup from an HTML page and return normalised text.

    Args:
        html_text: Markup to parse (handed to BeautifulSoup with the "lxml" parser).

    Returns:
        The remaining text with runs of spaces/tabs collapsed to one space,
        spaces around line breaks removed, at most one blank line between
        paragraphs, and no leading or trailing whitespace.
    """
    soup = BeautifulSoup(html_text, "lxml")

    # Remove elements that carry scripts, styling, navigation or form controls
    for tag in soup([
        "script", "style", "noscript", "svg",
        "header", "footer", "nav", "aside",
        "form", "button"
    ]):
        tag.decompose()

    text = soup.get_text(separator="\n")

    # Normalize whitespace. Horizontal whitespace is handled first: otherwise
    # lines that contain only spaces or tabs keep the newlines apart and the
    # blank-line collapse below never sees them as consecutive.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()

def build_clean_document(raw_doc: Document) -> Document:
    """Return a new Document holding the cleaned text of `raw_doc` plus two provenance flags in its metadata."""
    cleaned_body = extract_main_article_text(raw_doc.page_content)

    # Copy the original metadata, then record how this document was produced.
    enriched_metadata = dict(raw_doc.metadata)
    enriched_metadata["source_type"] = "doi_webpage"
    enriched_metadata["cleaned"] = True

    return Document(page_content=cleaned_body, metadata=enriched_metadata)
def load_full_article_from_doi(url: str) -> Document:
    """Download the page at `url` and return it as a cleaned Document."""
    return build_clean_document(load_doi_page_raw(url))


In [9]:
# Browser-style request headers sent with every page request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}

# Download every PMC article page and keep the ones whose text is longer than
# 2000 characters. A failure for one URL is printed and the loop moves on.
all_docs = []
for pmc_url in pmcid:
    try:
        doc = WebBaseLoader(
            pmc_url,
            continue_on_failure=True,
            requests_kwargs={'allow_redirects': True, "headers": headers},
        ).load()[0]
        if len(doc.page_content) > 2000:
            all_docs.append(doc)
    except Exception as e:
        print(e)


# Inert string literal (never executed): a disabled variant of the loop above
# that loads pages through their DOI links instead. As the last expression of
# the cell its text is echoed as the cell output.
"""headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
all_docs=[]
for i in tqdm(doi): 
    try:
        doc=load_full_article_from_doi(i)
        if len(doc.page_content)>1000:
            all_docs.append(doc)
    except Exception as e:
        print(e)
        continue"""



"headers = {\n    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',\n}\nall_docs=[]\nfor i in tqdm(doi): \n    try:\n        doc=load_full_article_from_doi(i)\n        if len(doc.page_content)>1000:\n            all_docs.append(doc)\n    except Exception as e:\n        print(e)\n        continue"

In [10]:
def extract_main_content(text: str) -> Optional[str]:
    """
    Cut the main body out of an article's plain text.

    The body starts at the first "introduction" heading word (or, when there
    is none, the first "abstract") and ends right before the last
    "acknowledgment(s)" / "acknowledgement(s)" (or, when there is none, the
    last "reference(s)"). Matching is case-insensitive and on whole words.

    Args:
        text: Plain text of the article page.

    Returns:
        The stripped span, or None when a start or end marker is missing, no
        end marker follows the start marker, or the span is 300 characters or
        shorter.
    """
    abstract_pat = r'\babstract\b'
    intro_pat = r'\bintroduction\b'
    # The optional parts are spelled out explicitly: a trailing "*" would only
    # repeat the last letter and never match the plural section headings.
    ref_pat = r'\breferences?\b'
    ack_pat = r'\backnowledge?ments?\b'

    def _positions(primary: str, fallback: str) -> List[int]:
        """Start offsets of `primary` matches, or of `fallback` matches if there are none."""
        found = [m.start() for m in re.finditer(primary, text, re.IGNORECASE)]
        if not found:
            found = [m.start() for m in re.finditer(fallback, text, re.IGNORECASE)]
        return found

    starts = _positions(intro_pat, abstract_pat)
    ends = _positions(ack_pat, ref_pat)
    if not starts or not ends:
        return None

    # The longest start/end combination is always "first start .. last end",
    # so there is no need to enumerate every pair.
    start, end = starts[0], ends[-1]
    if end <= start:
        return None

    span = text[start:end].strip()
    return span if len(span) > 300 else None


In [11]:
# Replace every linked entity mention with "<canonical name> <entity label>".
# Each result is stored in a new Document, so `all_docs` stays untouched and
# re-running this cell gives the same output.
cleaned_docs = []
for doc in all_docs:
    text = extract_main_content(doc.page_content)
    if not text or len(text) <= 2000:
        continue

    linked = linker(nlp(text))
    pieces = []
    cursor = 0
    for ent in linked.ents:
        kb_hits = ent._.kb_ents
        if not kb_hits:
            continue  # no knowledge-base candidate: keep the mention as written
        # The first candidate's identifier is looked up in the linker's knowledge base.
        canonical = linker.kb.cui_to_entity[kb_hits[0][0]].canonical_name
        pieces.append(text[cursor:ent.start_char])
        pieces.append(canonical + ' ' + ent.label_)
        cursor = ent.end_char
    pieces.append(text[cursor:])

    cleaned_docs.append(
        Document(page_content="".join(pieces), metadata=dict(doc.metadata))
    )



In [None]:
# Split the cleaned articles into overlapping chunks, measured in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=256,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)
chunks = splitter.split_documents(cleaned_docs)

In [23]:
embeddings_model = OllamaEmbeddings(
    model="qwen3-embedding:0.6b",
    validate_model_on_init=True,
)

# Embed a throwaway query once; the length of the result is the vector size
# used to create the flat L2 index.
embedding_dim = len(embeddings_model.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Empty store: documents are added in a later cell.
vector_store = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)


In [24]:
# Add every chunk to the vector store; the call's return value is kept as `ids`.
ids = vector_store.add_documents(documents=chunks)

In [25]:
# Persist the populated store to disk so a later session can reload it.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
vector_store.save_local("/home/bionik/AI_ML/FDA_Project/LLM/faiss_index")

In [2]:
# Fresh-session entry point: rebuild the embedding client and reload the
# index that was saved to disk.
embeddings_model = OllamaEmbeddings(
    model="qwen3-embedding:0.6b",
    validate_model_on_init=True,
)

# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# serialized Python objects from the index directory. Only use it on index
# folders created by this notebook, never on files from an untrusted source.
vector_store = FAISS.load_local(
    "/home/bionik/AI_ML/FDA_Project/LLM/faiss_index",
    embeddings_model,
    allow_dangerous_deserialization=True,
)

In [38]:
# Chat model used by the middleware for question classification and query
# rewriting, and passed to the agent and its summarization middleware.
LLM_model = ChatOllama(num_ctx=10000,model="qwen3:4b-instruct-2507-q4_K_M",validate_model_on_init=True)


In [98]:
# -------------------------------
# 3️⃣ Retrieval Middleware
# -------------------------------
@dynamic_prompt
def retrieve_context_middleware(request: ModelRequest) -> str:
    """
    Classify the latest user question, rewrite it using the conversation
    history, overwrite the last message with the rewrite, and retrieve
    context for the rewritten query.

    Values written to `request.state`:
        qtype: "FACTUAL" or "REASONING".
        original_query: The question as typed by the user.
        rewritten_query: The retrieval-oriented rewrite.
        retrieved_context: The retrieved chunks, formatted as numbered SOURCE blocks.

    Returns:
        An empty string; this function is used for its side effects.
        NOTE(review): under @dynamic_prompt the return value is presumably
        installed as the system prompt — confirm that a later middleware
        replaces it.
    """

    # 1. Extract the original query and the prior human/AI turns
    messages = request.state["messages"]
    original_query = messages[-1].content.strip()

    history = "\n".join(
        m.content for m in messages[:-1]
        if m.type in {"human", "ai"}
    )

    # 2. Classify the question (decides the rewrite strategy)
    classify_prompt = f"""
Classify the following life sciences or healthcare research question.

Return ONLY one word:

FACTUAL → asks for established biological knowledge, known mechanisms,
           experimentally observed effects, reported associations,
           pathway descriptions, molecular functions, or clinical findings.

REASONING → asks for interpretation, causal explanation, prediction,
            intervention outcomes, perturbation effects, mechanistic inference,
            hypothesis generation, or answers requiring extrapolation beyond
            directly reported evidence.

Question:
"{original_query}"
"""

    # The model may decorate its answer ("REASONING.", "**FACTUAL**", ...), so
    # take the first label word found and fall back to FACTUAL otherwise.
    raw_label = LLM_model.invoke(classify_prompt).content
    label_match = re.search(r"\b(FACTUAL|REASONING)\b", raw_label.upper())
    qtype_for_rewrite = label_match.group(1) if label_match else "FACTUAL"
    request.state["qtype"] = qtype_for_rewrite

    # 3. Rewrite the query with a direct model call (not a tool)
    if qtype_for_rewrite == "REASONING":
        rewrite_prompt = f"""
Rewrite the following biomedical question to optimize literature retrieval
for mechanistic reasoning and hypothesis generation.

Rules (STRICT):
- Preserve the scientific intent of the user’s question.
- Treat the question as implicitly referring to the biological context
  established in the conversation history.
- If the question is generic (e.g., "why", "how", "what happens"),
  reinterpret it as applying specifically to the entities, mechanisms,
  or biological systems discussed previously.
- Anchor the rewritten query to the SAME biological focus as the history.
- Do NOT introduce new example entities to illustrate the answer.
- Do NOT broaden to unrelated genes, proteins, pathways, or diseases.
- If generalization is required, restrict it to:
  the same molecular role, functional class, or mechanistic process
  already implied by the context — without naming new entities.
- Prefer standardized biomedical terminology.
- Expand abbreviations where appropriate.
- Do NOT include speculative language in the query itself.
- Return ONLY the rewritten query.

Conversation history:
{history}

Original question:
{original_query}

"""
    else:  # FACTUAL
        rewrite_prompt = f"""
Rewrite the following biomedical question for precise literature retrieval.

Rules (STRICT):
- Preserve the original meaning exactly.
- Interpret the question in the context of the prior conversation.
- Retain and emphasize the specific biological entities, mechanisms,
  or systems already discussed.
- Do NOT generalize beyond what is explicitly or contextually specified.
- Do NOT introduce new entities.
- Prefer standardized biomedical terminology.
- Expand abbreviations if useful.
- If already optimal, return unchanged.
- Return ONLY the rewritten query.

Conversation history:
{history}

Original question:
{original_query}

"""

    # An empty rewrite would make retrieval meaningless; keep the user's wording then.
    rewritten_query = LLM_model.invoke(rewrite_prompt).content.strip() or original_query

    # 4. Overwrite the human message (critical: later steps read the rewrite)
    messages[-1].content = rewritten_query

    # Save both for transparency / debugging
    request.state["original_query"] = original_query
    request.state["rewritten_query"] = rewritten_query

    # 5. Retrieve using the rewritten query
    docs = vector_store.max_marginal_relevance_search(rewritten_query,k=15,fetch_k=30)

    sources = "\n\n".join(
        f"[SOURCE {i+1} research paper: {d.metadata.get('source','UNKNOWN')}]\n{d.page_content}"
        for i, d in enumerate(docs)
    )

    # 6. Store the retrieved context for the prompt-building step
    request.state["retrieved_context"] = sources

    return ""


@wrap_model_call
def dynamic_model_selection(request: ModelRequest, handler) -> ModelResponse:
    """
    Swap in the answering model that matches the stored question type.

    Reads `qtype` from the request state (treated as "FACTUAL" when absent):
    "REASONING" selects the thinking model, anything else the instruct model.
    The chosen model is announced on stdout before the request is forwarded.
    """
    stored_type = request.state.get("qtype", "FACTUAL")

    if stored_type != "REASONING":
        print("using FACTUAL LLM for answering")
        answering_model = ChatOllama(
            name="fact",
            model="qwen3:4b-instruct-2507-q4_K_M",
            num_ctx=10000,
        )
    else:
        print("using REASONING LLM for answering")
        answering_model = ChatOllama(
            name="reasoning",
            model="qwen3:4b-thinking-2507-q4_K_M",
            num_ctx=10000,
            reasoning=True,
        )

    return handler(request.override(model=answering_model))

# -------------------------------
# 4️⃣ Dynamic System Prompt
# -------------------------------
@dynamic_prompt
def rag_system_prompt(request: ModelRequest) -> str:
    """
    Build the system prompt for the answering model.

    The latest message is classified as FACTUAL or REASONING, the result is
    stored in `request.state["qtype"]`, and the prompt is assembled from the
    question, the prior human/AI turns, the retrieved context found in
    `request.state["retrieved_context"]` (or a "no sources" marker) and the
    instructions for the chosen mode.

    Returns:
        The complete system prompt text.
    """
    messages = request.state["messages"]
    user_question = messages[-1].content.strip()
    context = request.state.get("retrieved_context", "NO_RELEVANT_SOURCES_FOUND")
    history = "\n".join(
        m.content for m in messages[:-1] if m.type in {"human", "ai"}
    )

    # -----------------------
    # Query classification
    # -----------------------
    classify_prompt = f"""
Classify the following biomedical question.

Return ONLY one word:
FACTUAL  -> asks for known mechanisms or established facts
REASONING -> asks about consequences, interventions, or what-if scenarios

Question:
"{user_question}"
"""
    # Reduce the reply to exactly one of the two labels. Unexpected replies
    # become FACTUAL, which is also the fallback used when choosing the
    # answering model, so the instructions and the model stay in agreement.
    raw_label = LLM_model.invoke(classify_prompt).content
    label_match = re.search(r"\b(FACTUAL|REASONING)\b", raw_label.upper())
    qtype = label_match.group(1) if label_match else "FACTUAL"
    request.state["qtype"] = qtype

    # -----------------------
    # Mode-specific instructions
    # -----------------------
    if qtype == "FACTUAL":
        mode_instruction = """
FACTUAL MODE (LIFE SCIENCES / HEALTHCARE):

Objective:
Provide a complete, authoritative, evidence-based explanation grounded in
published biomedical literature.

Rules:
- Use retrieved SOURCES as the primary and preferred evidence.
- Describe mechanisms, pathways, molecular interactions, phenotypic effects,
  experimental results, or clinical observations exactly as reported.
- Cover relevant biological detail thoroughly when supported by SOURCES.

General biomedical knowledge:
- May be used ONLY when necessary to complete an explanation.
- Must be explicitly labeled as:
  "General biomedical knowledge (not from retrieved sources):"
- Must be textbook-level, widely accepted, and non-speculative.
- Must not contradict retrieved SOURCES.

Restrictions:
- Do NOT introduce hypotheses, predictions, or inferred outcomes.
- Do NOT generalize beyond what the evidence supports.
- Do NOT add commentary about evidence sufficiency unless explicitly asked.

Citations:
- Every factual claim must be cited.
- Use inline citations as [SOURCE X] with DOI or URL.

"""
    else:  # REASONING
        mode_instruction = """
REASONING MODE (BIOLOGICAL INFERENCE & HYPOTHESIS GENERATION):

Objective:
Generate biologically plausible hypotheses and mechanistic explanations by
connecting patterns across existing biomedical evidence.

Evidence handling:
- Ground all known facts in retrieved SOURCES and cite them as [SOURCE X].
- When direct evidence for the exact query is limited or absent, you MAY:
  - Identify mechanistically analogous, functionally related, or biologically
    comparable evidence present in the SOURCES.
  - Reason across shared biological roles, conserved mechanisms, or similar
    perturbation effects implied by the literature.

Hypothesis construction:
- Hypotheses must be logically derived from cited evidence or established biology.
- Clearly label all inferred or predictive statements as:
  "Hypothesis:"
- Hypotheses must remain biologically consistent and non-contradictory.
- Do NOT present hypotheses as established facts.

General biomedical knowledge:
- May be used to support reasoning when necessary.
- Must be explicitly labeled as:
  "General biomedical knowledge (not from retrieved sources):"

Restrictions:
- Do NOT invent data, mechanisms, or citations.
- Do NOT extrapolate beyond biologically reasonable inference.
- Do NOT introduce unrelated biological domains.

Citations:
- Cite all evidence-based statements and reasoning anchors as [SOURCE X]
  with DOI or URL.

"""

    # -----------------------
    # Final system prompt
    # -----------------------
    return f"""
You are a biomedical research assistant.

Global rules:
- Prefer retrieved SOURCES over general knowledge.
- Never invent citations.
- If evidence is insufficient, say so explicitly.
- Provide links to all SOURCES at the end of the answer.

Question type: {qtype}

User question:
{user_question}

Conversation history:
{history}

Retrieved SOURCES:
{context}

{mode_instruction}

Produce a clear, structured, citation-grounded answer.
"""



In [94]:
# In-memory checkpointer handed to the agent below.
checkpointer = InMemorySaver()

# Tool-less agent whose behaviour comes entirely from the middleware list.
# NOTE(review): rag_system_prompt and dynamic_model_selection read state keys
# that retrieve_context_middleware writes, so the list order matters —
# presumably middleware runs in the order listed; confirm against the
# LangChain middleware documentation.
agent = create_agent(
    model=LLM_model,
    tools=[],
    checkpointer=checkpointer,
    middleware=[
        retrieve_context_middleware,  # classify + rewrite the question, retrieve context
        rag_system_prompt,  # build the system prompt from the retrieved context
        dynamic_model_selection,  # pick the answering model from the stored question type
        SummarizationMiddleware(
            model=LLM_model,
            trigger=("tokens", 7000),
            keep=("messages", 10),
        ),
    ],
)




In [95]:
# `config` carries the thread id "EGFR"; the cells below pass the same config
# to continue this conversation.
config = {"configurable": {"thread_id": "EGFR"}}

# First turn of the conversation.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "How EGFR get signaling activated?"}]},
    config,
)

# Show only the text of the final message.
print(response["messages"][-1].content)

using FACTUAL LLM for answering
Epidermal Growth Factor Receptor (EGFR) signaling is activated through ligand binding and subsequent receptor dimerization, which induces conformational changes and initiates intracellular phosphorylation cascades.

EGFR, a member of the EGFR tyrosine kinase family (also known as the ERBB family), is activated when extracellular ligands—such as epidermal growth factor (EGF), transforming growth factor-alpha (TGF-α), or other members of the EGF family—bind to the extracellular domain of the receptor [SOURCE 7]. This binding induces a conformational change in the receptor that promotes dimerization, either as a homodimer (EGFR–EGFR) or a heterodimer with another member of the ERBB family, such as HER2 (ERBB2) [SOURCE 7].

This dimerization brings the intracellular tyrosine kinase domains of the receptors into close proximity, enabling trans-autophosphorylation of specific tyrosine residues within the intracellular domain of the receptor [SOURCE 7]. This ph

In [96]:
# Follow-up turn sent with the same `config` as the previous question.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "What happens if we added RAS inhibitor to cancer cells?"}]},
    config,
)

# Show only the text of the final message.
print(response["messages"][-1].content)

using FACTUAL LLM for answering
The effect of adding a RAS inhibitor to cancer cells that express activated EGFR signaling—specifically involving ligand-induced receptor dimerization, tyrosine kinase autophosphorylation, and downstream activation of the MAPK and PI3K/AKT pathways—is not directly supported by the retrieved sources.

While the retrieved literature confirms that EGFR activation leads to downstream signaling through the MAPK and PI3K/AKT pathways [SOURCE 2, SOURCE 7], and that RAS is a key component of the MAPK pathway (as part of the Ras/Raf/MEK/ERK cascade) [SOURCE 7, SOURCE 9], there is no evidence in the provided sources that RAS inhibitors exert a direct or significant effect on EGFR-activated cancer cells.

Specifically:
- The sources indicate that activated EGFR signaling can synergistically enhance PIK3CA signaling, and that downstream PI3K/AKT activation occurs independently of canonical membrane-bound AKT activation in some contexts [SOURCE 4, SOURCE 13].
- There

In [79]:
# Follow-up turn sent with the same `config`.
# NOTE(review): the question text is passed to the agent verbatim, including its spelling.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "What are diffferent mutations associated with KRAS which are invollved in cancer progression?"}]},
    config,
)

# Show only the text of the final message.
print(response["messages"][-1].content)

using FACTUAL LLM for answering
The different KRAS mutations associated with cancer progression are primarily well-characterized in pancreatic, colorectal, and lung cancers. Based on the retrieved sources and established biomedical literature, the most common and clinically relevant KRAS mutations are:

- **KRAS G12D**  
- **KRAS G12V**  
- **KRAS G12C**  
- **KRAS G13D**  
- **KRAS Q61H**  

These mutations occur in the guanylate exchange factor (GEF) domain or the switch regions of the KRAS protein, disrupting its normal GTPase activity and leading to constitutive activation of downstream signaling pathways such as MAPK/ERK and PI3K/AKT. This persistent signaling drives uncontrolled cell proliferation, survival, and tumor progression.

Among these, **KRAS G12D**, **KRAS G12V**, and **KRAS G12C** are the most frequently reported mutations in human cancers:

- **KRAS G12D** is the most common mutation in colorectal cancer (~40% of cases) and is associated with poor prognosis and resist

In [97]:
# Follow-up turn sent with the same `config`. The question names no protein,
# so the query rewriter has to resolve "those specific sites" from the history.
# NOTE(review): the question text is passed to the agent verbatim, including its spelling.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "why mutation happen only at those speific sites of protein?"}]},
    config,
)

# Show only the text of the final message.
print(response["messages"][-1].content)

using REASONING LLM for answering
Based on the retrieved sources and biomedical evidence, mutations do not occur uniformly across protein sequences but are enriched at specific sites due to **functional constraints, structural vulnerabilities, and selective pressures in disease contexts**. Below is a structured, evidence-grounded explanation.

---

### Key Mechanisms for Non-Uniform Mutation Distribution
1. **Functional Importance of Critical Domains**  
   Mutations cluster in regions directly involved in protein function (e.g., active sites, binding interfaces), where a single amino acid change can disrupt activity. For example:  
   - In EGFR, kinase domain mutations (e.g., **E542K** and **E545K**) are frequent hotspots because they impair ATP binding and Ras-dependent signaling pathways [SOURCE 1]. These mutations specifically disrupt interactions with adaptor proteins like p110α helical domain and p85α, making them functionally consequential.  
   - Similarly, *TP53* (p53) mutatio

In [87]:
# Follow-up turn sent with the same `config`. "this mutation" is only
# resolvable through the history, so the answer depends on the query rewrite.
response = agent.invoke(
    {"messages": [{"role": "user", "content": "why this mutation happen at only 12th amino acid of protein and it is mutating to Aspartic Acid, valine, Cysteine only?"}]},
    config,
)

# Show only the text of the final message.
print(response["messages"][-1].content)

using REASONING LLM for answering
# Why do activating mutations in the EGFR kinase domain specifically occur at certain amino acid positions within exons 19 and 20, and why are the substitutions limited to specific amino acids such as leucine, threonine, or methionine, rather than aspartic acid, valine, or cysteine?

After carefully reviewing the retrieved sources and applying biomedical knowledge, I can provide a clear answer to this question.

## Important clarification: Context of EGFR mutations

First, it's critical to understand that the most clinically significant EGFR mutations occur in **lung cancer**, specifically in the **tyrosine kinase domain** (exons 19 and 20), not at the "12th codon" as previously misunderstood in the question. These mutations are:

1. **Exon 19 deletions** (e.g., delE746-A750)
2. **L858R** (leucine to arginine at position 858)
3. **T790M** (threonine to methionine at position 790)

These mutations are activating mutations that make the kinase domain act