In [None]:
import faiss
from langchain_ollama import ChatOllama,OllamaEmbeddings
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss
from langchain_core.output_parsers import PydanticOutputParser,StrOutputParser
from typing import List
import re
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import numpy as np
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
import spacy
import requests
from scispacy.linking import EntityLinker
import scispacy
import re
from langchain.tools import tool
from langchain_community.docstore.in_memory import InMemoryDocstore
import pyobo
import time
from collections import defaultdict


In [None]:
# Load the JNLPBA biomedical NER model, switch off the tagger and parser
# components, and append a sentencizer to the pipeline.
nlp = spacy.load('en_ner_jnlpba_md')
nlp.disable_pipes("tagger", "parser")
nlp.add_pipe("sentencizer")

# Entity linker built by pyobo for the "hgnc" vocabulary. (This notebook
# previously carried a commented-out variant of the same call that used
# "uniprot" as the vocabulary.)
linker_options = dict(filter_for_definitions=False, resolve_abbreviations=True)
linker = pyobo.get_scispacy_entity_linker("hgnc", **linker_options)


In [None]:
# Query NCBI ESearch for PubMed IDs of review articles that mention
# "signaling pathway" in the title/abstract and relate to human disease.
ncbi_base_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
query='"signaling pathway"[Title/Abstract] AND review[Publication Type] AND (humans[MeSH Terms] AND ("disease"[MeSH Terms] OR disease[Title/Abstract]))'
params_id = {"db": "pubmed","term": query,"retmode": "json","retstart": 0,"retmax": 10000}

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP failures here rather than as a confusing
# KeyError/JSON error further down.
response = requests.get(ncbi_base_url, params=params_id, timeout=30)
response.raise_for_status()

esearch_result = response.json().get('esearchresult', {})
if 'idlist' not in esearch_result:
    # ESearch answered, but not with an ID list; show what it did return.
    raise RuntimeError(f"ESearch returned no 'idlist': {esearch_result}")

# List of PubMed IDs (strings) matching the query.
pubmed_id = esearch_result['idlist']


In [None]:
# Map each PubMed ID to the numeric PMC ID of the same article, using NCBI ELink.
# PubMed IDs without a PMC full-text record are simply left out of the mapping.
pub_2_pmc={}
pubmed_2_PMC_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
for pmid in pubmed_id:
    params_PMC = {"dbfrom": "pubmed","db": "pmc","id": pmid,"retmode": "json"}
    response = requests.get(pubmed_2_PMC_url, params=params_PMC, timeout=30)
    response.raise_for_status()

    linksets = response.json().get('linksets') or []
    linksetdbs = linksets[0].get('linksetdbs', []) if linksets else []
    for linksetdb in linksetdbs:
        # ELink can return several link groups for pubmed -> pmc (for example
        # "pubmed_pmc_refs", the PMC articles that cite this one). Only the
        # "pubmed_pmc" group points at the article itself, so select it by
        # name instead of trusting whichever group happens to come first.
        if linksetdb.get('linkname') == 'pubmed_pmc' and linksetdb.get('links'):
            pub_2_pmc[pmid] = int(linksetdb['links'][0])
            break

    # NCBI asks clients without an API key to stay at or below 3 requests/second.
    time.sleep(0.34)

In [None]:
# Download the PMC article page for every PubMed ID that was mapped to a PMC ID.
# `time` is already imported in the notebook's import cell.
all_docs = []
for pmid in pubmed_id:
    if pmid not in pub_2_pmc:
        continue
    article_url = f'https://pmc.ncbi.nlm.nih.gov/articles/{pub_2_pmc[pmid]}/'
    all_docs += WebBaseLoader(article_url).load()
    time.sleep(0.05)  # brief pause between page fetches


In [None]:
def clean_article_text(page_text: str) -> str:
    """Reduce the raw text of a PMC article page to citation-free body text.

    Steps, in order:
      1. Collapse every whitespace run (including newlines) to one space.
      2. Keep the segment that follows the first "Abstract" marker and
         precedes the first "Acknowledgements" marker. If "Abstract" is
         missing the text is kept from the start instead of raising.
      3. Strip boilerplate ("Keywords", "Open in a new tab"), figure
         call-outs, bracketed/parenthesised citations, and "www." hosts.

    Args:
        page_text: Raw page text as returned by the document loader.

    Returns:
        The cleaned text.
    """
    text = re.sub(r"\s+", " ", page_text)

    # NOTE(review): [1] is the text between the first and second "Abstract";
    # anything after a second occurrence is discarded. Behaviour kept as-is.
    abstract_parts = re.split(r"Abstract", text)
    if len(abstract_parts) > 1:
        text = abstract_parts[1]
    text = re.split(r"Acknowledgements", text)[0]

    # NOTE(review): the leading "." is unescaped, so it also swallows whichever
    # character precedes "Keyword"; "\n?" can never match after step 1.
    text = re.sub(r'.Keywords*\n?', ' ', text)
    text = re.sub('Open in a new tab', '', text)
    # (A former "[\r\n]+ -> \n" substitution was removed here: no line breaks
    # survive the whitespace collapse above, so it could never match.)

    # Figure call-outs such as "(Figure 2)" or "Fig. 3.1".
    text = re.sub(r"\s*\(?\s*(?:Figure|Fig\.?)\s*[\d.]+\s*\)?\s*", '', text, flags=re.IGNORECASE).strip()
    # Bracketed numeric citations such as "[12, 13]".
    # NOTE(review): the trailing "." is unescaped and removes one extra
    # character after the closing bracket — confirm this is intended.
    text = re.sub(r'\[[\d\s\W]*?\].', '', text, flags=re.IGNORECASE)
    # Parenthesised "... et al. ..." citations.
    text = re.sub(r"\(([^)]*?\s*et\s*al\.\s*[^)]*?)\)", '', text)
    # Parenthesised numeric citations directly before a full stop: "(1,2)." -> "."
    text = re.sub(r'\(\d+(?:,\d+|-?\d+)*\)\.', '.', text)
    # Parenthesised "Name 2020, 12-15" style citations.
    text = re.sub(r'\([\w\s]+\d{4},?\s*[\d–-]*\)', '', text)
    text = re.sub(r"\([rR]eviewed in\)", '', text, flags=re.IGNORECASE)
    # Host part of "www." URLs, up to and including the first slash.
    text = re.sub(r"www\.[^/]+/", ' ', text)
    return text


# Load one PMC article and derive its reference section, title and cleaned body.
doc = WebBaseLoader('https://pmc.ncbi.nlm.nih.gov/articles/PMC11554381/').load()
raw_page_text = doc[0].page_content

# Text following the first "References" marker; empty when the marker is absent
# (previously an IndexError).
reference_parts = raw_page_text.split('References')
references = reference_parts[1] if len(reference_parts) > 1 else ''

# Falls back to an empty title when the loader recorded none.
title = doc[0].metadata.get('title', '').strip()

cleaning = clean_article_text(raw_page_text)


In [None]:
# Run the NER pipeline over the cleaned article text, then pass the parsed
# document through the entity linker.
ner_doc = nlp(cleaning)
cleaned_nlp = linker(ner_doc)

In [None]:
# Rewrite every linked entity mention as "<canonical name> <entity label>".
# Entities are visited last-to-first so that the character offsets of the
# mentions still to be processed remain valid after each replacement.
new_text = cleaning
for ent in reversed(cleaned_nlp.ents):
    candidates = ent._.kb_ents
    if not candidates:
        continue  # mention has no knowledge-base match; leave it untouched
    top_cui = candidates[0][0]
    canonical = linker.kb.cui_to_entity[top_cui].canonical_name
    replacement = canonical + ' ' + ent.label_
    new_text = new_text[:ent.start_char] + replacement + new_text[ent.end_char:]

# Store the annotated text back on the loaded document.
doc[0].page_content = new_text

In [None]:
# Cut the annotated article into 2000-character chunks that overlap by 500.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=500,
    chunk_size=2000,
)
chunks = splitter.split_documents(doc)

In [None]:
# Embed the chunks with a local Ollama embedding model and index them in FAISS.
embeddings_model = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = FAISS.from_documents(chunks, embeddings_model)

# Similarity retriever over the index with the search settings below.
retriever_search_kwargs = {"k": 4, "fetch_k": 10}
retriever = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs=retriever_search_kwargs,
)

In [None]:
# Schema for one extracted (subject, predicate, object) relation.
# Documented with comments rather than a docstring on purpose: presumably the
# class docstring and Field descriptions are forwarded to the model as part of
# the structured-output schema — TODO confirm before adding a docstring.
class Triplet(BaseModel):
    # Left-hand entity of the relation.
    subject: str = Field(..., description="ONE protein or ONE protein complex")
    # Relation linking subject and object.
    predicate: str = Field(..., description="Relationship between protein entities")
    # Right-hand entity of the relation (field name mirrors the builtin `object`).
    object: str = Field(..., description="ONE protein or ONE protein complex")

# Container schema: the full set of triplets extracted from one piece of text.
# This is the type handed to `with_structured_output` when the LLM is built.
class TripletList(BaseModel):
    # Every relation found; each item follows the Triplet schema.
    triplets: List[Triplet]

In [None]:
# Prompt for protein-relation triple extraction.
# `{context}` is the only placeholder in the template; the chain built later in
# this notebook also supplies a `question` key, which this template does not use.
template = """
You are an expert information extraction assistant. 
Your task is to extract knowledge triples in the format of (subject, predicate, object) from the provided text. A subject and object are protein entities, and the predicate describes the relationship between them.

**Instructions:**
1.  Identify all relevant subject-predicate-object triples that represent facts stated explicitly in the text.
2.  Focus on extracting facts accurately and completely.
3.  Do not hallucinate or add information that is not present in the source text.
4.  If a sentence contains multiple facts, extract each fact as a separate triple.
5.  Predicate values should be only from ['Activates','Inhibits','Reacts','Form Complex']
6.  Subject and object should be only proteins.

Normalize synonyms:
binds/interacts/phosphorylate → Reacts
forms a complex → Form Complex
suppresses/destabilizes/activates degradation → Inhibits
induces/promotes/activates/stabilizes/inhibits degradation → Activates

Retrieve triplets from this text below:
{context}

"""
# Wrap the raw template string as a chat prompt.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Local Ollama chat model (temperature 0.0, 12000-token context window) whose
# output is constrained to the TripletList schema.
chat_model = ChatOllama(
    model='qwen3:4b-instruct-2507-q4_K_M',
    temperature=0.0,
    validate_model_on_init=True,
    num_ctx=12000,
)
llm = chat_model.with_structured_output(TripletList)  ## this is better


In [None]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one prompt-ready string.

    Without this step the retriever's list of Document objects is inserted
    into the `{context}` placeholder as its Python repr (metadata included)
    rather than as plain article text.
    """
    return "\n\n".join(d.page_content for d in docs)


# Retrieval-augmented extraction chain: the input question drives retrieval,
# the retrieved chunk text fills the prompt's context, and the structured-output
# LLM produces the result. The `question` key is kept for backward compatibility.
rag_chain = (
    RunnableParallel(context=retriever | _format_docs, question=RunnablePassthrough())
    | prompt
    | llm
)

In [None]:
# Run the chain for a single retrieval question about CDC37.
question = "What are the CDC37 proteins's key characteristics/interactions?"
response = rag_chain.invoke(question)