# Load environment and modules

In [14]:
!source activate /ix/rboyce/iod4/envs/FM
!export HF_HOME="/ix/rboyce/shared/cache/huggingface/"

## Install or update models

In [3]:
# !pip install transformers
# !pip install accelerate
# !huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --include "original/*" --local-dir meta-llama/Meta-Llama-3-8B-Instruct
# !pip install --upgrade transformers optimum optimum-intel
# !pip install --upgrade torch torchvision torchaudio

In [15]:
import os

import torch
import transformers

# Model Prompts

In [5]:
# Ideas:
# How can we prompt the models to mention how recent the information cited is?
# How can we accurately ask the model to convey if the results can be extrapolated between NPs?
# Can we craft the LLM's role in a way that represents the intersection between the three personas? (clinical pharmacists, drug-drug interaction researchers, and drug interaction compendium editors)

# Alternative queries:
# "What are the potential mechanisms for the interaction between cranberry natural products and the drug warfarin as detected by an increase in their International Normalized Ratio (INR) blood test?"

In [6]:
# Section markers used when assembling the text sent to each LLM.
instruction_header, context_header, query_header = (
    "### Instructions ###\n\n",
    "### Context ###\n\n",
    "### Query ###\n\n",
)
# Lead-in sentence placed between the context header and the retrieved context.
context_instruction = (
    "\nInclude relevant items from the following information in your synthesis:\n"
)

In [7]:
# instruction_header + prompt + query_header + queries[0].to_string()

In [8]:
# Load the instruction prompt that every model run uses as its system/instruction text.
# `prompt` is deliberately not pre-initialised to "": if the file cannot be read,
# the cell should fail loudly instead of leaving an empty prompt for later cells.
# The encoding is explicit so decoding does not depend on the platform locale.
with open('./instruction_prompt.txt', 'r', encoding='utf-8') as f:
    prompt = f.read()

In [9]:
from langchain_core.prompts import PromptTemplate
import pandas as pd

In [10]:
query = "What are the potential mechanisms that cause an interaction between {NP} products and {Drug} that may cause {AE}?"

In [11]:
query_template = PromptTemplate.from_template(query)

In [12]:
mappings = pd.read_csv("./Data/CaseStudyMappings.csv")

In [13]:
# Fill the query template once per mapping row, using the NP, Drug and AE
# columns as the template variables.
queries = [
    query_template.invoke(record.to_dict())
    for _, record in mappings[["NP", "Drug", "AE"]].iterrows()
]

-------------------

## Important things to target with the Spoke Approach.

1. Combining generated and retrieved knowledge.
2. Improve the relevance of the context by improving the prompt. "prompt aware context"
  - Alternative names for the NP.
  - Alternative names for the drug.
  - Alternative concepts for the AE.
  - Mesh Terms for each, if available.
3. Retrieve relevant context from NPKG.
4. Retrieve relevant EMA monographs.
5. Retrieve relevant Stockley's monographs.
6. Merge or Synthesize all these into a single context.

## Challenges:
1. Alternative names could be gathered from staging_vocabulary ("napdi repo db")
2. Splitting and merging all the information from the different sources.
3. Consider OntoGPT ()

# Spoke-like approach from NPKG, EMA, & Stockley's

## Loading Documents for RAG

In [13]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader, CSVLoader

In [14]:
data_folder = "./Clean_data/"

In [15]:
text_loader_kwargs={'autodetect_encoding': True}

In [16]:
loader = DirectoryLoader(data_folder, glob="**/*.txt", show_progress=True, use_multithreading=True, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

In [17]:
docs = loader.load()

 97%|█████████▋| 260/268 [00:00<00:00, 2147.13it/s]


In [18]:
len(docs)

260

In [19]:
from unstructured.cleaners.core import clean, replace_unicode_quotes, group_broken_paragraphs, clean_non_ascii_chars

In [20]:
# Normalise the text of every loaded document in place.
for doc in docs:
    # Drop the literal "Unnamed: 1 Unnamed: 2 " marker wherever it occurs.
    text = doc.page_content.replace("Unnamed: 1 Unnamed: 2 ", "")
    text = clean(text, extra_whitespace=True)
    text = group_broken_paragraphs(text)
    doc.page_content = replace_unicode_quotes(text)
    # doc.page_content = clean_non_ascii_chars(doc.page_content)

In [21]:
csv_loader = CSVLoader(file_path=data_folder+'ema.csv')

In [22]:
data = csv_loader.load()

In [23]:
docs.extend(data)

In [24]:
len(docs)

838

-----------

## Create Vector Store with FAISS

In [25]:
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.

    Consecutive documents are separated by a line of 100 dashes. Nothing is
    returned; the text goes to standard output.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))

In [26]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [27]:
# import getpass
import os

In [28]:
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [29]:
# Copy the personal OpenAI key into OPENAI_API_KEY for the OpenAI-backed
# embedding and chat clients created in later cells.
# os.environ only accepts strings, so a missing key is reported explicitly
# instead of assigning the boolean False (which raises an opaque TypeError).
_openai_key = os.environ.get("OPENAI_PERSONAL_API_KEY")
if _openai_key is None:
    raise RuntimeError(
        "OPENAI_PERSONAL_API_KEY is not set; export it before starting the kernel."
    )
os.environ["OPENAI_API_KEY"] = _openai_key

In [30]:
from langchain_openai import OpenAIEmbeddings

In [31]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [32]:
index = faiss.IndexFlatL2(len(embeddings.embed_query("Relationship between a natural product a drug and a adverse event.")))

In [33]:
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [34]:
ids = vector_store.add_documents(documents=docs)

## Create retriever from Vector Store

In [35]:
# Base retriever over the FAISS store: similarity search asking for the
# 25 closest documents per query.
# NOTE(review): `fetch_k` is normally only consulted for MMR or filtered
# searches, so it may have no effect with search_type="similarity" — confirm.
db_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 25, 'fetch_k': 100}
)

## Include MultiQuery Retrieval

In [36]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

In [37]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [38]:
llm = ChatOpenAI(model="gpt-4o-mini",temperature=1)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db_retriever, 
    llm=llm,
    include_original=True
)

In [39]:
# ''.join([prompt, '\n', queries[0].text])

In [40]:
# retriever_from_llm.generate_queries(''.join([prompt, '\n', queries[0].text]), )

In [41]:
# unique_docs = retriever_from_llm.invoke(''.join([prompt, '\n', queries[0].text]))

In [42]:
# len(unique_docs)

In [43]:
# pretty_print_docs(unique_docs)

In [44]:
# !pip install --upgrade --quiet  cohere

## Split Large Documents, Filter Irrelevant Sections, & Rerank Relevant Documents

In [45]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_text_splitters import CharacterTextSplitter
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

# from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_community.document_transformers import EmbeddingsRedundantFilter
# from langchain.retrievers.document_compressors import LLMChainExtractor

In [46]:
# The Cohere reranker created below needs COHERE_API_KEY. The original
# self-assignment with default=False was a no-op when the key existed and an
# opaque TypeError (os.environ only accepts strings) when it did not, so check
# for it explicitly instead.
if os.environ.get("COHERE_API_KEY") is None:
    raise RuntimeError(
        "COHERE_API_KEY is not set; export it before starting the kernel."
    )

In [77]:
# Stages of the post-retrieval compression pipeline, in the order they are applied.
# 1. Split documents on sentence ends (". ") with a 3500 chunk size and no overlap.
splitter = CharacterTextSplitter(chunk_size=3500, chunk_overlap=0, separator=". ")
# 2. Document-to-query filter: configured with k = 15 and a very permissive 0.01 similarity threshold.
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.01, k = 15) # document-to-query similarity
# 3. Document-to-document filter: intended to drop near-duplicates (similarity threshold 0.98).
# NOTE(review): EmbeddingsRedundantFilter may not define a `k` option, in which
# case `k = 10` is silently ignored — confirm against the installed version.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings, similarity_threshold=0.98, k = 10) # document-to-document similarity
# 4. Rerank the remaining chunks with Cohere, configured with top_n = 8.
reranker = CohereRerank(top_n = 8)
# llm_compressor = LLMChainExtractor.from_llm(llm)

In [78]:
pipeline_compressor = DocumentCompressorPipeline(
    transformers= [splitter, relevant_filter, redundant_filter, reranker]
)

In [79]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever_from_llm
)

In [80]:
queries[6].text

'What are the potential mechanisms that cause an interaction between Ginger products and Tacrolimus that may cause Nephrotoxicity?'

In [81]:
compressed_docs = compression_retriever.invoke(queries[6].text)

INFO:langchain.retrievers.multi_query:Generated queries: ['What are the possible ways in which Ginger products may interact with Tacrolimus to lead to kidney toxicity?', 'How might Ginger products contribute to nephrotoxic effects when taken alongside Tacrolimus?', 'What mechanisms could explain the interaction between Ginger products and Tacrolimus that results in increased risk of kidney damage?']


In [82]:
pretty_print_docs(compressed_docs)

Document 1:

The Natural Product gingerol directly negatively regulates activity of the Enzyme or Transporter ATP-dependent translocase ABCB1 (human). The Drug tacrolimus (anhydrous) transports the Enzyme or Transporter ATP-dependent translocase ABCB1 (human). And the Drug tacrolimus (anhydrous) is causally correlated with the Adverse Event acute kidney failure. Natural Product Ontology Identifier: http://napdi.org/napdi_srs_imports:zingiber_officinale Natural Product or Constituent Ontology Identifier: http://purl.obolibrary.org/obo/CHEBI_10136 Process Ontology Identifier: http://purl.obolibrary.org/obo/RO_0002449 Enzyme or Transporter Ontology Identifier: http://purl.obolibrary.org/obo/PR_P08183 Process or Pathway Ontology Identifier: http://purl.obolibrary.org/obo/RO_0002020 Drug Ontology Identifier: http://purl.obolibrary.org/obo/CHEBI_61049 Causal Relation Ontology Identifier: http://purl.obolibrary.org/obo/RO_0002610 Adverse Event Ontology Identifier: http://purl.obolibrary.org/o

## Reorder documents and merge into one context

In [68]:
from langchain_community.document_transformers import (
    LongContextReorder,
)

In [69]:
reordering = LongContextReorder()

In [70]:
# Build one merged context string per query:
#   1. retrieve and compress documents for the query text,
#   2. reorder them with LongContextReorder,
#   3. join their text with a visible "-----" separator.
# The loop variable is deliberately NOT called `query`: that name holds the
# template string used to build `query_template`, and overwriting it would
# break a later re-run of the template cell.
contexts = []
for case_query in queries:
    compressed_docs = compression_retriever.invoke(case_query.text)
    reordered_docs = reordering.transform_documents(compressed_docs)
    contexts.append('\n\n-----\n\n'.join(doc.page_content for doc in reordered_docs))

INFO:langchain.retrievers.multi_query:Generated queries: ['What mechanisms might explain the interaction between cranberry products and Warfarin that can lead to an increase in INR levels?', 'How do cranberry products interact with Warfarin, and what are the potential pathways that could result in elevated INR?', 'Can you elaborate on the possible ways in which cranberry products could interact with Warfarin to cause an increase in INR values?']
INFO:langchain.retrievers.multi_query:Generated queries: ['What mechanisms could explain the interaction between Green Tea products and Simvastatin that might lead to Statin intolerance?', 'How does the consumption of Green Tea potentially affect the efficacy and tolerance of Simvastatin in patients?', 'In what ways might Green Tea products influence the interaction with Simvastatin, potentially causing intolerance to Statins?']
INFO:langchain.retrievers.multi_query:Generated queries: ['What specific mechanisms might lead to interactions betwee

--------

# Reload context prompts

In [13]:
import pickle

In [88]:
# with open('./contexts.pkl', 'wb') as out_file:
#     pickle.dump(contexts, out_file)

In [14]:
contexts = []

In [15]:
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load a contexts.pkl that was written by this notebook.
with open('./contexts.pkl', 'rb') as in_file:
    contexts = pickle.load(in_file)

# The generation cells pair contexts[i] with queries[i], so a cache built from
# a different mappings file would mis-align prompts or fail mid-run.
if len(contexts) != len(queries):
    raise ValueError(
        f"contexts.pkl holds {len(contexts)} contexts but there are {len(queries)} queries; "
        "regenerate the contexts cache."
    )

-------------

# Results collections

In [16]:
results_dir = './Results/Rag-Results/'

--------------

# Baseline LLAMA 3.1 LLM Generation

In [72]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
dtype = torch.bfloat16

In [73]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="auto"
)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [75]:
# ''.join(['\n\n', context_header, context_instruction, contexts[I], '\n\n', query_header, queries[i].text])

In [78]:
# Generate one answer per case-study query with the LLaMA pipeline and save the
# prompts and the response together as a Markdown report.

# Create the output folder up front so a missing directory is caught before
# any generation time is spent rather than after it.
os.makedirs(f'{results_dir}llama/', exist_ok=True)

for i in range(len(queries)):
    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": ''.join(['\n\n', context_header, context_instruction, contexts[i], '\n\n', query_header, queries[i].text])
        },
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    # `queries` was built in row order, so select the mapping row by position
    # (.iloc) instead of by index label.
    case = mappings.iloc[i]
    # Explicit UTF-8 keeps non-ASCII model output from depending on the locale.
    with open(f'{results_dir}llama/llama_3.1_rag_({case.NP},{case.Drug},{case.AE}).md', 'w', encoding='utf-8') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        # The last message of the returned conversation is written as the reply.
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [79]:
# print(outputs[0]["generated_text"][-1]["content"])

------------------------

# Baseline GEMMA 2 LLM Generation

In [17]:
model_id = "google/gemma-2-9b-it"
dtype = torch.bfloat16

In [18]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
# Generate one answer per case-study query with the Gemma pipeline and save the
# prompt and the response together as a Markdown report. Instructions, context
# and query are sent as a single user message in this section.
# NOTE(review): a previous run warned that the generation call exceeds the
# model's predefined maximum length (8192); consider trimming the context or
# lowering max_new_tokens.

# Create the output folder up front so a missing directory is caught before
# any generation time is spent rather than after it.
os.makedirs(f'{results_dir}gemma/', exist_ok=True)

for i in range(len(queries)):
    messages = [{
        "role": "user",
        "content": instruction_header + prompt + '\n\n' + context_header + context_instruction + contexts[i] + '\n\n' + query_header + queries[i].to_string()
    }]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    # `queries` was built in row order, so select the mapping row by position
    # (.iloc) instead of by index label.
    case = mappings.iloc[i]
    # Explicit UTF-8 keeps non-ASCII model output from depending on the locale.
    with open(f'{results_dir}gemma/gemma_2_baseline_({case.NP},{case.Drug},{case.AE}).md', 'w', encoding='utf-8') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        # The last message of the returned conversation is written as the reply.
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [20]:
# print(outputs[0]["generated_text"][-1]["content"])

------------------------

# Baseline GPT4o-mini LLM Generation

In [21]:
# import getpass
import os

In [22]:
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [23]:
# Copy the personal OpenAI key into OPENAI_API_KEY for the chat client created below.
# os.environ only accepts strings, so a missing key is reported explicitly
# instead of assigning the boolean False (which raises an opaque TypeError).
_openai_key = os.environ.get("OPENAI_PERSONAL_API_KEY")
if _openai_key is None:
    raise RuntimeError(
        "OPENAI_PERSONAL_API_KEY is not set; export it before starting the kernel."
    )
os.environ["OPENAI_API_KEY"] = _openai_key

In [24]:
from langchain_openai import ChatOpenAI

In [25]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=1.0,
    max_tokens=3500,
)

In [26]:
# Generate one answer per case-study query with the chat model bound to `llm`
# and save the prompts and the response together as a Markdown report.

# Create the output folder up front so a missing directory is caught before
# any API calls are made rather than after them.
os.makedirs(f'{results_dir}gpt/', exist_ok=True)

for i in range(len(queries)):
    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": ''.join(['\n\n', context_header, context_instruction, contexts[i], '\n\n', query_header, queries[i].text])
        },
    ]
    outputs = llm.invoke(messages)
    # `queries` was built in row order, so select the mapping row by position
    # (.iloc) instead of by index label.
    case = mappings.iloc[i]
    # Explicit UTF-8 keeps non-ASCII model output from depending on the locale.
    with open(f'{results_dir}gpt/gpt4o-mini_baseline_({case.NP},{case.Drug},{case.AE}).md', 'w', encoding='utf-8') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        f.write(str(outputs.content))

-------------------

# Baseline DeciLM LLM Generation

In [17]:
model_id = "Deci/DeciLM-7B-instruct"
dtype = torch.bfloat16

In [18]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# Generate one answer per case-study query with the DeciLM pipeline and save
# the prompts and the response together as a Markdown report.
# NOTE(review): a previous run warned that a generation call exceeds the
# model's predefined maximum length (8192); consider trimming the context or
# lowering max_new_tokens.

# Create the output folder up front so a missing directory is caught before
# any generation time is spent rather than after it.
os.makedirs(f'{results_dir}deci/', exist_ok=True)

for i in range(len(queries)):
    messages = [
        {
            "role": "system",
            "content": ''.join([instruction_header, prompt])
        },
        {
            "role": "user",
            "content": ''.join(['\n\n', context_header, context_instruction, contexts[i], '\n\n', query_header, queries[i].text])
        },
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    # `queries` was built in row order, so select the mapping row by position
    # (.iloc) instead of by index label.
    case = mappings.iloc[i]
    # Explicit UTF-8 keeps non-ASCII model output from depending on the locale.
    with open(f'{results_dir}deci/deci_baseline_({case.NP},{case.Drug},{case.AE}).md', 'w', encoding='utf-8') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        # The last message of the returned conversation is written as the reply.
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  self.gen = func(*args, **kwds)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


------------------------

# Baseline Mistral LLM Generation

In [24]:
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
dtype = torch.bfloat16

In [25]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="cuda"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
# print(''.join([instruction_header, prompt,'\n\n', query_header, queries[0].text]))

In [28]:
# Generate one answer per case-study query with the Mistral pipeline and save
# the prompts and the response together as a Markdown report.

# Create the output folder up front so a missing directory is caught before
# any generation time is spent rather than after it.
os.makedirs(f'{results_dir}mistral/', exist_ok=True)

for i in range(len(queries)):
    messages = [
        {
            "role": "system",
            "content": ''.join([instruction_header, prompt])
        },
        {
            "role": "user",
            "content": ''.join(['\n\n', context_header, context_instruction, contexts[i], '\n\n', query_header, queries[i].text])
        },
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    # `queries` was built in row order, so select the mapping row by position
    # (.iloc) instead of by index label.
    case = mappings.iloc[i]
    # Explicit UTF-8 keeps non-ASCII model output from depending on the locale.
    with open(f'{results_dir}mistral/mistral_rag_({case.NP},{case.Drug},{case.AE}).md', 'w', encoding='utf-8') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        # The last message of the returned conversation is written as the reply.
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# print(outputs[0]["generated_text"][-1]["content"])

------------------------