# Load enviroment and modules

In [4]:
!source activate /ix/rboyce/iod4/envs/FM
!export HF_HOME="/ix/rboyce/iod4/.cache/huggingface/"

## Install or update models

In [5]:
# !pip install transformers
# !pip install accelerate
# !huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --include "original/*" --local-dir meta-llama/Meta-Llama-3-8B-Instruct
# !pip install --upgrade transformers optimum optimum-intel
# !pip install --upgrade torch torchvision torchaudio

In [6]:
import transformers
import torch

# Model Prompts

In [7]:
# Ideas:
# How can we prompt the models to mention how recent the information cited is?
# How can we accurately ask the model to convey if the results can be extrapolated between NPs?
# Can we craft the LLM's role in a way that represents the intersection between the three personas? (clinical pharmacists, drug-drug interaction researchers, and drug interaction compendium editors)

# Alternative queries:
# "What are the potential mechanisms for the interaction between cranberry natural products and the drug warfarin as detected by an increase in their International Normalized Ratio (INR) blood test?"

In [8]:
prompt =  ""
with open('./instruction_prompt.txt', 'r') as f:
    prompt = f.read()

In [9]:
from langchain_core.prompts import PromptTemplate
import pandas as pd

In [10]:
query = "What are the potential mechanisms that cause an interaction between {NP} products and {Drug} that may cause {AE}?"

In [11]:
query_template = PromptTemplate.from_template(query)

In [12]:
mappings = pd.read_csv("./Data/CaseStudyMappings.csv")

In [13]:
queries = []
for row in mappings[["NP", "Drug", "AE"]].iterrows():
    queries.append(query_template.invoke(row[1].to_dict()))

-------------------

## Important things to target with the Spoke Approach.

1. Combining generated and retrieved knowledge.
2. Improve the relevance of the context by improving the prompt. "prompt aware context"
  - Alternative names for the NP.
  - Alternative names for the drug.
  - Alternative concepts for the AE.
  - Mesh Terms for each, if available.
3. Retrieve relevant context from NPKG.
4. Retrieve relevant EMA monographs.
5. Retrieve relevant Stockley's monographs.
6. Merge or Synthesize all these into a single context.

## Challenges:
1. Alternative names could be gathered from staging_vocabulary ("napdi repo db")
2. Splitting and merging all the information from the different sources.
3. Consider OntoGPT ()

# Spokelike from NPKG, EMA, & Stockleys

## Loading Documents for RAG

In [28]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader, CSVLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [29]:
data_folder = "./Clean_data/"

In [30]:
text_loader_kwargs={'autodetect_encoding': True}

In [31]:
loader = DirectoryLoader(data_folder, glob="**/*.txt", show_progress=True, use_multithreading=True, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

In [32]:
docs = loader.load()

 99%|█████████▉| 260/263 [00:01<00:00, 254.26it/s]


In [33]:
len(docs)

260

In [34]:
from unstructured.cleaners.core import clean, replace_unicode_quotes, group_broken_paragraphs , clean_non_ascii_chars

In [35]:
for i in range(len(docs)):
    docs[i].page_content = docs[i].page_content.replace("Unnamed: 1 Unnamed: 2 ", "")
    docs[i].page_content = clean(docs[i].page_content, extra_whitespace=True)
    docs[i].page_content = group_broken_paragraphs(docs[i].page_content)
    docs[i].page_content = replace_unicode_quotes(docs[i].page_content)
    # docs[i].page_content = clean_non_ascii_chars(docs[i].page_content)

In [36]:
csv_loader = CSVLoader(file_path=data_folder+'ema.csv')

In [37]:
data = csv_loader.load()

In [38]:
docs.extend(data)

In [39]:
len(docs)

838

-----------

## Create Vector Store with FAISS

In [40]:
# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [41]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [47]:
# import getpass
import os

In [48]:
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [49]:
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_PERSONAL_API_KEY", default=False)

In [50]:
from langchain_openai import OpenAIEmbeddings

In [51]:
embeddings = OpenAIEmbeddings()

In [52]:
index = faiss.IndexFlatL2(len(embeddings.embed_query("Knowledge about a natural products involved in adverse events.")))

In [53]:
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [285]:
# index = faiss.IndexFlatL2(len(embeddings.embed_documents(docs)))

In [286]:
# vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [54]:
 vector_store.add_documents(documents=docs)

['256de1e3-f0c2-4b3c-a218-be505a347979',
 '3ef396c1-d5a3-4226-a54b-89f2a36c230e',
 '454ffc00-30e2-4fe5-916b-d944bf42ecc6',
 'b5346508-243b-494d-ac03-087f74875dfc',
 '7978ef21-dc07-4a24-910e-a8c3c18424b2',
 'ab7ec54c-0945-406c-8186-6f6e8d414c86',
 '146ce646-03de-42ef-bdc6-42c9410df9d9',
 'b57dad93-0fa7-47a5-8591-65ee042f00a7',
 '12d50ed3-cd65-498f-84cc-1912ad56d678',
 'b8af1a40-c872-4d9b-91d0-c86281eb01e7',
 '9e08a49a-e6f0-4222-9e10-5f0c666ea7a6',
 '1a78af0f-aafc-4e20-8637-68e8d158fe38',
 '996a5e08-3c38-4c91-abf0-9174fd2d04f6',
 '9b71c86c-8172-4846-bb69-5af5dc640fa8',
 'a3bf9f3a-5556-4c47-8b36-414b1b757307',
 '1f419dda-f088-43df-a8cd-17d892eb8c18',
 'd44bac8e-428f-4ce6-9e33-6a04ea980354',
 'f3fa52b9-b0e8-4ce8-ad66-499998f59704',
 '0013c1cd-3fa0-47f8-952a-d43f3ceaf0f2',
 '14760b84-80b9-4082-8992-e7eb6ef4cace',
 'cbdc97f3-2850-4dea-bb3f-16c5666bb7a0',
 'c1d1a1b9-c002-4ff7-8ba8-ce4f88696a21',
 'e7b8f565-d46b-455c-a020-0a800dcdab9f',
 '9e008887-0155-4159-ac9a-7e13b57fc911',
 '003ced82-b4d8-

In [78]:
db_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 15, 'fetch_k': 100}
)

In [79]:
# queries[0]

StringPromptValue(text='What are the potential mechanisms that cause an interaction between Cranberry products and Warfarin that may cause INR Increase?')

In [81]:
# db_retriever.invoke(queries[0].text)

In [None]:
# for doc in results:
#     print(f"* {doc.page_content} [{doc.metadata}]")

In [55]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

In [56]:
llm = ChatOpenAI(model="gpt-4o-mini",temperature=1)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(), llm=llm
)

In [57]:
prompt + queries[0]

StringPromptValue(text='What are the potential mechanisms that cause an interaction between Cranberry products and Warfarin that may cause INR Increase?')

In [83]:
unique_docs = retriever_from_llm.invoke(prompt + queries[0].text)
len(unique_docs)

4

In [85]:
pretty_print_docs(unique_docs)

Document 1:

Cranberry Unnamed: 1 Unnamed: 2 Vaccinium macrocarpon Aiton (Ericaceae) Synonym(s) and related species Large cranberry European cranberry Mossberry Monograph Interpretation Citation Ciclosporin Occasional consumption of cranberry juice does not appear to affect the bioavailibility of ciclosporin. Regular daily consumption has not been studied. Grenier J, Fradette C, Morelli G, Merritt GJ, Vranderick M, Ducharme MP. Pomelo juice, but not cranberry juice, affects the pharmacokinetics of cyclosporine in humans. Clin Pharmacol Ther (2006) 79, 255–62. Flurbiprofen Limited evidence suggests that cranberry juice does not appear to affect the pharmacokinetics of flurbiprofen. Greenblatt DJ, von Moltke LL, Perloff ES, Luo Y, Harmatz JS, Zinny MA. Interaction of flurbiprofen with cranberry juice, grape juice, tea, and fluconazole: in vitro and clinical studies. Clin Pharmacol Ther (2006) 79, 125–33. Food No interactions found. Note that cranberry juice is widely used in food and bev

In [None]:
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

# from langchain_text_splitters import CharacterTextSplitter

In [None]:
from langchain_openai import OpenAI

In [None]:
splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter]
)

In [312]:
compressor = LLMChainExtractor.from_llm(llm)

In [313]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_from_llm
)

In [314]:
compressed_docs = compression_retriever.invoke(queries[0])

In [306]:
pretty_print_docs(compressed_docs)

Document 1:

Warfarin and related drugs A number of case reports suggest that cranberry juice can increase the INR of patients taking warfarin, and one patient has died as a result of this interaction. Other patients have developed unstable INRs or, in one isolated case, a reduced INR. However, in four controlled studies, cranberry juice did not alter the anticoagulant effect of warfarin, or had only very minor effects on the INR. Neither cranberry juice nor the extract altered warfarin pharmacokinetics. Committee on Safety of Medicines/Medicines and Healthcare products Regulatory Agency Possible interaction between warfarin and cranberry juice. Current Problems (2003) 29, 8. Committee on Safety of Medicines/Medicines and Healthcare products Regulatory Agency Interaction between warfarin and cranberry juice: new advice. Current Problems (2004) 30, 10. Suvarna R, Pirmohamed M, Henderson L. Possible interaction between warfarin and cranberry juice. BMJ (2003) 327, 1454. Grant P. Warfarin

-------------

# Reckoning knowledge folding

---------------

In [None]:
file_path2 = (
    "./cranberry.csv"
)

In [None]:
loader2 = CSVLoader(file_path=file_path2,source_column="English common name of herbal substance")
data2 = loader2.load()

In [None]:
data2

------------

# Results collections

In [None]:
results_dir = './KF-Results/'

--------------

# Baseline LLAMA 3.1 LLM Generation

In [11]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
dtype = torch.bfloat16

In [12]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
for i in range(len(queries)):
    messages = [
        {
            "role": "system", 
            "content": prompt
        },
        {
            "role": "user", 
            "content": queries[i].text
        },
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    with open(f'{results}llama/llama_3.1_baseline_({mappings.loc[i].NP},{mappings.loc[i].Drug},{mappings.loc[i].AE}).md', 'w') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [14]:
# print(outputs[0]["generated_text"][-1]["content"])

------------------------

# Baseline GEMMA 2 LLM Generation

In [15]:
model_id = "google/gemma-2-9b-it"
dtype = torch.bfloat16

In [16]:
pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype =dtype,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
instruction_header = "### Instructions ###\n\n"
query_header = "### Query ###\n\n"

In [18]:
# instruction_header + prompt + query_header + queries[0].to_string()

In [19]:
for i in range(len(queries)):
    messages = [{
        "role": "user",
        "content": instruction_header + prompt + query_header + queries[i].to_string()
    }]
    outputs = pipeline(
        messages,
        max_new_tokens=3500,
        temperature=1.0
    )
    with open(f'./Results/gemma/gemma_2_baseline_({mappings.loc[i].NP},{mappings.loc[i].Drug},{mappings.loc[i].AE}).md', 'w') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        f.write(str(outputs[0]["generated_text"][-1]["content"]))

In [20]:
# print(outputs[0]["generated_text"][-1]["content"])

------------------------

# Baseline GPT4o-mini LLM Generation

In [194]:
# import getpass
import os

In [195]:
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [196]:
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_PERSONAL_API_KEY", default=False)

In [24]:
from langchain_openai import ChatOpenAI

In [25]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=1.0,
    max_tokens=3500,
)

In [26]:
for i in range(len(queries)):
    messages = [
        {
            "role": "system", 
            "content": prompt
        },
        {
            "role": "user", 
            "content": queries[i].text
        },
    ]
    outputs = llm.invoke(messages)
    with open(f'./Results/gpt/gpt4o-mini_baseline_({mappings.loc[i].NP},{mappings.loc[i].Drug},{mappings.loc[i].AE}).md', 'w') as f:
        f.write('# Prompts to LLM:\n\n')
        f.write(messages[0]['content'])
        f.write("\n")
        f.write(messages[1]['content'])
        f.write('\n\n-------\n\n')
        f.write('# Response from LLM:\n\n')
        f.write(str(outputs.content))

-------------------