In [7]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# pypdf is required by PyPDFLoader, which this notebook uses to read the PDFs.
%pip install -q langchain==0.0.173 openai tiktoken chromadb==0.3.23 pycryptodome pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
# Show the installed langchain version. No -q here: the quiet flag suppresses
# the very output this cell exists to display.
%pip show langchain

In [9]:
import os

# Keep a key that is already exported in the environment, so a real secret never
# has to be pasted into (and saved with) the notebook. The placeholder is only a
# fallback and must be replaced before any OpenAI call can succeed.
os.environ.setdefault("OPENAI_API_KEY", "<your_api_key_here>")

In [10]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
from pprint import pprint

In [11]:
# Load the source files: every path under ./documents that matches the glob is
# read with PyPDFLoader, with a progress bar and multithreaded loading enabled.
loader = DirectoryLoader(
    './documents',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True,
)
# Alternative input: JSON files, selecting `.pages[]` from each one.
#loader = DirectoryLoader('./json_data', glob="./*.json", loader_cls=JSONLoader, loader_kwargs = {'jq_schema':'.pages[]'}, show_progress=True, use_multithreading=True)

documents = loader.load()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 134/134 [00:30<00:00,  4.33it/s]


In [6]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [7]:
# Number of chunks returned by text_splitter.split_documents.
len(texts)

6657

In [8]:
# Inspect the first chunk (page content plus metadata) as a sanity check.
texts[0]

Document(page_content='CASE REPORT Open Access\nManagement of imatinib-associated skin rash in a\npatient with metastatic gastrointestinal stromal\ntumor: a case report\nJean-Yves Blay\nAbstract\nPurpose: Long-term continuous imatinib is recommended for adult patients with unresectable and/or metastatic\nKIT+ gastrointestinal stromal tumors (GIST) as long as the patient continues to benefit. In the adjuvant setting,\nrecent findings indicate that patients at considerable risk of recurrence should receive at least 3 years of imatinib.\nBecause imatinib is often administered for prolonged periods, proper management of imatinib-associated adverse\nevents is crucial.\nCase report: We report a 56-year-old man with metastatic KIT+ GIST of the liver who had Grade 3 imatinib\nintolerance (skin rash) when treatment was started. The rash was managed with antihistamine treatment\n(Dexchlorpheniramine maleate 4 mg per day) and several temporary (up to 2 weeks) dose interruptions. The', metadata={'

In [14]:
# Directory (relative to the working directory) in which Chroma stores the
# embeddings on disk; it is passed as persist_directory when the store is
# created and again when it is reloaded.
persist_directory = 'db'

In [12]:
# OpenAI embeddings for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings()

In [11]:
# Build the Chroma vector store: embed every chunk in `texts` with `embedding`
# and keep the result under `persist_directory`.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

Using embedded DuckDB with persistence: data will be stored in: db


In [7]:
# Persist the database to disk, then drop the in-memory handle; the next cell
# re-opens the store from persist_directory.
# Requires `vectordb` from the Chroma.from_documents cell: running this cell
# before that one raises NameError.
vectordb.persist()
vectordb = None

NameError: name 'vectordb' is not defined

In [15]:
# Re-open the persisted database from disk; from here on it is used like any
# other Chroma store.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

Using embedded DuckDB with persistence: data will be stored in: db


In [16]:
# Retriever view over the vector store.
# NOTE(review): k=2 is presumably the number of chunks fetched per query; with
# so few chunks the chain may lack context to answer, so consider tuning it.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [17]:
# Show which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [18]:
# Assemble the question-answering chain: an OpenAI LLM with default settings,
# the "stuff" chain type, and the retriever defined earlier. Source documents
# are returned alongside each answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [45]:
## Cite sources
def process_llm_response(res, show_sources=False):
    """Pretty-print one question/answer pair produced by the QA chain.

    Args:
        res: A ``(response, query)`` pair. ``response`` is the mapping returned
            by the chain and must contain ``'result'``; ``query`` is the
            question text.
        show_sources: When true, also print ``metadata['source']`` for every
            entry of ``response['source_documents']`` (nothing is listed if the
            key is absent). Defaults to ``False``, which prints only the query
            and the response.
    """
    response, query = res[0], res[1]
    # ANSI colours: blue label for the query, green label for the answer.
    print(f"\n\x1b[34mQuery:\x1b[0m {query}")
    print(f"\x1b[32mResponse:\x1b[0m {response['result']}")
    if show_sources:
        print('\nSources:')
        for source in response.get("source_documents", []):
            print(source.metadata['source'])

In [20]:
# Questions put to the QA chain, one call per entry. The strings are sent
# verbatim, so editing them changes retrieval and the recorded answers.
# NOTE(review): several entries look like they contain typos ("less that",
# "mice vivo", "lipin II" -- possibly "lipid II", "bettter"); confirm the
# intended wording before re-running.
queries = [
    "What are the antivirals with longest half lives in humans and a dose less that 20mg per day?",
    "What host factors are known to affect influenza replication in human cells and mice vivo?",
    "What oral TB drug that has entered clinical trials have a half-life of > 100?",
    "What are the known safety liabilities of teixobactin and are there other lipin II compounds with bettter safety?",
    "What ReFrame compounds or mechanism of action can reduce the dose of approved TB therapies by lowering the dose by >5x of the approved TB drug",
    
]
# Follow-up questions; no other cell shown here references this list.
follow_up = [
    "Are there other drugs or molecules that target the same biochemical pathways?",
    "Are there any TB drugs that have significantly fewer or less severe side effects, yet maintain a similar efficacy?",
    "Are there molecules with minimal interactions that might provide a synergistic effect whose effective dose is ≤50mg/ml?",
    "How diverse are the populations in which these drugs have been studied?",
    "How confident are the results for each drug's efficacy?",
    "Which drugs have the fewest potential confounds?"
]

In [21]:
# Run every query through the chain and collect (response, query) tuples, the
# shape process_llm_response expects (res[0] is the response, res[1] the query).
# Appending inside the loop keeps the answers gathered so far if a later call
# fails.
llm_response = []
for query in queries:
    response = qa_chain(query)
    llm_response.append((response, query))
    

In [42]:
import time
from IPython.display import display, HTML

# NOTE: only the printing of the already-collected responses is timed here; the
# qa_chain calls themselves ran when `llm_response` was built.
# perf_counter() is a monotonic, high-resolution clock, which makes it the right
# choice for measuring an interval (time.time() can jump if the system clock is
# adjusted).
start_time = time.perf_counter()

for res in llm_response:
    process_llm_response(res)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

timer_html = f"<b>Elapsed Time:</b> {elapsed_time:.2f} seconds"
display(HTML(timer_html))


[34mQuery:[0m What are the antivirals with longest half lives in humans and a dose less that 20mg per day?
[32mResponse:[0m  I don't know.

[34mQuery:[0m What host factors are known to affect influenza replication in human cells and mice vivo?
[32mResponse:[0m  I don't know.

[34mQuery:[0m What oral TB drug that has entered clinical trials have a half-life of > 100?
[32mResponse:[0m  I don't know.

[34mQuery:[0m What are the known safety liabilities of teixobactin and are there other lipin II compounds with bettter safety?
[32mResponse:[0m  Lipoglycopeptides can cause hypersensitivity reactions, with cross-allergy possible with vancomycin. Telavancin (TD-6424) has been studied in a murine model of pneumonia and has been found to have multiple mechanisms of action. The safety profile of these drugs is generally comparable to their comparators, but their use should be considered with caution in patients with a history of allergy to vancomycin. There is limited safety dat

In [48]:
# Ask a single question and time the full round trip (chain call + printing).
query = "What oral TB drug that has entered clinical trials have a half-life of > 100?"

# perf_counter() is a monotonic, high-resolution clock suited to measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
# NOTE(review): this rebinds `llm_response` from the list of (response, query)
# tuples to a single response, so the cell that iterates over that list will
# not work if re-run after this one.
llm_response = qa_chain(query)
process_llm_response((llm_response, query))

end_time = time.perf_counter()
elapsed_time = end_time - start_time

timer_html = f"<b>Elapsed Time:</b> {elapsed_time:.2f} seconds"
display(HTML(timer_html))


[34mQuery:[0m What oral TB drug that has entered clinical trials have a half-life of > 100?
[32mResponse:[0m  I don't know.
