<a href="https://colab.research.google.com/github/Pauullamm/LC_Pill_Checker/blob/main/LC_Pill_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LC_Pill_Checker: Experimenting with retrieval augmented generation (RAG) with the LangChain framework to identify tablets/capsules by their text description

## General Impressions:

1. Data issues: the text descriptions are not ideal - they carry too much ambiguity for a language model to differentiate pills, e.g. "white, biconvex, round, debossed with "" on one side" shares a lot of similarities with other pill descriptions.

2. Formatting of documents is crucial to reduce noise of data.

3. Different types of embeddings models/vector stores still need to be experimented with to see if there is any improvement in performance.

4. Might need to explore dimensionality reduction (e.g. UMAP) to mitigate the 'curse of dimensionality'.

### DISCLAIMER: The information below is provided for private study and / or personal use purposes only, and is not intended to be a substitute for a health care provider’s consultation or advice. The information below does not constitute legal or technical advice.

In [None]:
#@title DEPENDENCIES

!pip install --upgrade openai -q
!pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken
!pip install beautifulsoup4 -q
!pip install chromadb -q
!pip install aiofiles -q
!pip install lancedb -q
!pip install umap-learn -q

In [72]:
#@title IMPORTS
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import json
from openai import OpenAI
import openai
import umap
import aiofiles
import string

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import SKLearnVectorStore
from langchain.vectorstores import LanceDB



from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


!pip freeze > requirements.txt

# How to use:

1. Obtain an API key from "https://openai.com/blog/openai-api"
2. Prepare the tablet/capsule text description data in a .txt file, one entry per line, in the following format: <br> Drug name | Drug description | Manufacturer \n (the custom document loader turns each line of the file into its own document)
3. Describe a tablet/capsule under question_to_ask

In [70]:
import os

OPENAI_API_KEY = "" # @param {type:"string"}

# A blank form field cannot authenticate, so fall back to the conventional
# OPENAI_API_KEY environment variable when nothing was typed in. A key entered
# in the form always takes precedence, and keeping the key out of the notebook
# avoids committing it by accident.
if not OPENAI_API_KEY:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Chat model used by the document filter and the retrieval chain in later cells.
llm = ChatOpenAI(openai_api_key = OPENAI_API_KEY)

In [5]:
#@title CustomDocumentLoader class from LangChain
from typing import AsyncIterator, Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class CustomDocumentLoader(BaseLoader):
    """Load a text file as one ``Document`` per line.

    Each document's ``page_content`` is the line exactly as read from the file
    (including its trailing newline, if any). Its metadata records the
    zero-based line index under ``line_number`` and the file path under
    ``source``.
    """

    def __init__(self, file_path: str) -> None:
        """Remember which file to read.

        Args:
            file_path: Location of the text file to load.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """Yield the file's lines one at a time as documents.

        The file is opened as UTF-8 and iterated lazily, so each document is
        produced as its line is read rather than after loading the whole file.
        """
        with open(self.file_path, encoding="utf-8") as handle:
            for index, text in enumerate(handle):
                line_info = {"line_number": index, "source": self.file_path}
                yield Document(page_content=text, metadata=line_info)

    # alazy_load is OPTIONAL.
    # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
    async def alazy_load(
        self,
    ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
        """Asynchronous counterpart of ``lazy_load``.

        Reads the file through ``aiofiles`` (``pip install aiofiles``,
        https://github.com/Tinche/aiofiles) and yields the same documents.
        """
        async with aiofiles.open(self.file_path, encoding="utf-8") as handle:
            # enumerate() cannot wrap an async iterator, so count by hand here.
            index = 0
            async for text in handle:
                line_info = {"line_number": index, "source": self.file_path}
                yield Document(page_content=text, metadata=line_info)
                index += 1



In [71]:
#@title Load pill description data (.txt file) with document loader
# Path to the pill-description .txt file (notebook form parameter).
path_to_file = "" # @param {type:"string"}
# Loader that turns each line of that file into its own Document.
loader = CustomDocumentLoader(path_to_file)
# Load the documents; `docs` is what the text-splitter and vector-store cells consume.
docs = loader.load()

In [6]:
#@title Splitting text to documents - splitting might not be necessary if document size is small? (skip)
# Splitter settings: chunk size 100 and overlap 20, both measured with len
# (i.e. in characters); separators are treated as literal strings, not regexes.
splitter_settings = {
    "chunk_size": 100,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# NOTE: the vector-store cell in this notebook embeds `docs`, not `splits`.
splits = text_splitter.split_documents(docs)


In [None]:
#@title Text embeddings with openai and storing in vector store (currently using LanceDB)
# OpenAI embeddings client, authenticated with the key entered earlier.
embeddings_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Embed the loaded (unsplit) documents and store them in a LanceDB vector store.
vector_store = LanceDB.from_documents(documents=docs, embedding=embeddings_model)


Example question: '''I have tablets that are Orange, modified capsule shaped, biconvex film-coated tablet (approximately 20.6 x 8.6 mm), debossed with “ H” on one side and “ A1” on the other side. What is this drug and its manufacturer?'''

Expected answer: Abacavir/Lamivudine 600 mg/300 mg film-coated tablets | Amarox Limited

In [60]:
# Free-text description of the tablet/capsule to identify (notebook form
# parameter); passed to the compression retriever and to the retrieval chain.
question_to_ask = "" # @param {type:"string"}

In [None]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Create the base retriever in this cell: when the notebook is run top to
# bottom, no earlier cell has defined `retriever`, so referencing it here
# raised NameError.
retriever = vector_store.as_retriever()

# Wrap the base retriever so that every retrieved document is passed through
# an LLM-backed filter built from the chat model.
_filter = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever
)

# Retrievers are Runnables; `invoke` replaces the deprecated
# `get_relevant_documents` and returns the same list of documents.
compressed_docs = compression_retriever.invoke(question_to_ask)

In [31]:
def pretty_print_docs(docs):
    """Print every document under a numbered heading.

    Documents are numbered from 1 and separated by a rule of 100 dashes.
    Each entry in ``docs`` must expose its text as ``page_content``.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))
# Display the documents returned by the compression retriever.
pretty_print_docs(compressed_docs)




In [61]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Prompt template with two placeholders: {context} for the retrieved text and
# {input} for the user's question.
prompt = ChatPromptTemplate.from_template("""Answer the following question based on the provided context:

{context}

Question: {input}""")

# NOTE(review): `document_chain` is not referenced by any other cell visible
# in this notebook; the retrieval chain is assembled by hand instead.
document_chain = create_stuff_documents_chain(llm, prompt)

# Retriever over the vector store, created with its default settings.
retriever = vector_store.as_retriever()
def format_docs(docs):
    """Join the documents' ``page_content`` into one string, separated by two newlines."""
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)


# Inputs for the prompt: "context" is the retriever's output flattened to text
# by format_docs, and "input" is the question passed through unchanged.
chain_inputs = {"context": retriever | format_docs, "input": RunnablePassthrough()}

# Pipeline: build the prompt inputs, fill the prompt, call the chat model, then
# parse the model's reply to a plain string.
retrieval_chain = chain_inputs | prompt | llm | StrOutputParser()


In [None]:
# Run the retrieval chain on the question; as the cell's last expression, the
# resulting answer string is shown as the cell output.
retrieval_chain.invoke(question_to_ask)