# RAG BASED Chat with PDF Application



### Installing necessary libraries


In [1]:
!pip install PyMuPDF langchain openai python-dotenv langchain_community pdfplumber PyPDF4 chromadb langchain_openai

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting openai
  Downloading openai-1.45.0-py3-none-any.whl.metadata (22 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF4
  Downloading PyPDF4-1.27.0.tar.gz (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (

#### A class to extract text from the PDF

In [2]:

import fitz
import os
class PDFtoText():
    def __init__(self) -> None:
        pass

    def open_pdf(self, pdf):
        if os.path.exists(str(pdf)) or isinstance(pdf,bytes):
                self.pdf = fitz.open(pdf)
                self.page_count = self.pdf.page_count
                return self.pdf
                # self.pdf.close()
        else:
            raise ValueError(f"PDF path is incorrect", pdf)
    def extract_all_text(self, pdf):
         # Open the PDF file
        if not pdf: return None
        self.pdf = self.open_pdf(pdf)

        all_text = ''

        # Iterate through all pages
        for page_number in range(self.page_count) :
            # Get the page
            page = self.pdf[page_number]

            # Extract text from the page
            text = page.get_text()
            all_text += text

            # Print or process the extracted text as needed
            # print(f"Page {page_number + 1}:\n{text}\n")
        return all_text

    def extract_all_text_page_wise(self, pdf):
         # Open the PDF file
        if not pdf: return None
        self.pdf = self.open_pdf(pdf)

        all_text = []

        # Iterate through all pages
        for page_number in range(self.page_count) :
            # Get the page
            page = self.pdf[page_number]

            # Extract text from the page
            text = page.get_text()
            all_text.append(text)
        return all_text
    def extract_text_from_single_page(self,pdf, page_number):
        if not pdf: return None
        self.pdf = self.open_pdf(pdf)
        if page_number -1> self.page_count:
             raise ValueError("Invlaid pagenumber")
        else:
             return self.pdf[page_number-1].get_text()
    def extract_text_from_interval(self,pdf,page_number, interval =1):
        if not pdf: return None
        self.pdf = self.open_pdf(pdf)
        text = ""
        if page_number > self.page_count:
            raise ValueError("Invlaid pagenumber")
        else:
            # Calculate the start and end pages
            start_page = max(0, page_number - interval)
            end_page = min(self.page_count - 1, page_number + interval)

            for page_number in range(start_page, end_page + 1):
                text += self.extract_text_from_single_page(pdf=pdf, page_number=page_number)
        return text

### Embedding Generation using DeepInfra Models

In [3]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
import os

class EmbeddingGenerator:
    def __init__(self) -> None:
        pass
        # Create an OpenAI client with your deepinfra token and endpoint
        self.openai = OpenAI(
            api_key=os.getenv("DEEPINFRA_API_KEY"),
            base_url="https://api.deepinfra.com/v1/openai",
        )
    def embed_query(self, text:str):
        embedding = self.openai.embeddings.create(
        model="BAAI/bge-large-en-v1.5",
        input=text,
        encoding_format="float"
        )
        return embedding.data[0].embedding
    def embed_documents(self, texts:list):
        emb = []
        for i in range(len(texts)):
            embedding = self.embed_query(i)
            emb.append(embedding)
        return emb

if __name__ == "__main__":
    gen = EmbeddingGenerator()
    input = ["Prajwal", "loves", "sakshi"]
    embedding =  gen.embed_documents(input)
    print(len(embedding))
    print(len(embedding[0]))
    print(embedding[0])






3
1024
[0.016627227887511253, -0.013372370973229408, 0.0024965356569737196, 0.013677443377673626, 0.0020093938801437616, -0.03882081061601639, -0.0209063533693552, 0.04022735357284546, 0.01106675248593092, 0.033761464059352875, 0.025660647079348564, 0.0024868266191333532, -0.007759809959679842, 0.002239795168861747, -0.01601104810833931, 0.029302580282092094, -0.004804798401892185, -6.935988494660705e-05, -0.01914072223007679, 0.011064722202718258, -0.011854090727865696, 0.017241131514310837, -0.09745549410581589, -0.00010348513751523569, -0.03538932278752327, 0.041715819388628006, -0.021112671121954918, 0.00295909377746284, 0.0674673542380333, 0.029219962656497955, 0.0016559945652261376, -0.040265440940856934, 0.02257353439927101, -0.033009808510541916, -0.001523511717095971, -0.03798763081431389, 0.040762387216091156, -0.012441975995898247, 0.017350010573863983, -0.03681422397494316, 0.009441711939871311, -0.027741312980651855, 0.03842281550168991, -0.020189974457025528, -0.044736031

### Text Cleaning and Ingestion Pipeline

In [4]:
import re
import os
from typing import Callable, List, Tuple, Dict
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import pdfplumber
import os
import PyPDF4
load_dotenv()
os.getenv("DEEPINFRA_API_KEY")

embeddings = EmbeddingGenerator()
from datetime import datetime

def get_date():
# Get today's date
    today_date = datetime.today()

# Format today's date as a string
    return today_date.strftime('%Y-%m-%d')


def extract_metadata_from_pdf(file_path: str) -> dict:
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF4.PdfFileReader(pdf_file)  # Change this line
        if not reader.isEncrypted:
            metadata = reader.getDocumentInfo()
        else:
            metadata = {
            "title": "Title",
            "author": "Author",
            "creation_date": get_date(),
        }

        return {
            "title": metadata.get("/Title", "").strip(),
            "author": metadata.get("/Author", "").strip(),
            "creation_date": metadata.get("/CreationDate", "").strip(),
        }
def extract_pages_from_pdf(file_path: str) -> List[Tuple[int, str]]:
    """
    Extracts the text from each page of the PDF.

    :param file_path: The path to the PDF file.
    :return: A list of tuples containing the page number and the extracted text.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with pdfplumber.open(file_path) as pdf:
        pages = []
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text.strip():  # Check if extracted text is not empty
                pages.append((page_num + 1, text))
    return pages


def parse_pdf(file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    """
    Extracts the title and text from each page of the PDF.

    :param file_path: The path to the PDF file.
    :return: A tuple containing the title and a list of tuples with page numbers and extracted text.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    metadata = extract_metadata_from_pdf(file_path)
    pages = extract_pages_from_pdf(file_path)

    return pages, metadata
def parse_pdf(file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    """
    Extracts the title and text from each page of the PDF.

    :param file_path: The path to the PDF file.
    :return: A tuple containing the title and a list of tuples with page numbers and extracted text.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    metadata = extract_metadata_from_pdf(file_path)
    pages = extract_pages_from_pdf(file_path)

    return pages, metadata
def extract_metadata_from_dict(news_metadata):
        if not news_metadata: return {}

        return {
            "title": news_metadata.get("title", "").strip(),
            "author": news_metadata.get("publisher", "").strip(),
            "creation_date": get_date()
        }


def merge_hyphenated_words(text: str) -> str:
    return re.sub(r"(\w)-\n(\w)", r"\1\2", text)


def fix_newlines(text: str) -> str:
    return re.sub(r"(?<!\n)\n(?!\n)", " ", text)


def remove_multiple_newlines(text: str) -> str:
    return re.sub(r"\n{2,}", "\n", text)

def clean_text_str(text:str, cleaning_functions: List[Callable[[str], str]]):
    for cleaning_function in cleaning_functions:
            result = cleaning_function(text)

    return result
def clean_text(
    pages: List[Tuple[int, str]], cleaning_functions: List[Callable[[str], str]]
) -> List[Tuple[int, str]]:
    cleaned_pages = []
    for page_num, text in pages:
        for cleaning_function in cleaning_functions:
            text = cleaning_function(text)
        cleaned_pages.append((page_num, text))
    return cleaned_pages
def text_to_docs(text: List[str], metadata: Dict[str, str]) -> List[Document]:
    """Converts list of strings to a list of Documents with metadata."""
    doc_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=200,
        )
    for page_num, page in text:
        chunks = text_splitter.split_text(page)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={
                    "page_number": page_num,
                    "chunk": i,
                    "source": f"p{page_num}-{i}",
                    **metadata,
                },
            )
            doc_chunks.append(doc)

    return doc_chunks

def ingest_new_file(file_path, collection_name):
    # Step 1: Parse PDF
    raw_pages, metadata = parse_pdf(file_path)

    # Step 2: Create text chunks
    cleaning_functions = [
        merge_hyphenated_words,
        fix_newlines,
        remove_multiple_newlines,
    ]

    cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)
    document_chunks = text_to_docs(cleaned_text_pdf, metadata)

    # Optional: Reduce embedding cost by only using the first 23 pages
    # document_chunks = document_chunks[:10]

    # Step 3 + 4: Generate embeddings and store them in DB
    vector_store = Chroma.from_documents(
        document_chunks,
        embeddings,
        collection_name=os.getenv("default_collection_name") if collection_name is None or "" else collection_name,
        persist_directory=os.getenv("default_data_directory")
    )




### The Chat model and the Retriever

In [12]:
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.schema import HumanMessage, AIMessage
from langchain.schema.vectorstore import VectorStoreRetriever
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema.document import Document
from typing import List
from langchain_community.llms import DeepInfra
from langchain.chains import LLMChain
import math
from dotenv import load_dotenv

import os
load_dotenv()
os.environ["DEEPINFRA_API_TOKEN"]  = os.getenv("DEEPINFRA_API_KEY")



class MyVectorStoreRetriever(VectorStoreRetriever):
    # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        docs_and_similarities = (
            self.vectorstore.similarity_search_with_relevance_scores(
                query, **self.search_kwargs
            )
        )

        # Make the score part of the document metadata
        for doc, similarity in docs_and_similarities:
            doc.metadata["score"] = similarity

        docs = [doc for doc, _ in docs_and_similarities]
        return docs



class Chat:
    def __init__(self, collection_name=""):
        '''
        return ConversationalRetrievalChain.from_llm(
            model,
            #retriever=vector_store.as_retriever(),
            retriever = MyVectorStoreRetriever(
                vectorstore=vector_store,
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": 0.2, "k": 3},
            ),
            return_source_documents=True,
            # verbose=True,
        )
    '''
        self.embedding = EmbeddingGenerator()
        self.collection_name=collection_name
        self.vector_store = Chroma(
            collection_name=os.getenv("default_collection_name") if self.collection_name == "" or None else self.collection_name,
            embedding_function=self.embedding,
            persist_directory=os.getenv("default_data_directory"),
        )
        self.chain = RetrievalQA.from_chain_type(
            DeepInfra(model_id=os.getenv("CHAT_MODEL_NAME")),
            chain_type=os.getenv("chain_type"),

            retriever = MyVectorStoreRetriever(
                vectorstore=self.vector_store,
                search_type=os.getenv("search_type"),
                search_kwargs={"score_threshold": float(os.getenv("score_threshold")), "k": int(os.getenv("top_k_to_search"))},
            ),
            return_source_documents=True,
        )

    def chat(self, question, chat_history = [], collection_name=None):
        answer = None
        response = self.chain({"query": question, "history":chat_history})
        answer = response["result"]
        source = response["source_documents"]

        pgs = []
        for document in source:
            pgs.append(document.metadata['page_number'])
            #print(f"List after inserting:", pgs)

        for i in range(0, len(pgs)):
            for j in range(i+1, len(pgs)):
                #if(l[i] == l[j]):
                if(math.isclose(pgs[i], pgs[j], abs_tol = 2)):
                        pgs.insert(0, pgs[i])
        pgs = list(set(pgs))
        return answer, pgs






if __name__ == "__main__":
    load_dotenv()
    chat = Chat(collection_name="Praj")
    ingest_new_file("Data\IARC Sci Pub 163_Chapter 3.pdf", collection_name="Praj")
    chat_history = []
    question = "what are some usage of samples, Biomedical research and laboratory practices"
    answer, pgs = chat.chat(question = question, collection_name="Praj", chat_history=[] )
    chat_history.append(answer)
    print(answer, pgs)




 Samples are used in various ways in biomedical research and laboratory practices, including:

1. **Biorepositories**: Samples are stored in biorepositories for future research, such as studying the genetic basis of diseases.
2. **Disease diagnosis**: Samples are used to diagnose diseases, such as cancer, infectious diseases, and genetic disorders.
3. **Forensic analysis**: Samples are used in forensic analysis, such as DNA profiling, to aid in crime investigations.
4. **Population-based studies**: Samples are used to study the genetic and environmental factors that contribute to disease development in populations.
5. **Proteomic analysis**: Samples are used to study proteins and their functions in biological systems.
6. **Vaccine development**: Samples are used to develop and test vaccines against infectious diseases.
7. **Toxicology testing**: Samples are used to test the toxicity of chemicals and drugs.
8. **Genetic research**: Samples are used to study the genetic basis of diseases

### Example Queries

In [None]:
question_1  = "What are the freezers used for?"
answer_1  = "The freezers are used for storing biological samples. [16, 1, 23]"

In [None]:
question_2 = "where are the samples stored"
answer_2  = " According to the context, the samples are stored in liquid nitrogen freezers at -196°C, and also in dry ice, which is used as a refrigerant for shipping. [16, 11, 23, 15]"

In [None]:
question_3 = "Generally, what are the samples about"
answer_3 = """
The samples appear to be related to various topics, including:

1. Biomedical research and laboratory practices (e.g., specimen collection, storage, and processing).
2. Business and finance (e.g., revenue, expenses, and profitability).
3. Telecommunications and technology (e.g., wireless and broadband services).
4. Occupational safety and health regulations (e.g., OSHA guidelines).

It's challenging to pinpoint a single, overarching theme, as the samples seem to cover diverse subjects. [16, 24, 19, 23]
"""

In [None]:
question_4 = "what are some usage of samples, Biomedical research and laboratory practices"
answer_5 = """
In biomedical research and laboratory practices, samples are used for various purposes, including:

1. **Disease diagnosis and monitoring**: Biological samples, such as blood, tissue, or urine, are used to diagnose and monitor diseases, including cancer, infectious diseases, and genetic disorders.
2. **Research nd development**: Samples are used to develop new treatments, vaccines, and diagnostic tools, and to understand the underlying biology of diseases.
3. **Quality control and assurance**: Samples are used to ensure the quality of laboratory tests, reagents, and equipment.
4. **Toxicology and pharmacology studies**: Samples are used to study the effects of drugs, chemicals, and other substances on living organisms.
5. **Forensic analysis**: Biological samples, such as DNA, are used in forensic science to aid in the investigation of crimes.
6. **Population-based studies**: Samples are used to study the health and disease patterns of populations, and to identify risk factors and trends.
7. **Biobanking**: Samples are stored in biobanks for future research, enabling the study of diseases and the development of new treatments over time.

These are just a few examples of the many uses of samples in biomedical research and laboratory practices. [16, 17, 20, 23, 24]
"""

In [None]:
question_5 = "What are the parameters under which the samples are stored?"
answer_5 = "Cryoprotectants that are less toxic have been developed to allow for preservation of RNA and DNA in blood and other specimens. Whole blood may also be cryopreserved in large-scale epidemiological studies. Saliva or blood collected on treated paper cards is available for laboratory applications. Enough DNA can be obtained from a 2mm punch of a paper card for about 500 reactions. [9, 3, 12]"

In [None]:
question_6 = "List down some type of specimens"
answer_6 = """Based on the provided context, some types of specimens mentioned are:

* Blood
* Urine
* Saliva

Please note that these are the only types of specimens explicitly mentioned in the provided context. There may be other types of specimens not mentioned here. [3]"""

In [None]:
question_7 = "List down all the types and definitions of specimen you know"
answer_7 = """ Here are some types and definitions of specimens:

1. **Biological specimen**: A sample of tissue, blood, or other bodily fluid taken from a living organism for diagnostic, therapeutic, or research purposes.
2. **Formalin-fixed paraffin-embedded (FFPE) specimen**: A tissue sample that has been preserved in formalin and embedded in paraffin wax to create a solid block that can be sectioned and stained for histological examination.
3. **Frozen specimen**: A tissue or cell sample that has been frozen to preserve its molecular structure and function, often used for DNA, RNA, or protein analysis.
4. **Fresh specimen**: A tissue or cell sample that has not been fixed or preserved, often used for immediate analysis or processing.
5. **Cryopreserved specimen**: A tissue or cell sample that has been frozen at very low temperatures (typically -196°C) to preserve its molecular structure and function, often used for long-term storage.
6. **Chemically fixed specimen**: A tissue or cell sample that has been treated with a chemical fixative to preserve its structure and prevent degradation, often used for histological examination.
7. **Paraffin-embedded specimen**: A tissue sample that has been embedded in paraffin wax to create a solid block that can be sectioned and stained for histological examination.
8. **Whole blood specimen**: A sample of whole blood collected from a patient or donor for diagnostic or therapeutic purposes.
9. **Serum specimen**: A sample of blood serum (the liquid portion of blood) collected from a patient or donor for diagnostic or therapeutic purposes.
10. **Urine specimen**: A sample of urine collected from a patient or donor for diagnostic or therapeutic purposes.
11. **Tissue microarray (TMA) specimen**: A collection of tissue samples arranged in a grid pattern on a single slide, often used for high-throughput analysis of multiple tissue samples.
12. **Cell block specimen**: A sample of cells that have been aggregated and embedded in a paraffin block, often used for histological examination.
13. **Cytology specimen**: A sample of cells collected from a patient or donor for diagnostic purposes, often used for cytological examination.
14. **Histology specimen**: A tissue sample that has been processed and stained for histological examination.
15. **Molecular specimen**: A sample of tissue, blood, or other bodily fluid collected for molecular analysis, such as DNA, RNA, or protein analysis.

These are just a few examples of the [24, 17, 26, 19]"""

In [None]:
question_8 = "What are some ways to collect urine?"
answer_8 = """Here are some ways to collect urine:

1. **Clean Catch Midstream Urine Collection**: This is the most common method. The patient cleans their genital area, starts urinating, and then collects a sample in a sterile container.
2. **Catheterization**: A healthcare professional inserts a catheter into the bladder to collect a urine sample. This method is often used for patients who cannot urinate on their own or have a urinary tract infection.
3. **Urine Bag Collection**: A sterile urine collection bag is attached to the genital area to collect urine. This method is often used for infants or young children.
4. **Suprapubic Aspiration**: A healthcare professional uses a needle to collect a urine sample directly from the bladder. This method is often used for patients with urinary retention or bladder dysfunction.
5. **Urinary Catheter with a Collection Bag**: A urinary catheter is inserted into the bladder, and a collection bag is attached to collect urine. This method is often used for patients who require continuous urine drainage.

It's essential to follow proper techniques and guidelines when collecting urine to ensure the sample is sterile and accurate for laboratory testing. [18, 19, 13]"""