[Building a PDF Knowledge Bot With Open-Source LLMs](https://www.shakudo.io/blog/build-pdf-bot-open-source-llms)



In [None]:
# Install the notebook's dependencies into the running kernel's environment.
%pip install -q -U pypdf accelerate llama-index sentence_transformers transformers einops langchain bitsandbytes chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.7 MB

# Step 0: Loading LLM Embedding Models and Generative Models

In [None]:
# Model identifiers for the embedding models this notebook can reference.
EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl"
# Passed as `model_name` to HuggingFaceEmbeddings by create_sbert_mpnet().
EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2"

In [None]:
# Model identifiers for the generative LLMs this notebook can reference.
# The value stored under config["llm"] is compared against these names.
LLM_FLAN_T5_XXL = "google/flan-t5-xxl"
LLM_FLAN_T5_XL = "google/flan-t5-xl"
LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0"
LLM_FLAN_T5_SMALL = "google/flan-t5-small"
LLM_FLAN_T5_BASE = "google/flan-t5-base"
LLM_FLAN_T5_LARGE = "google/flan-t5-large"
LLM_FALCON_SMALL = "tiiuae/falcon-7b-instruct"

In [None]:
# Notebook-wide settings read by the cells below.
config = {
    # Directory handed to Chroma for persistence; None means none is given.
    "persist_directory": None,
    # Forwarded to the LLM factory as its `load_in_8bit` argument.
    "load_in_8bit": False,
    # Which embedding model and which generative model to build.
    "embedding": EMB_SBERT_MPNET_BASE,
    "llm": LLM_FLAN_T5_BASE,
}

Model creation is driven by the configuration above: `create_sbert_mpnet()` builds the embedding model and `create_flan_t5_base()` builds the generative model.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

def create_sbert_mpnet():
    """Build the SBERT MPNet embedding model as a LangChain embeddings object.

    Returns:
        A ``HuggingFaceEmbeddings`` for ``EMB_SBERT_MPNET_BASE`` whose
        ``device`` model kwarg is ``"cuda"`` when ``torch.cuda.is_available()``
        is true and ``"cpu"`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name=EMB_SBERT_MPNET_BASE,
        model_kwargs={"device": target_device},
    )


def create_flan_t5_base(load_in_8bit=False):
    """Create a Hugging Face ``text2text-generation`` pipeline for FLAN-T5 base.

    The returned pipeline is meant to be wrapped for use with LangChain.

    Args:
        load_in_8bit: Forwarded inside ``model_kwargs`` as ``load_in_8bit``.

    Returns:
        The ``transformers`` pipeline built for ``LLM_FLAN_T5_BASE``.
    """
    # Use the shared constant rather than a second copy of the model ID, so the
    # factory cannot drift from the value the config is compared against.
    model = LLM_FLAN_T5_BASE
    tokenizer = AutoTokenizer.from_pretrained(model)
    # NOTE(review): max_length / temperature are supplied through model_kwargs
    # rather than as generation arguments -- confirm they take effect as intended.
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.,
        },
    )



# Build the embedding model named in the config. Fail here with a clear message
# instead of leaving `embedding` unbound for a later cell to trip over.
if config["embedding"] == EMB_SBERT_MPNET_BASE:
    embedding = create_sbert_mpnet()
else:
    raise ValueError(
        f"Unsupported embedding model {config['embedding']!r}; "
        f"this cell only knows how to build {EMB_SBERT_MPNET_BASE!r}."
    )

# Build the generative model named in the config, with the same fail-fast rule.
load_in_8bit = config["load_in_8bit"]
if config["llm"] == LLM_FLAN_T5_BASE:
    llm = create_flan_t5_base(load_in_8bit=load_in_8bit)
else:
    raise ValueError(
        f"Unsupported LLM {config['llm']!r}; "
        f"this cell only knows how to build {LLM_FLAN_T5_BASE!r}."
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def create_falcon_instruct_small(load_in_8bit=False):
    """Create a Hugging Face ``text-generation`` pipeline for Falcon-7B-Instruct.

    Args:
        load_in_8bit: Forwarded inside ``model_kwargs`` as ``load_in_8bit``.

    Returns:
        The ``transformers`` pipeline built for ``LLM_FALCON_SMALL``.
    """
    # Use the shared constant rather than a second copy of the model ID.
    model = LLM_FALCON_SMALL

    tokenizer = AutoTokenizer.from_pretrained(model)
    # NOTE(review): max_length / temperature are supplied through model_kwargs
    # rather than as generation arguments -- confirm they take effect as intended.
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        trust_remote_code=True,
        max_new_tokens=100,
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )

# Step 1: Ingesting the Data into Vector Store (ChromaDB)

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.vectorstores import Chroma

# Load the source document. Despite the `pdf_path` name, the file is a plain-text
# file and is read with TextLoader.
pdf_path = "/content/sample_doctors.txt"
loader = TextLoader(pdf_path)
documents = loader.load()

# Split documents and create text snippets.
# Pass 1: character-based split (chunk_size=100, no overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Pass 2: token-based split applied to the pass-1 snippets; `text_splitter` and
# `texts` are rebound here.
# NOTE(review): the pass-1 snippets are already small, so this pass presumably
# leaves them unchanged -- confirm whether it is still needed.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")  # This is the encoding for text-embedding-ada-002
texts = text_splitter.split_documents(texts)

# Embed the snippets with `embedding` and index them in Chroma, passing along
# the persist directory taken from the config.
persist_directory = config["persist_directory"]
vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)

# Step 2: Retrieving Snippets and Prompt Engineering

In [None]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Wrap the transformers pipeline for LangChain, then assemble a "stuff"
# RetrievalQA chain whose retriever is configured with k=4.
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
)

# The small, base and large FLAN-T5 models get a terse
# context/question/answer prompt in place of the chain's existing one.
if config["llm"] in (LLM_FLAN_T5_SMALL, LLM_FLAN_T5_BASE, LLM_FLAN_T5_LARGE):
    question_t5_template = """
    context: {context}
    question: {question}
    answer:
    """
    QUESTION_T5_PROMPT = PromptTemplate(
        input_variables=["context", "question"],
        template=question_t5_template,
    )
    qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT

# Step 3: Querying the LLM

In [None]:
%%time
# Turn on verbose tracing for the document-combining chain and ask for the
# retrieved snippets to be returned alongside the answer.
qa.return_source_documents = True
qa.combine_documents_chain.verbose = True

question = "List the name of doctors that Rohan can visit for his pain in chest?"
qa({"query": question})



[1m> Entering new StuffDocumentsChain chain...[0m





[1m> Finished chain.[0m
CPU times: user 8.72 s, sys: 17.5 ms, total: 8.74 s
Wall time: 8.84 s


{'query': 'List the name of doctors that Rohan can visit for his pain in chest?',
 'result': 'Dr. Rahul Khanna,Orthopaedics (Bone Specialist) Dr. Meena Reddy,Cardiology (Heart Specialist) Dr. Nandini Sharma,Orthopaedics (Bone Specialist) Dr. Ashok Patel,Cardiology (Heart Specialist) Dr. Alok Patel,Orthopaedics (Bone Specialist) Dr. Shreya Gupt',
 'source_documents': [Document(page_content='Dr. Rahul Khanna,Orthopaedics (Bone Specialist)\nDr. Meena Reddy,Cardiology (Heart Specialist)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Nandini Sharma,Orthopaedics (Bone Specialist)\nDr. Ashok Patel,Cardiology (Heart Specialist)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Alok Patel,Orthopaedics (Bone Specialist)\nDr. Shreya Gupta,Cardiology (Heart Specialist)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Priya Sharma,Cardiology (Heart Specialist)\nDr. Rajesh Patel,Neurology (Neuro

In [None]:
%%time
# A request unrelated to the indexed doctor list, run with the same settings as
# the previous query (verbose tracing on, source snippets returned).
question = "Write a letter to the principal of the college asking for 2 days leave"
qa.combine_documents_chain.verbose = True
qa.return_source_documents = True
qa({"query":question,})



[1m> Entering new StuffDocumentsChain chain...[0m





[1m> Finished chain.[0m
CPU times: user 2.25 s, sys: 2.49 ms, total: 2.26 s
Wall time: 2.77 s


{'query': 'Write a letter to the principal of the college asking for 2 days leave',
 'result': 'i want to leave the college',
 'source_documents': [Document(page_content='Name,Category\nDr. Bharat Murthy,Orthopaedics (Bone Specialist)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Sonali Deshmukh,Pediatrics (Pediatrician)\nDr. Rajiv Kumar,Gynecology (Gynecologist)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Sunita Joshi,ENT (Ear, Nose, and Throat Specialist)\nDr. Rahul Agarwal,Pediatrics (Pediatrician)', metadata={'source': '/content/sample_doctors.txt'}),
  Document(page_content='Dr. Nisha Verma,Pediatrics (Pediatrician)\nDr. Anjali Gupta,Gynecology (Gynecologist)', metadata={'source': '/content/sample_doctors.txt'})]}