<a href="https://colab.research.google.com/github/Omkar-Rajkumar-Khade/100-days-of-machine-learning_tutorials/blob/main/Chatbot_Using_T5_Langchain_ChromaDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install langchain tiktoken chromadb pypdf transformers InstructorEmbedding
!pip -q install accelerate bitsandbytes

In [2]:
# Print the installed langchain package metadata (version, location, dependencies).
!pip show langchain

Name: langchain
Version: 0.0.267
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, langsmith, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


## Load and process multiple documents

In [3]:
!pip install gdown



In [4]:
!gdown --id 1DcD9x5UjNWkm2EnTzn_PBCI1-WsvDhjb -O documents.zip
!unzip -q documents.zip -d documents

Downloading...
From: https://drive.google.com/uc?id=1DcD9x5UjNWkm2EnTzn_PBCI1-WsvDhjb
To: /content/documents.zip
100% 1.47M/1.47M [00:00<00:00, 165MB/s]


In [4]:
import torch
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# The tokenizer and the seq2seq weights are both pulled from this checkpoint.
checkpoint = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the weights in 8-bit and let `device_map='auto'` decide placement.
# NOTE(review): 8-bit loading presumably needs a CUDA GPU plus the
# bitsandbytes/accelerate packages installed earlier -- confirm on CPU runtimes.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    load_in_8bit=True,
    device_map='auto',
)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Wrap the already-loaded model and tokenizer in a text2text-generation pipeline.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    # Deterministic (greedy) decoding, stated explicitly. `temperature` and
    # `top_p` are sampling-only settings (and a temperature of 0 is not a valid
    # value), so they are not passed when sampling is off.
    do_sample=False,
    repetition_penalty=1.15,
)

# Expose the pipeline through LangChain's LLM interface so chains can call it.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [6]:
# Smoke test: query the bare model with no retrieved context.
# The recorded answer ("chennai") is wrong -- the capital is New Delhi -- which
# shows why the answers below are grounded in retrieved documents instead.
print(local_llm('What is the capital of India?'))

chennai


In [7]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m698.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125924 sha256=50f177d90496df4113f220213b953a0187c45cb78045823f93606f219b7f1f5c
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_t

## Setting up LangChain

In [8]:
import os

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [9]:
# Load every PDF found directly inside the extracted archive folder.
# Each file is parsed with PyPDFLoader (presumably yielding one Document per
# page -- confirm against the loader's documentation).
pdf_dir = './documents/documents/documents/'
loader = DirectoryLoader(
    pdf_dir,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)

documents = loader.load()

In [10]:
# Number of Document objects returned by the directory loader.
len(documents)

413

In [11]:
# Split the loaded documents into chunks of at most 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [12]:
# This cell used to repeat the previous cell's split verbatim, recomputing
# `text_splitter` and `texts` with identical settings. The redundant work is
# replaced by a sanity check on the result of that split.
assert texts, "text splitting produced no chunks"

In [13]:
# Number of chunks produced by the text splitter.
len(texts)

1112

## HF Instructor Embeddings

In [14]:

import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Run the embedding model on the GPU when one is visible, and fall back to the
# CPU otherwise instead of failing on a hard-coded "cuda" device.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": embedding_device},
)

Downloading (…)62736/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)15e6562736/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)e6562736/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)62736/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading (…)6562736/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


## Create the DB

In [15]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## Here is the new embeddings being used
embedding = instructor_embeddings

if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A store already exists on disk: reopen it rather than embedding and
    # inserting the same chunks a second time, which would leave duplicate
    # entries in the collection on every re-run of this cell.
    # Delete the `db` directory to force a rebuild (e.g. after changing the
    # documents, the chunking, or the embedding model).
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    # First run: embed every chunk and write the collection to disk.
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

## Make a retriever

In [16]:
# Expose the vector store as a retriever; k=3 requests three chunks per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

## Make a chain

In [17]:
# Build the question-answering chain: the retriever supplies context chunks and
# the local flan-t5 pipeline generates the answer. The retrieved documents are
# returned alongside the answer so their sources can be printed.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [18]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Each newline-separated line is wrapped independently with
    ``textwrap.fill`` and the results are re-joined with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each retrieved document.

    Args:
        llm_response: Mapping with a 'result' string and a "source_documents"
            iterable whose items expose ``metadata['source']``.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))

    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])

In [19]:
# Full example: run a question through the chain, then print the wrapped answer
# and the source file of each retrieved chunk.
# NOTE(review): the recorded run warns that the prompt was 597 tokens against
# the tokenizer's 512-token limit; consider a smaller `k` or `chunk_size`.
query = "What is full form of DPM2009?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors


Defence Procurement Manual, 2009


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [20]:

# Ask what the manual's definition of "Procurement" covers.
query = "What is the scope of the term Procurement in the manual?"
llm_response = qa_chain(query)
process_llm_response(llm_response)


acquiring all types of goods (both scaled and non-scaled), such as equipment, stores, spares, technical
literature, etc., as well as all types of se rvices, including packing, unpacking, preservation,
transportation, insurance, delivery, special services, leasing, technical assessment, consultancy, systems
study, software development, maintenance, updates, conservancy, etc.


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [21]:
# Ask which organizations are bound by the manual's procedures.
query = "Which organizations are required to follow the procedures laid down in the manual?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

Ministry of Defence, Service Headquarters and all subordinate authorities in the Comm and Headquarters, lower
formations, establishments and units thereunder at all levels


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [22]:
# Ask for the definition of "Indent".
query = "What does the term Indent mean in the context of procurement?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

a requisition placed by the provisioning authority on the procurement agency to procure an item


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [23]:

# Two-part question (short title + commencement date).
# NOTE(review): the recorded answer contains only the title, not the date.
query = "What is the short title of the manual and when did it come into force?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

Defence Procurement Manual, 2009


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [24]:

# Two-part question (what the Fall Clause is + its effect on pricing).
# NOTE(review): the recorded answer defines the clause but does not describe its pricing effect.
query = "What is the Fall Clause in the contract, and how does it affect pricing?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

a price safety mechanism in rate contracts


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [25]:

# Ask for the definition of "Competent Financial Authority".
query = "What does the term Competent Financial Authority refer to??"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

an authority duly empowe red by the Government of India to sanction and approve expenditure from public
accounts upto a specified limit in terms of amount of such expend iture and subject to availability of funds


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf


In [26]:

# Rephrasing of the earlier "Indent" question ("refer to" instead of "mean");
# the recorded answer is identical to that earlier run.
query = "What does the term Indent refer to in the context of procurement?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# Tip: evaluate `llm_response` on its own to inspect the raw result and retrieved documents.

a requisition placed by the provisioning authority on the procurement agency to procure an item


Sources:
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
documents/documents/documents/DPM2009.pdf
