### Import dependencies & initialise environment variables

In [1]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Read key/value pairs from a local .env file (if one is found) into the
# process environment so the os.getenv(...) lookups in the next cell can
# resolve them.
load_dotenv()

True

In [2]:
# Notebook configuration, read from the environment in a single pass.
# Any variable that is not set resolves to None.
PDF_DIRECTORY, DB_DIRECTORY, COLLECTION_NAME, EMBEDDING_MODEL = (
    os.environ.get(setting_name)
    for setting_name in (
        "PDF_DIRECTORY",    # folder containing the source PDFs
        "DB_DIRECTORY",     # on-disk location of the Chroma store
        "COLLECTION_NAME",  # Chroma collection that holds the chunks
        "EMBEDDING_MODEL",  # HuggingFace model name used for embeddings
    )
)

### Initialising the client objects

In [4]:
# Splitter settings, named so they can be tuned in one place.
CHUNK_SIZE = 1000     # upper bound on the size of each chunk
CHUNK_OVERLAP = 300   # amount of text shared between neighbouring chunks

# Embedding model used to vectorise both the document chunks and the queries.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Loader that reads the PDFs found under PDF_DIRECTORY.
pdf_directory_loader = PyPDFDirectoryLoader(path=PDF_DIRECTORY)

# Splitter that breaks the loaded documents into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

# Chroma handle on the persisted store. It is bound explicitly to the target
# collection and embedding model; without these arguments the handle would
# point at the library's default collection instead of COLLECTION_NAME.
chroma_client = Chroma(
    persist_directory=DB_DIRECTORY,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# Load every PDF found in the configured directory.
# NOTE(review): the `page` / `total_pages` metadata shown in the search output
# further down suggests the loader emits one Document per PDF page, so this
# count is probably pages rather than files -- confirm.
docs = pdf_directory_loader.load()
print("Number of docs loaded from the directory", len(docs))

# Fail here, with a clear message, instead of letting an empty list reach the
# vector store when PDF_DIRECTORY is wrong or contains no PDFs.
if not docs:
    raise ValueError(
        f"No documents were loaded from PDF_DIRECTORY={PDF_DIRECTORY!r}"
    )

# Split the loaded documents into smaller chunks for embedding.
doc_splits = text_splitter.split_documents(docs)
print("Number of documents after splitting", len(doc_splits))

Number of docs loaded from the directory 350
Number of documents after splitting 742


In [6]:
# Embed the document chunks and add them to the Chroma vector store.
# `from_documents` builds its own store from the arguments given here, so it
# is called on the class rather than through an existing instance whose
# settings would not be used.
# NOTE(review): the ids in the search output below look auto-generated
# (UUIDs), so re-running this cell likely appends duplicate chunks to the
# collection -- confirm before re-indexing.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    persist_directory=DB_DIRECTORY,
)

### Test the results of vector-search

In [3]:
# Re-open the persisted collection so this section also works on a fresh
# kernel, without re-running the ingestion cells.
# The query must be embedded with the same model that built the index, so the
# model name comes from EMBEDDING_MODEL instead of a hard-coded string.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
vectorstore = Chroma(
    persist_directory=DB_DIRECTORY,
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
)

# Smoke test: displays (Document, score) pairs for the query.
vectorstore.similarity_search_with_score("How to start Ford Figo?")

  from .autonotebook import tqdm as notebook_tqdm





[(Document(id='5ec4cc9b-4373-4b38-8972-319933b0e83b', metadata={'creationdate': '2015-06-12T10:48:16+00:00', 'producer': 'XEP 4.16 build 20090723', 'page_label': '1', 'author': 'Unknown', 'page': 0, 'total_pages': 245, 'title': 'Untitled', 'creator': 'Unknown', 'trapped': '/False', 'moddate': '2015-09-08T17:52:47+05:30', 'source': '..\\data\\documents\\Ford-figo-owners-manual.pdf'}, page_content="FORD FIGO Owner's Manual"),
  0.5034269094467163),
 (Document(id='25eeca33-b6ef-4230-bce2-210ba2e4605d', metadata={'total_pages': 245, 'trapped': '/False', 'title': 'Untitled', 'producer': 'XEP 4.16 build 20090723', 'page_label': '79', 'source': '..\\data\\documents\\Ford-figo-owners-manual.pdf', 'creator': 'Unknown', 'moddate': '2015-09-08T17:52:47+05:30', 'author': 'Unknown', 'page': 81, 'creationdate': '2015-06-12T10:48:16+00:00'}, page_content='1. Fully depress the clutch pedal.\n2. Start the engine.\nVehicles with Automatic Transmission\nNote: Do not touch the accelerator pedal.\n1. Depre

In [5]:
# Retriever over the vector store using the "mmr" search type: it is asked to
# consider 50 candidates (fetch_k) and return 5 of them (k).
mmr_search_kwargs = {"k": 5, "fetch_k": 50}
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

# Run a sample question through the retriever.
documents = retriever.invoke("How to start Hunter 350?")

In [6]:
# Keep only the text of each retrieved document and display it.
data = list(map(lambda retrieved: retrieved.page_content, documents))
data

['Royal Enfield Hunter 35052\nSTARTING\nCAUTION\n\x84 In case the malfunction \nindicator does not \nturn “OFF”, get the \nmotorcycle checked \nthrough an Royal Enfield \nAuthorised Service \nCentre for rectification.\n\x84 Disengage clutch by \npulling in the clutch \nlever and hold it in \ndepressed condition.\n\x84 Push and hold electric \nstart switch until engine \nstarts for a maximum of \n5 s.\n\x84 Check the fuel level indicator in the cluster for \nadequate fuel in the fuel tank. In case the last bar \nis blinking continuously, it indicates low fuel level in \nthe tank. Please re-fuel immediately.',
 'Royal Enfield Hunter 350 17\nTECHNICAL SPECIFICATIONS\nENGINE\nEngine type .........................   Single cylinder,4 stroke, \nSOHC\nBore .....................................  72 mm\nStroke ..................................  85.8 mm\nEngine capacity (cc) ............  349.34 cc\nCompression ratio ...............  9.5:1\nMax power ..........................   20.2 bhp @ 6100