In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [None]:
# PDF to ingest; the relative path is resolved against the notebook's working directory.
pdf_path = "paper.pdf"
loader = PyMuPDFLoader(pdf_path)

In [None]:
# Run the loader; the returned collection is inspected in the cells that follow.
documents = loader.load()

In [None]:
# Number of Document objects returned by the loader (one per PDF page).
len(documents)

In [None]:
# Every entry of `documents` carries two attributes: `metadata` and `page_content`.
# Show the metadata of the first entry.
documents[0].metadata

In [None]:
# Text content of the first loaded Document.
documents[0].page_content

In [None]:
# Splitting the documents into chunks
# Cut the loaded documents into smaller pieces for embedding.
# chunk_size caps the length of each piece; chunk_overlap is how much of the
# previous piece is repeated at the start of the next one.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
chunks = splitter.split_documents(documents)

In [None]:
# Number of chunks the splitter produced.
len(chunks)

In [None]:
# Text of the first chunk.
chunks[0].page_content

In [None]:
chunks[1].page_content
# Consecutive chunks (chunks[n] and chunks[n + 1]) share some overlapping text.
# The overlap is needed to preserve context across chunk boundaries; it improves
# the model's understanding of the text.

In [None]:
# Embedding model used to vectorise chunks and queries; chosen because it is CPU-friendly.
embedder = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Embed every chunk with `embedder` and store the result in a Chroma database
# kept under ./chroma_db.
# NOTE(review): re-running this cell against an already populated ./chroma_db
# presumably adds the same chunks a second time — confirm, and clear the
# directory before re-running if duplicate search hits show up.
db = Chroma.from_documents(
    chunks,
    embedding=embedder,
    persist_directory="./chroma_db",
)

In [None]:
# Re-open the store from ./chroma_db with the same embedder that was used to
# build it, so the query is embedded the same way as the stored chunks.
db = Chroma(persist_directory="./chroma_db", embedding_function=embedder)

# Text to search for in the indexed paper.
question = "Author names"

# Retrieve the 5 closest chunks as (Document, score) pairs.
# The search must be given `question`, the variable defined just above; no
# variable named `query` exists in this notebook, so using it raises NameError
# on a fresh kernel.
# NOTE(review): the score looks like a distance (lower = closer match) for
# Chroma — confirm against the vector store documentation.
results = db.similarity_search_with_score(question, k=5)

In [None]:
# Raw search output: one entry per hit, each holding the matched Document and its score.
results

In [None]:
# Pretty-print each hit: its rank, its score, then the text of the matched chunk.
for i, hit in enumerate(results):
    doc, score = hit[0], hit[1]  # each hit is indexed as [0] = Document, [1] = score
    print(f'result n°{i} :')
    print(f'score {score}:')
    print(f'{doc.page_content} \n')