In [8]:
from langchain_community.document_loaders import DirectoryLoader

# Recursively match every PDF below the books/ directory.
books_dir = "books"
pdf_pattern = "**/*.pdf"

loader = DirectoryLoader(books_dir, glob=pdf_pattern)
books = loader.load()

# Display how many documents the loader returned.
len(books)

2

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings: target 500-character chunks with no overlap between
# neighbouring chunks.
splitter_options = {"chunk_size": 500, "chunk_overlap": 0}

text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(books)

In [10]:
from pathlib import Path

from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# Embedding the whole corpus is slow (about 16 minutes in the recorded run), and
# Chroma.from_documents adds the chunks to whatever collection already lives in
# persist_directory. Re-running this cell unguarded would therefore repeat the
# embedding work and store every chunk a second time, so the index is only
# built when no persisted store exists yet.
# NOTE(review): llama3 is a generative model; a dedicated embedding model may
# retrieve more relevant chunks — confirm before changing, since the persisted
# index must be rebuilt with whichever model is used for queries.
persist_directory = "./chroma_db"
embeddings = OllamaEmbeddings(model="llama3", show_progress=True)

persist_path = Path(persist_directory)
if persist_path.is_dir() and any(persist_path.iterdir()):
    # Reopen the existing index. Delete ./chroma_db to force a rebuild after
    # the books or the splitter settings change.
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
else:
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

OllamaEmbeddings:   0%|          | 0/100 [00:00<?, ?it/s]

OllamaEmbeddings: 100%|██████████| 100/100 [15:55<00:00,  9.55s/it]


In [15]:
# Check retrieval on its own: fetch the stored chunks closest to the question
# and display them.
question = "Who is Tom Sawyer?"
docs = vectorstore.similarity_search(query=question)
docs

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


[Document(metadata={'source': 'books/level_1_-_The_Adventures_of_Tom_Sawyer_-_Penguin_Readers-min.pdf'}, page_content='“The money isn’t in Injun Joe’s house,” Tom said. “It’s in the cave! I know, because Injun Joe was there. Let’s get it!”\n\nHuck was afraid. “But maybe we can’t find it.” “I can find it again,” Tom said. “I know about a small door at the back of the cave. Becky and I came out there. We can go in that door, and I can find Injun Joe’s treasure.”'),
 Document(metadata={'source': 'books/D_Strange-Girl_Meets_Boy_Penguin_Readers-1-min.pdf'}, page_content='2 Write 100 words about one of your holidays. Where were you? Who was with you? What did you do on the first day or two? Did you make new friends?\n\nDonna sees Mark on the boat to Spain. She likes him and he likes her. Then Mark sees Donna dancing with his brother, Dave . .. Is Donna in love with Mark? W hy is Dave dancing with her?'),
 Document(metadata={'source': 'books/level_1_-_The_Adventures_of_Tom_Sawyer_-_Penguin_Re

In [16]:
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Retriever view over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

# Ollama model that generates the final answer.
ollama_model = "llama3"
llm = Ollama(model=ollama_model)


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in order, separated by blank lines.
    """
    passages = [document.page_content for document in docs]
    return "\n\n".join(passages)


# Prompt template pulled from the LangChain hub.
rag_prompt = hub.pull("rlm/rag-prompt")

# The incoming question feeds both branches: one retrieves and formats the
# context, the other passes the question through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> model -> plain string answer
qa_chain = chain_inputs | rag_prompt | llm | StrOutputParser()



In [18]:
# Ask the RAG chain a question and display the generated answer.
question = "Who is Tom Sawyer?"
answer = qa_chain.invoke(question)
answer


OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


"Based on the provided context, Tom Sawyer is the main character in the story. He's a young boy who loves adventure and often gets into trouble. He's known for his mischievous behavior and his ability to come up with creative ideas.\n\nI don't have any information about my holiday or making new friends because I'm just an assistant for question-answering tasks, and this context doesn't provide that information.\n\nIs Donna in love with Mark? It seems like she might be, but it's not clear from the provided context."

In [20]:
# Ask a second question through the same chain and display the answer.
question = "Where does Tom go?"
answer = qa_chain.invoke(question)
answer

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


"Based on the provided context, I don't know where Tom goes as this information is not explicitly mentioned. However, according to the story, Tom and Becky got lost in the cave and walked for some time without finding the door. Tom then told Becky to wait while he went to look for the door."