In [None]:
#Overview
#
#A simple and standart pipeline for RAG is
#Question --> Indexing --> Retrieval --> Generation --> Response
#Indexing is the process of creating a index of the documents. This is done by encoding
#the documents into a vector space. In tutorial it used OpenAI's embedding model.
#Here I'm going to use ollama to run locally without an API key.
#
#
#Documents are going to be about chess opening moves.
#Why Chess? Because my knowledge brain has been roting and my elo decaied from 1700 to 1300
#And I have a tournament in 2 weeks :p
#And, the document is kind of difficult to read
#
#Steps here are
#Load pdf; read it; embed and store
#Load questing; ebed;
#Retrieve relevant part from pdf

In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# os.environ["LANGCHAIN_API_KEY"] = xxxx

import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_docling import DoclingLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores.utils import filter_complex_metadata 

import glob
from langchain.prompts import ChatPromptTemplate


In [2]:
question = "What is opening theory? Provide section from the document."
doc_path = "/home/sersasj/rag-exploration/docs_to_index/FCO_Fundamental_Chess_Openings-1-10.pdf"

In [3]:
loader = DoclingLoader(file_path=doc_path)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

chunks = filter_complex_metadata(chunks) 



The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.
The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.
Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [4]:
from langchain_community.vectorstores import Chroma

embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

retriever = vectorstore.as_retriever()

print(f"Number of documents in vectorstore: {vectorstore._collection.count()}")


Number of documents in vectorstore: 27


In [5]:
docs = retriever.invoke(question)
print(len(docs))
print(docs)

4
[Document(metadata={'source': '/home/sersasj/rag-exploration/docs_to_index/FCO_Fundamental_Chess_Openings-1-10.pdf'}, page_content='Introduction\nIn the colossal body of  chess literature, no aspect of the game has been treated as extensively as th openings.  In varying  degrees  of expertise,  clarity  and depth,  thousands  of books  discuss  ever imaginable and unimaginable opening the game of chess has to  offer.  This is a process that wi never stop. As long as a particular opening is being played, its variations will be worked out deepe and deeper and assessments will be modified on the basis of these new  experiences. As long a chess is alive, its opening theory will also be alive and new books will be needed to document all o this new life.'), Document(metadata={'source': '/home/sersasj/rag-exploration/docs_to_index/FCO_Fundamental_Chess_Openings-1-10.pdf'}, page_content='What is  Opening Theory?\nEveryone who devotes even the tiniest amount of thought to his first move not o

In [6]:


template = """ 
Awnser the question based on the context provided.

Context: {context}
Question: {question}


"""

prompt = ChatPromptTemplate.from_template(template)



In [7]:
llm = ChatOllama(model="llama3")


In [8]:
chain = prompt | llm

In [9]:
chain.invoke({"context":docs,"question":question})

AIMessage(content='According to the provided context, the answer to the question "What is Opening Theory?" can be found in the second document.\n\nHere\'s the relevant section:\n\n"What is  Opening Theory?\\nEveryone who devotes even the tiniest amount of thought to his first move not only makes a sta with that particular game but also with the development of opening theory. From that moment o every new game will confront him with the starting position again and therefore with his earli thoughts on it. Also he will sooner or later find out that millions of other players have pondered ex actly the same problems and, whether he wants to or not, he will to some extent start comparing hi own ideas about how to start a game with theirs."', additional_kwargs={}, response_metadata={'model': 'llama3', 'created_at': '2025-10-15T19:46:10.135363966Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1718460988, 'load_duration': 13537675, 'prompt_eval_count': 656, 'prompt_eval_duration': 135

In [13]:


from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke(question)



'According to the provided context, the question "What is Opening Theory?" is answered in the document with the following content:\n\n"Everyone who devotes even the tiniest amount of thought to his first move not only makes a start with that particular game but also with the development of opening theory. From that moment on every new game will confront him with the starting position again and therefore with his early thoughts on it. Also he will sooner or later find out that millions of other players have pondered exactly the same problems and, whether he wants to or not, he will to some extent start comparing his own ideas about how to start a game with theirs."\n\n(Source: Document(metadata={\'source\': \'/home/sersasj/rag-exploration/docs_to_index/FCO_Fundamental_Chess_Openings-1-10.pdf\'}, page_content=\'What is  Opening Theory?\\n...\')'