In [11]:
!pip install --upgrade --quiet \
    langchain langchain-community langchain-openai chromadb \
    pypdf streamlit python-dotenv \
    "pandas==2.2.2"


In [12]:
from google.colab import files

# Open the Colab upload widget; the returned mapping's keys are used below
# as the path of the PDF to load.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as a bare IndexError.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF.")

# Use the first uploaded file as the document to index.
pdf_path = next(iter(uploaded))


Saving  Chess Game Instructions.pdf to  Chess Game Instructions (1).pdf


In [26]:
import os
import uuid

import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pydantic import BaseModel, Field

# LLM
# Fail early with an actionable message instead of a bare KeyError when the
# API key is missing from this runtime's environment.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment before "
        "running this cell."
    )
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

# 1) Load PDF
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# 2) Split into chunks
# Up to 1500 characters per chunk with 200 characters of overlap, splitting
# on paragraph breaks first, then line breaks, then spaces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "],
)
chunks = text_splitter.split_documents(pages)

# 3) Embedding function
def get_embedding_function(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used to vectorize text.

    Args:
        model_name: Identifier of the embedding model to load. Defaults to
            the MiniLM model this notebook was written against, so existing
            zero-argument calls behave exactly as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Embedding model instance handed to create_vectorstore below.
embedding_function = get_embedding_function()

# 4) Create vectorstore
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build and persist a Chroma vector store from de-duplicated chunks.

    Every chunk gets a deterministic ID (UUIDv5 of its text), so chunks with
    identical text collapse to a single entry; the first occurrence is kept.

    Args:
        chunks: Documents exposing a ``page_content`` string.
        embedding_function: Embedding model passed to Chroma as ``embedding``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store created from the unique chunks.
    """
    # A dict keeps insertion order, so each ID stays paired with its own
    # chunk. (Taking the IDs from a set would hand them to Chroma in an
    # arbitrary order that does not line up with the documents.)
    unique_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        unique_by_id.setdefault(chunk_id, chunk)

    vectorstore = Chroma.from_documents(
        documents=list(unique_by_id.values()),
        ids=list(unique_by_id.keys()),
        embedding=embedding_function,
        persist_directory=vectorstore_path,
    )
    vectorstore.persist()
    return vectorstore

vectorstore = create_vectorstore(
    chunks=chunks,
    embedding_function=embedding_function,
    vectorstore_path="vectorstore_test",
)

# 5) Retriever
# Expose the store as a retriever that uses similarity search.
retriever = vectorstore.as_retriever(search_type="similarity")

# 6) Prompt
# Prompt text with two placeholders: {context} is filled with the formatted
# retrieved documents and {question} with the user's question (both are
# supplied by the RAG chain built further down).
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""
# Wrap the raw text in a chat prompt template so it can be piped into the LLM.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Helper to format retrieved docs
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The documents' texts joined by a blank line; an empty string when
        ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# 7) RAG chain

# The incoming question feeds two inputs: "context" (retrieved documents
# rendered as text by format_docs) and "question" (the question passed
# through unchanged). The filled prompt is then sent to the chat model.
rag_chain = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
} | prompt_template | llm

# Interactive question/answer loop; type 'exit' to leave it.
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " still quit.
        user_question = input("Ask your question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly.
        break

    if user_question.lower() == "exit":
        break
    if not user_question:
        # Ignore blank input rather than sending an empty query to the LLM.
        continue

    answer = rag_chain.invoke(user_question)
    print("\nAnswer:\n", answer.content.strip(), "\n")



Ask your question (or type 'exit' to quit): pawns

Answer:
 Pawns in chess can move straight ahead one or two squares on their first move. After the first move, they can only move one square forward at a time. Pawns capture pieces diagonally one square ahead. They cannot retreat or jump over other pieces. Additionally, when a pawn reaches the last rank on the opponent’s side of the board, it can be promoted, usually becoming a Queen. It is also possible to have multiple Queens on the board as a result of pawn promotion. There is a special rule called "en passant" that allows a pawn to capture an adjacent opponent's pawn that has moved two squares forward in one move as if it had only moved one square. This capture can only be performed by pawns on adjacent files and is limited to just one move. 

Ask your question (or type 'exit' to quit): exit
