## Retriever And Chain With Langchain

In [21]:
# ================================
# Cell 1: Imports & Environment
# ================================
import os
from dotenv import load_dotenv

# Load env file (.env should have HF_TOKEN=your_token)
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
print("Hugging Face token loaded:", bool(hf_token))

Hugging Face token loaded: True


In [22]:

# ================================
# Cell 2: Load Documents
# ================================
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("attention.pdf")  # replace with your PDF
docs = loader.load()
print(f"Number of documents/pages: {len(docs)}")
print("First document preview:\n", docs[0].page_content[:300])

Number of documents/pages: 15
First document preview:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Par


In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # smaller to avoid token limit errors
    chunk_overlap=50
)
chunks = text_splitter.split_documents(docs)
print("Number of chunks:", len(chunks))
print("First chunk preview:\n", chunks[0].page_content[:300])

Number of chunks: 93
First chunk preview:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Par


In [23]:

# ================================
# Cell 3: Split into Chunks
# ================================
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # keep smaller to avoid sequence length issues
    chunk_overlap=50
)
chunks = text_splitter.split_documents(docs)
print(f"Number of chunks: {len(chunks)}")
print("First chunk preview:\n", chunks[0].page_content[:200])

Number of chunks: 93
First chunk preview:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need



In [15]:
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base"
)

llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"token": hf_token})
print("Hugging Face LLM initialized")

Device set to use mps:0


Hugging Face LLM initialized


In [25]:
# ================================
# Cell 4: Embeddings & Vector Store
# ================================
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(chunks, embeddings)
retriever = db.as_retriever()
print("FAISS vector store created with Hugging Face embeddings")

FAISS vector store created with Hugging Face embeddings


In [26]:
# ================================
# Cell 5: LLM Setup
# ================================
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

# Use a small, free, local model
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512
)

llm = HuggingFacePipeline(pipeline=pipe)
print("Hugging Face LLM initialized")


Device set to use mps:0


Hugging Face LLM initialized


In [27]:
# ================================
# Cell 6: Prompt Template
# ================================
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
If the context does not contain the answer, say "I don’t know based on the provided context."
<context>
{context}
</context>
Question: {input}
""")
print("Prompt template ready")

Prompt template ready


In [28]:
# ================================
# Cell 7: Document Chain + Retrieval Chain
# ================================
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)
print("Retrieval chain ready")

Retrieval chain ready


In [20]:
query = "What is transformer architecture?"
response = retrieval_chain.invoke({"input": query})

print("Query result:")
print(response['answer'])

Query result:
a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. So the final answer is a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output.


In [29]:
# ================================
# Cell 8: Test Queries
# ================================
query = "What is transformer architecture?"
response = retrieval_chain.invoke({"input": query})
print("Q:", query)
print("A:", response['answer'])


Q: What is transformer architecture?
A: The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.


In [30]:
query = "Who are the authors of Attention is All You Need?"
response = retrieval_chain.invoke({"input": query})
print("Q:", query)
print("A:", response['answer'])

Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors


Q: Who are the authors of Attention is All You Need?
A: [19] Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In International Conference on Learning Representations, 2017. [20] Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICLR, 2015. [21] Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. arXiv preprint arXiv:1703.10722, 2017. [22] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen [16] ukasz Kaiser and Samy Bengio.


In [31]:
query = "What is duck?"  # random/unrelated question
response = retrieval_chain.invoke({"input": query})
print("Q:", query)
print("A:", response['answer'])

Q: What is duck?
A: I don’t know based on the given context.
