In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment so the OpenAI clients created later can find the key.
load_dotenv()

# Fail fast with an actionable message. The previous self-assignment
# `os.environ[KEY] = os.getenv(KEY)` was a no-op when the key was set and
# raised an opaque "TypeError: str expected, not NoneType" when it was not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.listwise_rerank import LLMListwiseRerank
from langgraph.graph import StateGraph, START, END

In [4]:
# Load the source PDF from the repo-relative data/ directory.
# `docs` is the list returned by the loader; later cells split and index it.
loader = PyPDFLoader("data/attention-is-all-you-need-Paper.pdf")
docs = loader.load()

# Sanity check: report how many documents were loaded (labelled as pages)
# and preview the first 300 characters of the first one.
print(f"Loaded {len(docs)} pages")
print(docs[0].page_content[:300])


Loaded 11 pages
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


In [5]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size=800 / chunk_overlap=100 are tuning knobs for this notebook;
# presumably measured in characters (the splitter's default) — confirm.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# Sanity check: total chunk count and a 300-character preview of the first chunk.
print(f"Total chunks: {len(chunks)}")
print(chunks[0].page_content[:300])


Total chunks: 51
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


In [6]:
# Embed every chunk and build an in-memory FAISS index over the embeddings.
embeddings = OpenAIEmbeddings()  # Reads OPENAI_API_KEY from the environment
vectorstore = FAISS.from_documents(chunks, embeddings)

# Baseline retriever: fetch the 8 nearest chunks; a later cell reranks these.
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})

query = "What is transformer?"
# Use `invoke`, the supported retriever entry point. The previous
# `get_relevant_documents` call is deprecated in LangChain and is
# inconsistent with the `.invoke` call used for the compression retriever.
docs_retrieved = retriever.invoke(query)

# Preview the first 250 characters of each retrieved chunk.
print(f"Retrieved {len(docs_retrieved)} docs")
for i, d in enumerate(docs_retrieved, 1):
    print(f"\n--- Doc {i} ---\n{d.page_content[:250]}")


  docs_retrieved = retriever.get_relevant_documents(query)


Retrieved 8 docs

--- Doc 1 ---
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in qua

--- Doc 2 ---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@goo

--- Doc 3 ---
results to the base model.
7 Conclusion
In this work, we presented the Transformer, the ﬁrst sequence transduction model based entirely on
attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with
multi-headed

--- Doc 4 ---
textual entailment and learning task-independent sentence representations [4, 22, 23, 19].
End-to-end memory networks are based on a recurrent attention mechanism instead

In [9]:
# Chat model shared by the reranker and the answer-generation steps;
# temperature=0 is requested for this model.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# LLM-based listwise reranker configured with top_n=4.
reranker = LLMListwiseRerank.from_llm(llm=llm, top_n=4)

# Wrap the base vector-store retriever so its results are passed through
# the reranker before being returned.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=reranker
)

# Run the same `query` as the baseline retrieval so the two result lists
# can be compared.
docs_reranked = compression_retriever.invoke(query)

# Preview the first 250 characters of each reranked chunk.
print(f"\nReranked to top {len(docs_reranked)} docs:")
for i, d in enumerate(docs_reranked, 1):
    print(f"\n--- Reranked Doc {i} ---\n{d.page_content[:250]}")



Reranked to top 4 docs:

--- Reranked Doc 1 ---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@goo

--- Reranked Doc 2 ---
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in qua

--- Reranked Doc 3 ---
results to the base model.
7 Conclusion
In this work, we presented the Transformer, the ﬁrst sequence transduction model based entirely on
attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with
multi-headed

--- Reranked Doc 4 ---
textual entailment and learning task-independent sentence representations [4, 22, 23, 19].
End-to-end memory networks are base

In [10]:
# Concatenate the reranked chunks into one context string, separated by
# blank lines.
context = "\n\n".join([d.page_content for d in docs_reranked])

# Single-turn prompt: context first, then the question.
prompt = f"Using the context below, answer the question:\n\nContext:\n{context}\n\nQuestion:\n{query}"

# `answer` is the model's response object; its text is in `.content`.
answer = llm.invoke(prompt)

print("\nGenerated Answer:\n", answer.content)



Generated Answer:
 The Transformer is a novel network architecture proposed for sequence transduction tasks, which relies entirely on attention mechanisms rather than using recurrent or convolutional layers. It is designed to process input and output sequences by employing multi-headed self-attention to compute representations, allowing for significantly faster training compared to traditional models that utilize recurrent or convolutional structures. The Transformer has demonstrated superior performance in machine translation tasks, achieving state-of-the-art results on benchmarks such as the WMT 2014 English-to-German and English-to-French translation tasks.


In [11]:
def retrieve_stage(state):
    """Graph node: fetch reranked context for the question.

    Args:
        state: Dict-like pipeline state; must contain a "question" key.

    Returns:
        The same ``state`` object, with "context" set to the page contents
        of the retrieved documents joined by blank lines.
    """
    # `invoke` replaces the deprecated `get_relevant_documents` and matches
    # how the compression retriever is called elsewhere in this notebook.
    reranked_docs = compression_retriever.invoke(state["question"])
    state["context"] = "\n\n".join(d.page_content for d in reranked_docs)
    return state

def generate_stage(state):
    """Graph node: answer the question from the previously retrieved context.

    Args:
        state: Dict-like pipeline state; must contain "context" and
            "question" keys.

    Returns:
        The same ``state`` object, with "answer" set to the text content of
        the model's response.
    """
    rag_prompt = f"Using the context below, answer the question:\n\nContext:\n{state['context']}\n\nQuestion:\n{state['question']}"
    response = llm.invoke(rag_prompt)
    state["answer"] = response.content
    return state

# Build a two-node linear graph whose state is a plain dict:
#   START -> retrieve -> generate -> END
graph = StateGraph(dict)
graph.add_node("retrieve", retrieve_stage)
graph.add_node("generate", generate_stage)
graph.add_edge(START, "retrieve")
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)

# Compile the graph definition into a runnable pipeline.
rag_pipeline = graph.compile()

# Run the pipeline end to end; the stages add "context" and "answer" to
# the state, and the final state is returned as `result`.
state = {"question": query}
result = rag_pipeline.invoke(state)
print("\nFinal Answer from LangGraph Pipeline:\n", result["answer"])



Final Answer from LangGraph Pipeline:
 The Transformer is a novel network architecture proposed for sequence transduction tasks, which relies entirely on attention mechanisms rather than traditional recurrent or convolutional layers. It is designed to process input and output sequences by using multi-headed self-attention to compute representations, allowing for significantly faster training compared to previous models. The Transformer has demonstrated superior performance in machine translation tasks, achieving state-of-the-art results on benchmarks such as the WMT 2014 English-to-German and English-to-French translation tasks. Its architecture enables better parallelization and reduces training time, making it a significant advancement in the field of natural language processing.
