In [1]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is absent. The previous
# self-assignment `os.environ[k] = os.getenv(k)` raised an opaque TypeError in
# that case (os.environ rejects None) and changed nothing when the key was set.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

from langgraph.graph import StateGraph, START, END

In [3]:
# Read the paper from disk, then keep only the leading pages for this demo.
num_pages_to_process = 2  # number of pages kept from the start of the PDF

loader = PyPDFLoader("data/attention-is-all-you-need-Paper.pdf")
all_docs = loader.load()
docs = all_docs[0:num_pages_to_process]

# Sanity preview: how many documents were kept, and the start of the first one.
print(f"Loaded: {len(docs)}")
print(docs[0].page_content[:100])

Loaded: 2
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


In [4]:
# Break the loaded pages into overlapping, manageable chunks
# (chunk_size=1000, chunk_overlap=200).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

# Report the number of chunks and peek at the beginning of the first one.
print(f"Split into: {len(chunks)}")
print(chunks[0].page_content[:100])

Split into: 10
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


In [5]:
# Embed every chunk with OpenAI embeddings and index the result in FAISS.
embedding = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)

In [6]:
# Baseline retrieval check: fetch the top 8 hits from the vector store
# (no reranking) and print the first 200 characters of each, in rank order.
query = "What is transformer?"

retriever = vectorstore.as_retriever(search_kwargs=dict(k=8))
retrieved_docs = retriever.invoke(query)
print(f"Retrieved {len(retrieved_docs)} documents")

for i, doc in enumerate(retrieved_docs, 1):
    print(f"Rank Doc {i}:\n{doc.page_content[:200]}")

Retrieved 8 documents
Rank Doc 1:
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz
Rank Doc 2:
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [14, 15] and [8].
3 Model Architecture

Rank Doc 3:
Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the 
Rank Doc 4:
transduction problems such as language modeling and machine translation [ 29, 2, 5]. Numerous
efforts have since continued to push the boundaries of recurrent language models and encoder-decoder
archi
Rank Doc 5:
described in section 3.2.
Self-attention, sometimes called intra-attention is an attention mechanism relating diff

**Requires Python 3.10 to work properly.**

In [7]:
# Cross-encoder used to rescore the candidates returned by the base retriever.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")

# Reranker built on that model, configured with top_n=3.
compressor = CrossEncoderReranker(top_n=3, model=model)

# Retriever that feeds the base retriever's hits through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Run the same query through the reranking retriever and show the results.
reranked = compression_retriever.invoke(query)
print(f"Retrieved Reranked {len(reranked)} documents")

for i, doc in enumerate(reranked, 1):
    print(f"Rank Doc {i}:\n{doc.page_content[:200]}")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Retrieved Reranked 3 documents
Rank Doc 1:
described in section 3.2.
Self-attention, sometimes called intra-attention is an attention mechanism relating different positions
of a single sequence in order to compute a representation of the seque
Rank Doc 2:
Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the 
Rank Doc 3:
The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU
[20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building
bl


In [8]:
# Build the prompt from the reranked documents, then ask the chat model.
context = "\n\n".join(d.page_content for d in reranked)
prompt = (f"Use the context below to answer the question.\n\nContext:\n{context}"
          f"\n\nQuestion:\n{query}")

# Chat model shared with the LangGraph pipeline cell further down.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

response = llm.invoke(prompt)
print("\nGenerated Answer:\n", response.content)


Generated Answer:
 The Transformer is a model architecture that relies entirely on self-attention mechanisms to compute representations of its input and output, without using recurrent neural networks (RNNs) or convolutional layers. This design allows the Transformer to model dependencies between input and output sequences without regard to their distance, enabling significant parallelization during training. As a result, the Transformer can achieve state-of-the-art performance in tasks such as translation, often requiring less training time compared to traditional models that use recurrence. The architecture is particularly effective in handling long-range dependencies in sequences, which is a challenge for models that rely on sequential computation.


In [9]:
# Wrap with LangGraph and test end-to-end
def retrieve_stage(state):
    """Fetch reranked context for the question held in the graph state.

    Args:
        state: Mapping with a "question" key holding the user query.

    Returns:
        A new dict containing every entry of ``state`` plus a "context" key:
        the page contents of the retrieved documents joined by blank lines.
    """
    # Use the Runnable `invoke` API, as the earlier cells do; the legacy
    # `get_relevant_documents` method is deprecated and emits a warning.
    reranked_docs = compression_retriever.invoke(state["question"])
    context = "\n\n".join(d.page_content for d in reranked_docs)
    # Return a fresh dict instead of mutating the caller's state in place.
    return {**state, "context": context}

def generate_stage(state):
    """Answer the question in ``state`` using the context stored alongside it.

    Reads ``state["context"]`` and ``state["question"]``, sends the assembled
    prompt to the notebook-level ``llm``, stores the reply text under
    ``state["answer"]`` and returns the same (mutated) ``state`` object.
    """
    context = state["context"]
    question = state["question"]
    prompt = (f"Use the context below to answer the question.\n\nContext:\n{context}"
              f"\n\nQuestion:\n{question}")
    reply = llm.invoke(prompt)
    state["answer"] = reply.content
    return state

# Wire the two stages into a linear graph: START -> retrieve -> generate -> END.
graph = StateGraph(dict)

graph.add_node("retrieve", retrieve_stage)
graph.add_node("generate", generate_stage)

graph.add_edge(START, "retrieve")
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)

rag_pipeline = graph.compile()

# Run the compiled pipeline once on the sample query and print its answer.
state = dict(question=query)
result = rag_pipeline.invoke(state)
print("\nFinal Answer from LangGraph Pipeline:\n", result["answer"])


  docs = compression_retriever.get_relevant_documents(state["question"])



Final Answer from LangGraph Pipeline:
 The Transformer is a model architecture that relies entirely on self-attention mechanisms to compute representations of its input and output, without using recurrent neural networks (RNNs) or convolutional layers. This design allows the Transformer to model dependencies between input and output sequences without regard to their distance, enabling significant parallelization during training. As a result, the Transformer can achieve state-of-the-art performance in tasks such as translation, often requiring less training time compared to traditional models that use recurrence. The architecture is particularly effective in handling long-range dependencies in sequences, as it reduces the number of operations needed to relate signals from different positions to a constant number, which contrasts with other models where the number of operations grows with the distance between positions.
