In [1]:
import os
from dotenv import load_dotenv
# Populate os.environ from a local .env file, if one is present.
load_dotenv()

# load_dotenv() already exports the variables it reads, so copying the key back
# into os.environ is redundant. The previous self-assignment also raised an
# opaque TypeError when the key was missing (os.environ values must be str).
# Validate explicitly so a missing key fails fast with an actionable message.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from flashrank import Ranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank

from langgraph.graph import StateGraph, START, END

In [3]:
# Source document for the demo; the loader yields one Document per PDF page.
PDF_PATH = "data/attention-is-all-you-need-Paper.pdf"

loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Sanity check: page count plus the first 300 characters of page one.
page_count = len(docs)
first_page_text = docs[0].page_content
print(f"Loaded {page_count} pages.")
print("First page preview:\n", first_page_text[:300])

Loaded 11 pages.
First page preview:
 Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


In [4]:
# Split pages into overlapping character windows so each chunk is small enough
# to embed while neighbouring chunks still share some text.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(docs)

# Report how many chunks were produced and preview the first one.
chunk_total = len(chunks)
print(f"{chunk_total} chunks created")
print(chunks[0].page_content[:300])

43 chunks created
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


In [5]:
# Embed every chunk with OpenAI embeddings and index the vectors in FAISS.
embedding = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embedding)

# Similarity retriever that returns the 8 closest chunks for a query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})

query = "What is transformer?"
# Retrievers implement the Runnable interface: `.invoke()` replaces the
# deprecated `.get_relevant_documents()`, which emitted a deprecation warning.
docs_retrieved = retriever.invoke(query)

# Show a short preview of each retrieved chunk.
print(f"Retrieved {len(docs_retrieved)} chunks:")
for i, d in enumerate(docs_retrieved, 1):
    print(f"\n--- Document {i} Content ---\n{d.page_content[:250]}")

  docs_retrieved = retriever.get_relevant_documents(query)


Retrieved 8 chunks:

--- Document 1 Content ---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@goo

--- Document 2 Content ---
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [14, 15] and [8].
3 Model Architecture
Most competitive neural sequence transduction mode

--- Document 3 Content ---
Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 16]. In all but a fe

--- Document 4 Content ---
6 Results
6.1 Machine Translation
On the WMT 2014 English-to-German translation task, the big transformer model (Tr

In [6]:
# Cross-encoder reranker that keeps only the 4 best-scoring candidates.
reranker = FlashrankRerank(
    model="ms-marco-MiniLM-L-12-v2",
    top_n=4,
)

# Wrap the vector retriever so its results are passed through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=retriever,
)

docs_reranked = compression_retriever.invoke(query)

# Print each surviving chunk with the relevance score found in its metadata
# (None is shown when no "relevance_score" key is present).
print(f"\nReranked to top {len(docs_reranked)} docs:")
for rank, doc in enumerate(docs_reranked, start=1):
    meta_score = doc.metadata.get("relevance_score")
    preview = doc.page_content[:250]
    print(f"\n--- Reranked Document {rank} (score: {meta_score}) ---\n{preview}")


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"



Reranked to top 4 docs:

--- Reranked Document 1 (score: 0.7896642684936523) ---
Figure 1: The Transformer - model architecture.
wise fully connected feed-forward network. We employ a residual connection [10] around each of
the two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is
LayerNor

--- Reranked Document 2 (score: 0.7049769163131714) ---
Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 16]. In all but a fe

--- Reranked Document 3 (score: 0.5463269948959351) ---
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [14, 15] and [8].
3 Model Architecture
Most competitive neural sequence transduction mode

--- Reranked Document 4 (score: 0.4705306887626648

In [7]:
# Chat model used for answer generation throughout the notebook.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Concatenate the reranked chunks, separated by blank lines, into one context.
context = "\n\n".join(d.page_content for d in docs_reranked)

# Prompt layout: instruction, then the context, then the question, each
# section separated by a blank line.
prompt = "\n\n".join(
    [
        "Using the following context, answer the question:",
        f"Context:\n{context}",
        f"Question:\n{query}",
    ]
)

response = llm.invoke(prompt)
print("\nGenerated Answer:\n", response.content)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer:
 The Transformer is a model architecture designed for sequence transduction tasks, which relies entirely on an attention mechanism to model dependencies between input and output sequences, rather than using recurrent or convolutional networks. It consists of an encoder-decoder structure, where the encoder maps an input sequence to continuous representations, and the decoder generates an output sequence one element at a time in an auto-regressive manner. The Transformer architecture allows for significant parallelization, enabling faster training compared to traditional models, and has achieved state-of-the-art results in translation tasks. It employs multi-headed self-attention and point-wise fully connected layers, with residual connections and layer normalization to enhance performance. The model is particularly noted for its efficiency and effectiveness in handling various sequence modeling tasks.


In [8]:
def retrieve_stage(state: dict) -> dict:
    """Fetch reranked context for the question held in ``state``.

    Args:
        state: Pipeline state; must contain a ``"question"`` key.

    Returns:
        A new dict with every entry of ``state`` plus a ``"context"`` key
        holding the retrieved chunk texts joined by blank lines. The input
        dict is left unmodified.

    Raises:
        KeyError: If ``state`` has no ``"question"`` entry.
    """
    # `.invoke()` is the supported Runnable entry point; the previously used
    # `.get_relevant_documents()` is deprecated.
    docs_ = compression_retriever.invoke(state["question"])
    context = "\n\n".join(d.page_content for d in docs_)
    # Return a fresh dict instead of mutating the caller's state in place.
    return {**state, "context": context}

def generate_stage(state: dict) -> dict:
    """Ask the LLM to answer the question using the retrieved context.

    Args:
        state: Pipeline state; must contain ``"context"`` and ``"question"``.

    Returns:
        A new dict with every entry of ``state`` plus an ``"answer"`` key
        holding the ``content`` of the model's reply. The input dict is left
        unmodified.

    Raises:
        KeyError: If ``"context"`` or ``"question"`` is missing from ``state``.
    """
    prompt = (
        f"Using the context below, answer the question:\n\nContext:\n{state['context']}"
        f"\n\nQuestion:\n{state['question']}"
    )
    answer = llm.invoke(prompt).content
    # Return a fresh dict instead of mutating the caller's state in place.
    return {**state, "answer": answer}

# Assemble a two-node linear graph: START -> retrieve -> generate -> END.
# The state schema is a plain dict, so each node returns the full state.
graph = StateGraph(dict)

for node_name, node_fn in (
    ("retrieve", retrieve_stage),
    ("generate", generate_stage),
):
    graph.add_node(node_name, node_fn)

for edge_from, edge_to in (
    (START, "retrieve"),
    ("retrieve", "generate"),
    ("generate", END),
):
    graph.add_edge(edge_from, edge_to)

rag_pipeline = graph.compile()

# Run the compiled pipeline end to end on the demo query.
state = dict(question=query)
result = rag_pipeline.invoke(state)
final_answer = result["answer"]
print("\nFinal Answer from LangGraph Pipeline:\n", final_answer)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Final Answer from LangGraph Pipeline:
 The Transformer is a model architecture designed for sequence transduction tasks, which relies entirely on an attention mechanism to model dependencies between input and output sequences, rather than using recurrent or convolutional networks. It consists of an encoder-decoder structure, where the encoder maps an input sequence to continuous representations, and the decoder generates an output sequence one element at a time in an auto-regressive manner. The Transformer architecture allows for significant parallelization, enabling faster training compared to traditional models, and has achieved state-of-the-art results in translation tasks. It employs stacked self-attention and fully connected layers, with mechanisms like residual connections and layer normalization to enhance performance. The Transformer is also adaptable for various tasks beyond text, including images, audio, and video.
