In [None]:
import warnings
warnings.filterwarnings("ignore")

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.retrieval import create_retrieval_chain



# --- Ingestion: PDF -> chunks -> FAISS index -> retriever -------------------

# Read the source PDF (path is relative to the working directory).
loader = PyPDFLoader("attention.pdf")
docs = loader.load()

# Break the loaded documents into overlapping chunks. The separators are
# listed from coarsest (blank line) to finest (empty string).
chunk_separators = ["\n\n", "\n", ".", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_overlap=100,
    chunk_size=800,
)
documents = text_splitter.split_documents(docs)

# Embed every chunk and index it in FAISS. No model name is passed to
# OllamaEmbeddings, so whatever default the library ships with is used.
embedding_model = OllamaEmbeddings()
db = FAISS.from_documents(documents=documents, embedding=embedding_model)

# Expose the index as a retriever that returns 4 chunks per query.
retriever = db.as_retriever(search_kwargs=dict(k=4))

# --- Generation: chat model, prompt, and the retrieval chain -----------------

# Local chat model served by Ollama; temperature 0 for repeatable answers.
llm = ChatOllama(model="llama2", temperature=0, num_ctx=4096)

# Prompt text. `{context}` and `{input}` are the two template variables the
# chains below fill in (retrieved chunks and the user's query, respectively).
answer_template = """
You are a technical assistant.

Using ONLY the provided context:
- Directly answer the question
- Include formulas if present
- Do NOT describe the context itself

<context>
{context}
</context>

Question: {input}

Answer:
"""
prompt = ChatPromptTemplate.from_template(answer_template)

# First build the chain that feeds documents plus the prompt to the LLM,
# then wrap it with the retriever so a single invoke() does retrieve + answer.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

# Query: run retrieval + generation for a single input string.
response = retrieval_chain.invoke({
    "input": "Scaled Dot-Product Attention"
})

# The chain's result is read through two keys: "answer" (generated text) and
# "context" (the retrieved documents the answer was based on).
print("\nAnswer:\n", response["answer"])

print("\nSources:")
for i, doc in enumerate(response["context"], 1):
    # PyPDFLoader stores a zero-based page index under "page"; convert it to
    # the 1-based number a reader sees in a PDF viewer. If the key is missing
    # (or not an integer), say so instead of printing "Page None".
    page_index = doc.metadata.get("page")
    page_number = page_index + 1 if isinstance(page_index, int) else "unknown"
    print(f"{i}. Page {page_number}")




Answer:
 The context provided is a passage of text from a research paper, specifically the section on the Transformer model. The passage describes the architecture and training of the Transformer model, as well as its performance on a constituency parsing task.

To answer your question, the scaled dot-product attention mechanism is used in the Transformer model to compute the weighted sum of the input tokens' representations. The scaled dot-product attention allows the model to selectively focus on different parts of the input sequence as it processes it, allowing it to capture longer-range dependencies and better handle input sequences of varying lengths.

The formula for the scaled dot-product attention is:

attention_weight = softmax(Q \* K^T / sqrt(d))

where Q, K, and d are the query, key, and dimensionality of the input representation, respectively. The softmax function is used to normalize the attention weights, ensuring they sum to 1.

Sources:
1. Page 6
2. Page 1
3. Page 9
4.