In [1]:
# Load variables from a .env file into the process environment.
# NOTE(review): later cells (LangSmith client, project API keys) presumably
# depend on these being set first — confirm before reordering cells.
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from pathlib import Path

import pandas as pd

# QA
# Questions about the paper; inputs[i] is answered by outputs[i].
inputs = [
    "What is the Transformer model introduced in the paper 'Attention Is All You Need'?",
    "Write the formula for Scaled Dot-Product Attention and briefly explain its components.",
    "List three advantages of self-attention over recurrent or convolutional layers mentioned in the paper.",
    "What are the equations for positional encoding and why are they important in the Transformer?",
    "Which optimizer and learning rate schedule were used in the Transformer model, and what value was chosen for warmup_steps?"
]

# Reference answers, in the same order as `inputs`.
outputs = [
    "The Transformer is a sequence-to-sequence model that relies entirely on attention mechanisms, removing recurrence and convolutions to improve parallelization and performance.",
    "The formula is Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V, where Q are queries, K are keys, V are values, and d_k is the key dimension used for scaling.",
    "Self-attention allows higher parallelization, has a shorter path between long-range dependencies, and provides lower computational complexity per layer compared to recurrent or convolutional models.",
    "Positional encodings use sine and cosine functions: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)); they inject token position information since the Transformer lacks recurrence or convolution.",
    "The authors used the Adam optimizer with β1=0.9, β2=0.98, ε=1e-9, and a learning rate schedule defined as d_model^-0.5 * min(step_num^-0.5, step_num * warmup_steps^-1.5), with warmup_steps set to 4000."
]

# zip() stops at the shorter list, so a missing question or answer would be
# dropped without any warning. Fail loudly instead.
if len(inputs) != len(outputs):
    raise ValueError(
        f"inputs ({len(inputs)}) and outputs ({len(outputs)}) must have the same length"
    )

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
# NOTE(review): machine-specific absolute path; point this at your own checkout.
csv_path = "/home/shantanusingh/Downloads/Multi_doc_chat_proj/data/llmops_dataset.csv"
# to_csv() does not create missing directories, so make sure the target exists.
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

In [7]:
from langsmith import Client

client = Client()
dataset_name = "llmops_attention_paper_dataset"

# Store
# Re-running this cell must not fail or duplicate data: creating a dataset
# whose name is already taken is rejected by LangSmith, and a second
# create_examples() call would upload every row again. Only upload when the
# dataset does not exist yet; otherwise reuse the existing one as-is.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Input and expected output pairs for llmops_attention_paper_dataset",
    )
    created_examples = client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

# Last expression of the cell: the upload summary (None when nothing was uploaded).
created_examples

{'example_ids': ['435483ca-eb97-461f-be7c-d5a95af5151b',
  '6eb5550e-3cf2-4901-9188-1c1e10ad34ea',
  'de414544-554c-4b13-80b8-3f919ba40391',
  '3f2100f1-8284-4127-84e2-28fdbcc3b7e3',
  '1059bd82-4b2b-474e-95c6-9fe6da132bbd'],
 'count': 5}

In [8]:
import sys
sys.path.append("/home/shantanusingh/Downloads/Multi_doc_chat_proj")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Expose a file on disk through ``name`` and ``getbuffer()``.

    Instances are handed to ChatIngestor in place of uploaded-file objects.
    """

    def __init__(self, file_path: str):
        location = Path(file_path)
        self.path = location
        self.name = location.name

    def getbuffer(self) -> bytes:
        """Read and return the whole file as bytes (re-read on every call)."""
        with self.path.open("rb") as handle:
            return handle.read()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/home/shantanusingh/Downloads/Multi_doc_chat_proj/data/attention.pdf",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer a question about the Attention paper using RAG.

    Each call creates a new ChatIngestor, builds a retriever from
    ``data_path``, loads it into a ConversationalRAG instance and invokes it
    with an empty chat history.

    Failures are not raised: any exception is caught and returned as an
    ``"Error: ..."`` answer so a batch run keeps going.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}.
            A missing, empty or whitespace-only question is rejected up front.
        data_path: Path to the Attention paper pdf file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}.
        On bad input or failure the "answer" value is a diagnostic message.
    """
    # Single source of truth for the index root, so the directory handed to the
    # ingestor and the path loaded afterwards cannot drift apart.
    faiss_base = "faiss_index"

    try:
        # Extract question from inputs; blank or whitespace-only text counts as
        # missing so we never ingest the PDF and call the LLM for nothing.
        question = inputs.get("question", "")
        if isinstance(question, str):
            question = question.strip()
        if not question:
            return {"answer": "No question provided"}

        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}

        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)

        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # Build retriever
        # NOTE(review): `built_retriver` is spelled as the project API is called
        # here; do not "correct" it without checking ChatIngestor.
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Get answer
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # Deliberate best-effort: report the failure as the answer text.
        return {"answer": f"Error: {str(e)}"}

In [10]:
# Smoke test: send one sample question through the RAG function and show the
# question next to the generated answer.
test_input = {
    "question": (
        "List three advantages of self-attention over recurrent or "
        "convolutional layers mentioned in the paper."
    )
}
result = answer_ai_report_question(test_input)

print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-11-11T09:43:41.701569Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:43:41.702453Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:43:41.702835Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_a5...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T09:43:41.703601Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T09:43:41.704988Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_151341_a7c56d31", "temp_dir": "data/session_20251111_151341_a7c56d31", "faiss_dir": "faiss_index/session_20251111_151341_a7c56d31", "sessionized": true, "timestamp": "2025-11-11T09:43:41.705499Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "attention.pdf", "saved_as": "data/session_2025

Question: List three advantages of self-attention over recurrent or convolutional layers mentioned in the paper.

Answer: The three advantages of self-attention layers are: lower computational complexity per layer, more computation that can be parallelized, and fewer sequential operations required.


In [11]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

In [12]:
# Example: run every dataset question through the RAG function and print
# each question/answer pair followed by a divider line.
print("Testing all questions from the dataset:\n")

divider = "-" * 80 + "\n"
for number, question in enumerate(inputs, start=1):
    test_input = {"question": question}
    result = answer_ai_report_question(test_input)
    print(f"Q{number}: {question}")
    print(f"A{number}: {result['answer']}\n")
    print(divider)

{"timestamp": "2025-11-11T09:45:27.605368Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:45:27.605893Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:45:27.606197Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_a5...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T09:45:27.606465Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T09:45:27.607637Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_151527_689f1dcb", "temp_dir": "data/session_20251111_151527_689f1dcb", "faiss_dir": "faiss_index/session_20251111_151527_689f1dcb", "sessionized": true, "timestamp": "2025-11-11T09:45:27.609187Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "attention.pdf", "saved_as": "data/session_2025

Testing all questions from the dataset:



{"count": 15, "timestamp": "2025-11-11T09:45:28.010951Z", "level": "info", "event": "Documents loaded"}
{"chunks": 52, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-11T09:45:28.012327Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-11T09:45:28.012911Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index/session_20251111_151527_689f1dcb", "timestamp": "2025-11-11T09:45:30.664301Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T09:45:30.665035Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T09:45:30.666834Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:45:30.667240Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:45:30.667625Z", "level": "info", "event": "Loaded GOOGLE_API_KEY fr

Q1: What is the Transformer model introduced in the paper 'Attention Is All You Need'?
A1: The Transformer model architecture uses stacked self-attention and point-wise, fully connected layers for both the encoder and decoder. The encoder is composed of a stack of N = 6 identical layers, each layer has two sub-layers: a multi-head self-attention mechanism, and a simple, position-wise fully connected feed-forward network. The output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself.

--------------------------------------------------------------------------------



{"count": 15, "timestamp": "2025-11-11T09:45:43.534725Z", "level": "info", "event": "Documents loaded"}
{"chunks": 52, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-11T09:45:43.535984Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-11T09:45:43.536468Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index/session_20251111_151543_e1abaaf7", "timestamp": "2025-11-11T09:45:46.643261Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T09:45:46.644041Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T09:45:46.647922Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:45:46.648715Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:45:46.649162Z", "level": "info", "event": "Loaded GOOGLE_API_KEY fr

Q2: Write the formula for Scaled Dot-Product Attention and briefly explain its components.
A2: The formula for Scaled Dot-Product Attention is: Attention(Q, K, V) = softmax(QKT/√dk)V. In this formula, Q represents the matrix of queries, K represents the matrix of keys, and V represents the matrix of values. The dimension of the keys is represented by dk.

--------------------------------------------------------------------------------



{"count": 15, "timestamp": "2025-11-11T09:45:51.211142Z", "level": "info", "event": "Documents loaded"}
{"chunks": 52, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-11T09:45:51.212366Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-11T09:45:51.212870Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index/session_20251111_151550_8d25138d", "timestamp": "2025-11-11T09:45:54.125158Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T09:45:54.125610Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T09:45:54.127015Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:45:54.127457Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:45:54.127737Z", "level": "info", "event": "Loaded GOOGLE_API_KEY fr

Q3: List three advantages of self-attention over recurrent or convolutional layers mentioned in the paper.
A3: The three advantages of self-attention layers are: lower computational complexity per layer, more parallelizable computation (lower number of sequential operations), and the ability to extrapolate to sequence lengths longer than those encountered during training.

--------------------------------------------------------------------------------



{"count": 15, "timestamp": "2025-11-11T09:46:01.365421Z", "level": "info", "event": "Documents loaded"}
{"chunks": 52, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-11T09:46:01.366899Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-11T09:46:01.367286Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index/session_20251111_151600_0f47f23c", "timestamp": "2025-11-11T09:46:04.019058Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T09:46:04.019807Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T09:46:04.022530Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:46:04.023133Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:46:04.023480Z", "level": "info", "event": "Loaded GOOGLE_API_KEY fr

Q4: What are the equations for positional encoding and why are they important in the Transformer?
A4: The equations for positional encoding are:
P E(pos,2i) = sin(pos/100002i/dmodel )
P E(pos,2i+1) = cos(pos/100002i/dmodel )

Positional encodings are added to the input embeddings to provide information about the position of tokens in the sequence. The model can easily learn to attend by relative positions because, for any fixed offset k, P Epos+k can be represented as a linear function of P Epos.

--------------------------------------------------------------------------------



{"count": 15, "timestamp": "2025-11-11T09:46:12.697869Z", "level": "info", "event": "Documents loaded"}
{"chunks": 52, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-11T09:46:12.699648Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-11T09:46:12.700294Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index/session_20251111_151612_36f28eb4", "timestamp": "2025-11-11T09:46:15.694779Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T09:46:15.695778Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T09:46:15.699744Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:46:15.700626Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:46:15.701183Z", "level": "info", "event": "Loaded GOOGLE_API_KEY fr

Q5: Which optimizer and learning rate schedule were used in the Transformer model, and what value was chosen for warmup_steps?
A5: The Adam optimizer was used with β1 = 0.9, β2 = 0.98 and ϵ = 10−9. The learning rate was varied over the course of training according to a specific formula. The value chosen for warmup_steps was 4000.

--------------------------------------------------------------------------------



In [13]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Dataset to evaluate against, and the evaluator list applied to every run.
dataset_name = "llmops_attention_paper_dataset"
qa_evaluator = [LangChainStringEvaluator("cot_qa")]

# Experiment metadata recorded alongside the results.
experiment_metadata = {
    "variant": "RAG with FAISS and Attention Paper",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "k": 5,
}

# Run evaluation using our RAG function
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-llmops_attention_paper_dataset-qa-rag",
    metadata=experiment_metadata,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'test-llmops_attention_paper_dataset-qa-rag-0c7f5bbe' at:
https://smith.langchain.com/o/c1f713d4-59f0-447c-a0af-c045e1a36407/datasets/6fb8b6b2-44b0-40ae-80a3-ae608cec8207/compare?selectedSessions=6f7420bf-6b05-4def-b4ff-40703f52cbd4




0it [00:00, ?it/s]{"timestamp": "2025-11-11T09:47:19.981406Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T09:47:19.981768Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T09:47:19.982006Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_a5...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T09:47:19.982251Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T09:47:19.983503Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_151719_6f43682d", "temp_dir": "data/session_20251111_151719_6f43682d", "faiss_dir": "faiss_index/session_20251111_151719_6f43682d", "sessionized": true, "timestamp": "2025-11-11T09:47:19.983915Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "attention.pdf", "saved_as": 