In [1]:
%load_ext autoreload
%autoreload 2
from src.services.file_service import load_pdf, split_document
from src.services.vectordb_service import ChromaDB
from src.utils.util import pretty_print_docs
from config import TEMPLATE, MODEL, DOC_PATH

In [2]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

import pprint

In [3]:
# Pretty-print the prompt template imported from config so its text is visible in the notebook.
pprint.pp(TEMPLATE)

('Use ONLY the following pieces of context to answer the question at the end.\n'
 "If you don't know the answer, just say that you don't know, NEVER make up an "
 'answer.\n'
 'Summarize in bullet point format. Keep the answer as concise as possible.\n'
 '{context}\n'
 'Question: {question}\n'
 'Helpful Answer:')


In [4]:
# Load the PDF located at DOC_PATH (imported from config); the result is kept in `pages`.
pages = load_pdf(doc_path=DOC_PATH)

Loading file AttentionIsAllYouNeed.pdf ...
File load correctly. Contains 15 pages


In [6]:
# Chunking options handed to split_document, gathered in one place so they are easy to tune.
split_options = dict(chunk_size=800, chunk_overlap=400, strategy='recursive')
text_splits = split_document(pages, **split_options)

# Wrap the chunks in the project's ChromaDB helper and grab the vector store used for retrieval.
chroma = ChromaDB(text_splits)
vectorstore = chroma.get_vectorstore()

Document spliting. Chunk size 800 - Chunk overlap 400 - Strategy recursive
Splits generated 92
Cleaning Chroma DB
Creating a local embedding vector DB on directory data/chroma
DB created successfuly. Collection count: 92 



In [13]:
# Alternative question kept for reference:
# question = "What are the major improvments over the previous LSTM arquitecture?"
question = "Why is multi head attention has n2 complexity?"

# Chat model and prompt are built once and shared by every chain created in the loop.
llm = ChatOpenAI(model_name=MODEL, temperature=0)
QA_CHAIN_PROMPT = PromptTemplate.from_template(TEMPLATE)

# Collects one single-key dict per search type: {search_type: chain output}.
search_results = []

for search_type in ('similarity',):  # 'mmr' is currently switched off
    retriever = vectorstore.as_retriever(search_type=search_type)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        # Requested so the 'source_documents' entry can be read from the output afterwards.
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )

    # Run the chain on the question and file its output under the search type that produced it.
    result = qa_chain.invoke({"query": question})
    search_results.append({search_type: result})

In [14]:
# Print each search type as an upper-case heading, followed by the answer text of its chain output.
for entry in search_results:
    for search_type, response in entry.items():
        print(search_type.upper())
        print(response['result'])

SIMILARITY
- Multi-head attention has n^2 complexity because it involves computing attention across all pairs of positions in the sequence, resulting in a quadratic number of operations.
- The model employs h=8 parallel attention layers, or heads, each attending to different representation subspaces at different positions.


In [15]:
# Print each search type as an upper-case heading, then hand its source documents to pretty_print_docs.
for entry in search_results:
    for search_type, response in entry.items():
        print(search_type.upper())
        pretty_print_docs(response['source_documents'])

SIMILARITY
Document 1 page 3 from data/raw/AttentionIsAllYouNeed.pdf:

a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is
much faster and more space-efficient in practice, since it can be implemented using highly optimized
matrix multiplication code.
While for small values of dkthe two mechanisms perform similarly, additive attention outperforms
dot product attention without scaling for larger values of dk[3]. We suspect that for large values of
dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has
extremely small gradients4. To counteract this effect, we scale the dot products by1√dk.
3.2.2 Multi-Head Attention
Instead of performing a single attention function with dmodel-dimensional keys, values and queries,
----------------------------------------------------------------------------------------------------
Document 2 page 4 from data/raw/AttentionIsAllYouNeed.pdf:

output values. These a