### Data Ingestion

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv

# Load settings from a local .env file into the environment before any client is built.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

Bad pipe message: %s [b' 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Sa']
Bad pipe message: %s [b'ri/537.36\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/', b'ng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\r\nAccept-Encoding: gzip, deflate, br, zstd\r\nA']
Bad pipe message: %s [b'ept-Language: en-US,en;q=0.9\r\nPriority: u=0, i\r\nReferer: https://studio.firebase.google.com/\r\nSec-', b'-Ua: "Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v=']
Bad pipe message: %s [b'4"\r\nSec-Ch-Ua-Arch: "x86"\r\nSec-Ch']
Bad pipe message: %s [b'a-Bitness: "64"\r\nSec-Ch-Ua-Form-Factors: "De', b'top"\r\nSec-Ch-Ua-Full-Version: "143.0.7499.147"\r\nSec-Ch-Ua-Full-Version-List: "Google Chrome";v="143.0.7499.147", "', b'romium";v="143.0.7499.147", "Not A(Brand";v="24.0.0.0"\r\nSec-Ch-Ua-']
Bad pipe message: %s [b'bile: ?0\r\nSec-Ch-Ua-Model: ""\r\nSec-Ch-Ua-Platform: "Windows"\r\nSec-Ch-Ua-Plat', b'rm-Version: "19

True

In [2]:
import os

# The sample PDF lives in a "data" folder next to wherever the kernel was started.
data_dir = os.path.join(os.getcwd(), "data")
file_path = os.path.join(data_dir, "transformer_paper.pdf")
file_path

'/home/user/document-portal/notebooks/data/transformer_paper.pdf'

In [3]:
# Create the PDF loader for the file located at file_path.
loader = PyPDFLoader(file_path)

In [4]:
# Read the PDF into LangChain Document objects.
# NOTE(review): the 'page' / 'total_pages' metadata shown later suggests one Document per page — confirm.
document = loader.load()

In [5]:
# Chunking parameters, measured in characters because length_function is len.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [6]:
# Split the loaded document(s) into overlapping chunks with the splitter configured above.
docs = text_splitter.split_documents(document)

In [7]:
# How many chunks the splitter produced.
len(docs)

115

In [8]:
# Peek at the first chunk to check that both text and metadata came through.
docs[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/home/user/document-portal/notebooks/data/transformer_paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.t

In [9]:
# Metadata carried by the first chunk (PDF properties, source path, page number).
docs[0].metadata

{'producer': 'pdfTeX-1.40.25',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2024-04-10T21:11:43+00:00',
 'author': '',
 'keywords': '',
 'moddate': '2024-04-10T21:11:43+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'subject': '',
 'title': '',
 'trapped': '/False',
 'source': '/home/user/document-portal/notebooks/data/transformer_paper.pdf',
 'total_pages': 15,
 'page': 0,
 'page_label': '1'}

In [10]:
# Model identifiers kept in one place so they are easy to swap.
CHAT_MODEL_NAME = "gpt-4o-mini"
EMBEDDING_MODEL_NAME = "text-embedding-3-small"

# Chat model that writes the answer, and the embedding model used for indexing and querying.
llm = ChatOpenAI(model=CHAT_MODEL_NAME)
emb_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [11]:
# Build a FAISS vector store from the chunks, using emb_model to embed them.
# NOTE(review): this presumably calls the embeddings API on every run — consider caching the index.
vector_store = FAISS.from_documents(docs, emb_model)

In [12]:
# Sanity-check retrieval directly against the vector store: request the top 5 matches.
query = "what do you mean by multi-head attention?"
sim_docs = vector_store.similarity_search(query, k=5)

In [13]:
# Display the chunks returned by the direct similarity search.
sim_docs

[Document(id='94bb683d-01d8-454d-a1ed-7c8e714d55ec', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/home/user/document-portal/notebooks/data/transformer_paper.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}, page_content='to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,'),
 Document(id='2b5ba0f6-a53e

In [14]:
# Expose the vector store as a retriever so it can be piped into a chain.
# No search arguments are passed, so the library's default search settings apply.
retriever = vector_store.as_retriever()

In [15]:
# Same question as the direct similarity search, this time through the retriever interface.
retriever.invoke("what do you mean by multi-head attention?")

[Document(id='94bb683d-01d8-454d-a1ed-7c8e714d55ec', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/home/user/document-portal/notebooks/data/transformer_paper.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}, page_content='to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,'),
 Document(id='2b5ba0f6-a53e

In [16]:
# Prompt text for the RAG chain. {context} and {question} are the two placeholders
# declared as input variables when the PromptTemplate is built.
# The leading newline and the 4-space indentation are part of the string itself
# (they show up verbatim in the PromptTemplate repr), so edit whitespace with care.
prompt_template = """
    Answer the question based on the context provided below.
    If the context does not contain sufficient information, respond with:
    "I do not have enough information about this."

    Context: {context}

    Question: {question}

    Answer:
    """

In [17]:
# Bind the template text to the two placeholders it contains.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [18]:
# Show the constructed PromptTemplate to verify its variables and template text.
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n    Answer the question based on the context provided below.\n    If the context does not contain sufficient information, respond with:\n    "I do not have enough information about this."\n\n    Context: {context}\n\n    Question: {question}\n\n    Answer:\n    ')

In [20]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string
            attribute (e.g. the documents returned by the retriever).

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    page_texts = (chunk.page_content for chunk in docs)
    return "\n\n".join(page_texts)

In [21]:
# Output parser placed at the end of the chain; the chain's final result below is a plain str.
parser = StrOutputParser()

In [22]:
# The incoming question feeds two prompt inputs: "context" is the retriever's
# results rendered to text by format_docs, "question" is the question itself.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Input mapping -> prompt -> chat model -> output parser.
rag_chain = chain_inputs | prompt | llm | parser

In [23]:
# Ask the same question end to end: retrieve context, fill the prompt, call the model, parse the reply.
rag_chain.invoke("what do you mean by multi-head attention?")

'Multi-head attention is an attention mechanism used in models like the Transformer that allows the model to jointly attend to information from different representation subspaces at various positions. Instead of using a single attention head that may average information and potentially miss specific details, multi-head attention runs several attention layers in parallel. This enables the model to focus on different aspects of the input sequence simultaneously, enhancing its ability to capture complex patterns and relationships. The output of each attention head is concatenated and projected to produce the final values.'