In [None]:
!pip install langchain langchain_community langchain_groq langchain_chroma chromadb sentence-transformers langchain_huggingface pypdf



In [None]:
#Importing required libraries
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma

In [None]:
# Read the Groq API key through google.colab's `userdata` helper under the name
# "GROQ_API_KEY". The import means this cell only runs inside Google Colab.
from google.colab import userdata
groq_api_key = userdata.get("GROQ_API_KEY")

In [None]:
#Download the data
!wget "https://arxiv.org/pdf/1706.03762.pdf"

--2025-09-16 07:42:35--  https://arxiv.org/pdf/1706.03762.pdf
Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.131.42, 151.101.195.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /pdf/1706.03762 [following]
--2025-09-16 07:42:36--  https://arxiv.org/pdf/1706.03762
Reusing existing connection to arxiv.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 2215244 (2.1M) [application/pdf]
Saving to: ‘1706.03762.pdf’


2025-09-16 07:42:36 (29.3 MB/s) - ‘1706.03762.pdf’ saved [2215244/2215244]



In [None]:
# Load the downloaded PDF; `loader.load()` returns the list of documents that
# the rest of the notebook works on.
loader = PyPDFLoader("1706.03762.pdf")
documents = loader.load()

In [None]:
# Number of documents the loader produced.
len(documents)

15

In [None]:
# Keep only the first five documents to keep the demo small.
# NOTE: this rebinds `documents`, so the full list from the loader is no longer
# reachable afterwards; re-run the loading cell to get it back.
documents = documents[:5]
len(documents)

5

In [None]:
# Inspect the raw extracted text of the second document (index 1).
documents[1].page_content

'1 Introduction\nRecurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks\nin particular, have been firmly established as state of the art approaches in sequence modeling and\ntransduction problems such as language modeling and machine translation [ 35, 2, 5]. Numerous\nefforts have since continued to push the boundaries of recurrent language models and encoder-decoder\narchitectures [38, 24, 15].\nRecurrent models typically factor computation along the symbol positions of the input and output\nsequences. Aligning the positions to steps in computation time, they generate a sequence of hidden\nstates ht, as a function of the previous hidden state ht−1 and the input for position t. This inherently\nsequential nature precludes parallelization within training examples, which becomes critical at longer\nsequence lengths, as memory constraints limit batching across examples. Recent work has achieved\nsignificant improvements in computational efficiency t

In [None]:
# Break the loaded documents into chunks of at most 500 characters,
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)
docs = text_splitter.split_documents(documents)
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '1706.03762.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu'),
 Document(metadata={'producer': 'p

In [None]:
# Inspect the text of the fourth chunk (index 3) produced by the splitter.
docs[3].page_content

'training for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the literature. We show that the Transformer generalizes well to\nother tasks by applying it successfully to English constituency parsing both with\nlarge and limited training data.\n∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started\nthe effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and'

In [None]:
# Embedding model: BAAI/bge-base-en-v1.5, run on the CPU.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
# Options forwarded to the model's encode step. The value is unchanged (False),
# so the vectors are the same as before these options were wired in.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,  # was defined above but never passed in
)

In [None]:
# Create the vector store from the chunks.
# Each chunk gets a deterministic id based on its position. Without explicit ids
# every run of this cell inserts the chunks again under fresh random ids, so the
# in-process collection accumulates duplicates and the retriever can return the
# same chunk several times. With stable ids a re-run overwrites the same entries.
chunk_ids = [f"chunk-{index}" for index in range(len(docs))]
vectorstore = Chroma.from_documents(docs, embeddings, ids=chunk_ids)

In [None]:
# Retriever over the vector store; search_kwargs {"k": 2} asks for two results per query.
vectorstore_retriever = vectorstore.as_retriever(search_kwargs = {"k": 2})

In [None]:
# Chat model accessed through Groq, authenticated with the API key loaded earlier in the notebook.
llm = ChatGroq(model = "gemma2-9b-it", api_key = groq_api_key)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt for the query-generation step: given the user's {question}, ask the
# model for three related search queries.
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Query-generation chain: prompt -> LLM -> plain string -> list of query strings.
# The model output is split on newlines; blank or whitespace-only lines are
# dropped (and surrounding whitespace trimmed) so that the retriever is never
# called with an empty query.
generate_query = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one using Reciprocal Rank Fusion.

    Every appearance of a document at 0-based position ``rank`` in one of the
    lists adds ``1 / (rank + k)`` to that document's score, so a document that
    ranks highly in many lists ends up with the largest total.

    Args:
        results: One ranked list of documents per query, best match first.
            Documents are identified by their ``dumps`` serialization, so two
            documents that serialize to the same string are treated as one.
        k: Smoothing constant of the RRF formula. Because ranks start at 0,
            the top document of a list contributes ``1 / k``.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by score, highest
        first. Documents with equal scores keep the order in which they were
        first seen. An empty ``results`` yields an empty list.
    """
    # Serialized document -> accumulated fused score.
    fused_scores = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # Serialize so the document can serve as a dictionary key.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # sorted() is stable, so ties preserve first-seen order.
    ranked_items = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

    # Turn each key back into a document and pair it with its score.
    return [(loads(doc_key), score) for doc_key, score in ranked_items]

# Retrieval step of RAG-Fusion: generate the search queries, apply the retriever
# to each of them via .map(), then merge the ranked lists with reciprocal_rank_fusion.
retrieval_chain_rag_fusion = generate_query | vectorstore_retriever.map() | reciprocal_rank_fusion
# question = "What is GLU?"
# docs = retrieval_chain_rag_fusion.invoke({"question": question})
# len(docs)

In [None]:
#RAG Pipeline
from langchain_core.runnables import RunnablePassthrough

from operator import itemgetter

# Prompt for the final answer: the fused retrieval results plus the user's question.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# The chain is invoked with a dict of the form {"question": "..."}.
# - "context": the retrieval chain receives that dict unchanged, which is the
#   input its query-generation prompt expects.
# - "question": itemgetter("question") extracts just the question string.
#   RunnablePassthrough() would forward the whole dict, and the prompt would
#   then render "Question: {'question': '...'}" instead of the question itself.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

response = final_rag_chain.invoke({"question": "What is attention?"})
print(response)

Based on the provided text, an attention function can be described as mapping a query and a set of key-value pairs to an output.  

Here's a breakdown:

* **Query:**  A vector representing what you're looking for.
* **Key-value pairs:** A set of data points, each with a key (used for matching) and a value (the actual information).
* **Output:** A vector representing the result, computed as a weighted sum of the values.


Essentially, attention allows a model to focus on relevant parts of the input data (the key-value pairs) when processing the query. 



In [None]:
# response = final_rag_chain.invoke({"question": "Recurrent neural network?"})
# print(response)


