### RAG Systems with Transformers

##### 1. Building the Document Indexing System

In [1]:
# %pip installs into the environment of the running kernel (a bare !pip may
# target a different interpreter); the version is pinned for reproducibility.
%pip install faiss-cpu==1.10.0

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [75]:
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

In [76]:
# The tokenizer and the encoder are loaded from the same checkpoint name.
EMBEDDING_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

In [77]:
def generate_embedding(docs, model, tokenizer):
    """Embed text by attention-masked mean pooling of the model's token states.

    Args:
        docs: A single string or a list of strings to embed.
        model: Encoder called as ``model(**inputs)`` whose output exposes
            ``last_hidden_state``.
        tokenizer: Callable that converts ``docs`` into PyTorch tensors,
            including an ``attention_mask`` entry.

    Returns:
        A float32 NumPy array holding one pooled vector per tokenized sequence.
    """
    # Tokenize, padding to the longest text and truncating at 512 tokens.
    inputs = tokenizer(docs, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Run the inputs on the device that holds the model weights; otherwise a
    # model moved to an accelerator fails with a device-mismatch error.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        inputs = {
            name: value.to(first_param.device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling: average the token vectors, ignoring padded positions.
    attention_mask = inputs["attention_mask"]
    embeddings = outputs.last_hidden_state

    expanded_mask = attention_mask.unsqueeze(-1).expand(embeddings.shape).float()
    sum_embeddings = torch.sum(embeddings * expanded_mask, dim=1)
    # Clamp so a fully masked row cannot cause a division by zero.
    sum_mask = torch.clamp(expanded_mask.sum(dim=1), min=1e-9)
    mean_embeddings = sum_embeddings / sum_mask

    # FAISS only accepts float32 input, so cast before leaving torch.
    return mean_embeddings.float().cpu().numpy()

In [78]:
# Sample document collection
documents = [
    # One short factual sentence per transformer model; list positions are
    # what the retrieval index returns, so the order matters.
    "Transformers are a type of deep learning model introduced in the paper "
    "'Attention Is All You Need'.",
    "BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based "
    "model designed to understand the context of a word based on its surroundings.",
    "GPT (Generative Pre-trained Transformer) is a transformer-based model "
    "designed for natural language generation tasks.",
    "T5 (Text-to-Text Transfer Transformer) treats every NLP problem as a "
    "text-to-text problem, where both the input and output are text strings.",
    "RoBERTa is an optimized version of BERT with improved training "
    "methodology and more training data.",
    "DistilBERT is a smaller, faster version of BERT that retains 97% of "
    "its language understanding capabilities.",
    "ALBERT reduces the parameters of BERT by sharing parameters across "
    "layers and using embedding factorization.",
    "XLNet is a generalized autoregressive pretraining method that overcomes "
    "the limitations of BERT by using permutation language modeling.",
    "ELECTRA uses a generator-discriminator architecture for more efficient pretraining.",
    "DeBERTa enhances BERT with disentangled attention and an enhanced mask decoder.",
]

In [79]:
### Generate embeddings for all documents, then create a FAISS index for efficient similarity search

In [80]:
# Embed the whole corpus in one call, then store the vectors in a flat FAISS
# index that compares them by L2 (Euclidean) distance.
document_embeddings = generate_embedding(documents, model, tokenizer)
dimension = document_embeddings.shape[-1]  # width of one embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)  # register every document vector with the index
print(f"Create index with {index.ntotal} documents")

Create index with 10 documents


In [81]:
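### If you intended to use cosine similarity instead (requires `import numpy as np`):
### L2-normalize the embeddings and add them to a fresh index; query embeddings must be normalized the same way
# document_embeddings = generate_embedding(documents, model, tokenizer)
# normalized = document_embeddings / np.linalg.norm(document_embeddings, axis=1, keepdims=True)
# index = faiss.IndexFlatIP(dimension)  # inner product of unit vectors = cosine similarity
# index.add(normalized)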

### Implement the retrieval system

In [82]:
def retrieve_documents(query, index, documents, k=3):
    ## generate embeding for the query
    query_embedding = generate_embedding(query, model, tokenizer)
    distances, indices = index.search(query_embedding, k)
    retrieved_docs = [(documents[idx], float(distances[0][i])) for i , idx in enumerate(indices[0])]
    return retrieved_docs  

In [83]:
#### example query

query = "What is BERT?"
retrieved_docs = retrieve_documents(query, index, documents)

# Show every hit with its distance to the query, followed by a blank line.
print(f"Query: {query}\n")
for i, (doc, distance) in enumerate(retrieved_docs):
    print(f"Document {i+1} (Distance: {distance:.4f}):", doc, "", sep="\n")

Query: What is BERT?

Document 1 (Distance: 23.7060):
BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model designed to understand the context of a word based on its surroundings.

Document 2 (Distance: 28.0793):
RoBERTa is an optimized version of BERT with improved training methodology and more training data.

Document 3 (Distance: 29.5908):
DistilBERT is a smaller, faster version of BERT that retains 97% of its language understanding capabilities.



In [84]:
from transformers import AutoModelForSeq2SeqLM

# The generator's tokenizer and seq2seq model are loaded from the same checkpoint name.
GENERATOR_CHECKPOINT = "t5-small"
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_CHECKPOINT)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_CHECKPOINT)

def generate_response(query, retrieved_docs, max_length=150):
    """Generate an answer to ``query`` from the retrieved passages.

    Uses the module-level ``gen_tokenizer`` and ``gen_model``.

    Args:
        query: Question text.
        retrieved_docs: Iterable of document strings; they are joined with
            newlines to form the context part of the prompt.
        max_length: Forwarded to ``gen_model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Prompt layout: the question first, then the passages separated by newlines.
    context = "\n".join(retrieved_docs)
    prompt = f"question: {query} context: {context}"

    # Encode the prompt, truncating it at 512 tokens.
    encoded = gen_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = {
        "max_length": max_length,
        "num_beams": 4,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }
    with torch.no_grad():
        generated = gen_model.generate(encoded.input_ids, **generation_options)

    return gen_tokenizer.decode(generated[0], skip_special_tokens=True)


In [85]:
# Generate a response for the example query
# Only the document texts go to the generator; the distances are dropped.
response = generate_response(query, [text for text, _ in retrieved_docs])
print("Generated Response:", response, sep="\n")

Generated Response:
Bidirectional Encoder Representations from Transformers


In [86]:
#### Complete RAG System

def rag_pipeline(query, documents, retriever_k=3, max_length=150):
    """Answer ``query`` by retrieving documents and generating from them.

    Retrieval searches the module-level ``index``.

    Args:
        query: Question text.
        documents: Document collection passed to ``retrieve_documents``.
        retriever_k: Number of documents to request from the retriever.
        max_length: Forwarded to ``generate_response``.

    Returns:
        A ``(response, retrieved_docs)`` tuple, where ``retrieved_docs`` is the
        list of ``(document, distance)`` pairs produced by the retriever.
    """
    hits = retrieve_documents(query, index, documents, k=retriever_k)
    # The generator only needs the texts, not the distances.
    passages = [text for text, _ in hits]
    answer = generate_response(query, passages, max_length=max_length)
    return answer, hits

In [87]:
# Example queries
queries = [
    "What is BERT?",
    "How does GPT work?",
    "What is the difference between BERT and GPT?",
    "What is a smaller version of BERT?",
]

# For each query: run the full pipeline, then print the hits and the answer.
for query in queries:
    response, retrieved_docs = rag_pipeline(query, documents)

    print(f"Query: {query}", "", "Retrieved Documents:", sep="\n")
    for i, (doc, distance) in enumerate(retrieved_docs):
        print(f"Document {i+1} (Distance: {distance:.4f}):", doc, sep="\n")
    print("", "Generated Response:", response, "-" * 20, sep="\n")

Query: What is BERT?

Retrieved Documents:
Document 1 (Distance: 23.7060):
BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model designed to understand the context of a word based on its surroundings.
Document 2 (Distance: 28.0793):
RoBERTa is an optimized version of BERT with improved training methodology and more training data.
Document 3 (Distance: 29.5908):
DistilBERT is a smaller, faster version of BERT that retains 97% of its language understanding capabilities.

Generated Response:
Bidirectional Encoder Representations from Transformers
--------------------
Query: How does GPT work?

Retrieved Documents:
Document 1 (Distance: 33.8010):
GPT (Generative Pre-trained Transformer) is a transformer-based model designed for natural language generation tasks.
Document 2 (Distance: 52.0256):
BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model designed to understand the context of a word based on its surrounding