<a href="https://colab.research.google.com/github/Soha85/MyKaggle/blob/main/Applying_RAG_to_Question_Answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**RAG-Based Question Answering Model**

Retrieval-Augmented Generation (RAG) combines large language models (LLMs) with a document retrieval system to provide more accurate and contextually relevant responses. Instead of relying purely on the internal knowledge of the LLM, RAG pulls external data from a corpus or knowledge base and uses that information to generate the answer. This notebook is a step-by-step guide, with sample Python code, that demonstrates RAG for teaching purposes.

**How Does RAG Work?**
* **Retrieve** grounding data based on the initial user-entered prompt.
* **Augment** the prompt with grounding data.
* **Generate** a grounded response using a language model.

In [23]:
# Install the third-party packages this notebook imports.
# NOTE(review): only sentence-transformers is version-pinned here; the other
# packages install whatever release is current, so results may drift over time.
!pip install transformers
!pip install -U sentence-transformers==2.2.2
!pip install scikit-learn
!pip install beautifulsoup4
!pip install faiss-cpu



In [24]:
import warnings
from tqdm.notebook import tqdm, trange
warnings.filterwarnings("ignore", category=UserWarning)
from transformers import BertTokenizer, BertModel, pipeline
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import faiss
import re
from google.colab import drive
from collections import Counter
# Mount Google Drive under /content/gdrive; read_data() loads its CSV from this mount.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
# Step 1: Load Data
def read_data(path='/content/gdrive/MyDrive/Colab Notebooks/train-squad.csv', n_rows=500):
    """Load question/answer pairs from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the copy stored on the
            mounted Google Drive, so ``read_data()`` behaves as before.
        n_rows: Number of leading rows to keep. ``None`` keeps every row.

    Returns:
        A DataFrame holding only the ``question`` and ``text`` columns,
        limited to the first ``n_rows`` rows.

    Raises:
        KeyError: If the file has no ``question`` or ``text`` column.
    """
    data = pd.read_csv(path)
    selected = data[['question', 'text']]
    if n_rows is None:
        return selected
    return selected.head(n_rows)

# Step 2: Preprocessing the Text
def preprocess_text(text):
    """Lowercase ``text`` and collapse each whitespace run into a single space.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string in which every run of whitespace (spaces, tabs,
        newlines, ...) has been replaced by exactly one space. Leading and
        trailing whitespace is collapsed, not stripped.
    """
    # `\s+` already matches newlines, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text.lower())

# Step 3: Chunking
def chunk_text(text, chunk_size=100):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Args:
        text: The string to split; words are separated on whitespace.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each the space-joined words of one chunk, in the
        original order. An input without words yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# Step 4: Embedding Generation using BERT
# Load the pretrained 'bert-base-uncased' tokenizer and model once at module
# level; get_bert_embeddings() reads both of them as globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Embed ``text`` with the module-level BERT tokenizer and model.

    The input is tokenized with padding and truncation to at most 512 tokens,
    run through ``bert_model`` without gradient tracking, and represented by
    the final hidden state at sequence position 0 (the first token of each
    encoded input).

    Args:
        text: The text to embed, passed straight to ``tokenizer``.

    Returns:
        A NumPy array holding the position-0 hidden state of every encoded
        input, one row per input.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.numpy()

# Step 5: Embedding Generation and FAISS Integration
def create_faiss_index(embedding_dim):
    """Create an empty flat FAISS index that compares vectors by L2 distance.

    Args:
        embedding_dim: Dimensionality of the vectors the index will store.

    Returns:
        A new ``faiss.IndexFlatL2`` instance for ``embedding_dim``-sized vectors.
    """
    return faiss.IndexFlatL2(embedding_dim)

def save_embeddings_to_faiss(questions, contexts, faiss_index):
    """Chunk, embed and index every question/context pair.

    Each question is joined to its context with a single space, preprocessed
    with ``preprocess_text``, split with ``chunk_text``, and every resulting
    chunk is embedded with ``get_bert_embeddings`` and added to ``faiss_index``.

    Args:
        questions: Iterable of question strings.
        contexts: Iterable of context strings, paired positionally with
            ``questions`` (extra items on the longer side are ignored by ``zip``).
        faiss_index: Index object whose ``add`` method receives the embeddings.

    Returns:
        The list of chunk strings, in the same order as the vectors added to
        the index, so position ``i`` in the list matches vector ``i``.
        When there is nothing to index, an empty list is returned and
        ``faiss_index`` is left untouched.
    """
    chunked_texts = []

    # Process each question-context pair as one combined block of text.
    for question, context in zip(questions, contexts):
        combined_text = question + " " + context
        chunked_texts.extend(chunk_text(preprocess_text(combined_text)))

    # np.vstack raises ValueError on an empty sequence, so stop early when
    # the inputs produced no chunks at all.
    if not chunked_texts:
        return chunked_texts

    # FAISS requires a float32 matrix with one row per chunk.
    all_embeddings = np.vstack(
        [get_bert_embeddings(chunk) for chunk in chunked_texts]
    ).astype('float32')

    faiss_index.add(all_embeddings)

    return chunked_texts

# Step 6: Document Retrieval from FAISS
def retrieve_documents_faiss(query, faiss_index, chunked_texts, k=1):
    """Return the indexed chunks closest to ``query``.

    Args:
        query: The query text; embedded with ``get_bert_embeddings``.
        faiss_index: Index to search; its ``search`` method is called with the
            query embedding and ``k``.
        chunked_texts: Chunk strings aligned with the vectors in
            ``faiss_index`` (position ``i`` describes vector ``i``).
        k: Number of neighbours to request.

    Returns:
        A list of ``(document, score)`` tuples in the order returned by the
        index, where ``score`` is the distance reported by the search. The
        list holds fewer than ``k`` items when the index cannot supply ``k``
        neighbours.
    """
    query_embedding = get_bert_embeddings(query)
    distances, indices = faiss_index.search(query_embedding, k)

    results = []
    for idx, score in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with label -1; indexing the list with
        # -1 would silently return the last chunk, so skip those slots.
        if idx < 0:
            continue
        results.append((chunked_texts[idx], score))

    return results

# F1 Score Calculation
def compute_f1(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` by token overlap.

    Both strings are passed through ``normalize_text`` and split on
    whitespace; the overlap is the multiset intersection of the two token
    lists.

    Args:
        prediction: The generated answer text.
        ground_truth: The reference answer text.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. All three are ``0.0``
        when the two texts share no token.
    """
    predicted = normalize_text(prediction).split()
    expected = normalize_text(ground_truth).split()

    shared = Counter(predicted) & Counter(expected)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0.0, 0.0, 0.0

    precision = overlap / len(predicted)
    recall = overlap / len(expected)
    return precision, recall, (2 * precision * recall) / (precision + recall)

# Text normalization for comparison
def normalize_text(text):
    """Lowercase ``text`` and replace every non-word character with a space.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with each character matched by ``\\W`` turned
        into one space (runs are not collapsed).
    """
    lowered = text.lower()
    return re.sub(r'\W', ' ', lowered)


# Step 7: RAG Pipeline with FAISS-based Retrieval and QA Model (Updated)

# Question-answering pipelines already built, keyed by model name.
_QA_PIPELINES = {}


def _get_qa_pipeline(model_name="bert-large-uncased-whole-word-masking-finetuned-squad"):
    """Return the question-answering pipeline for ``model_name``, building it once."""
    if model_name not in _QA_PIPELINES:
        _QA_PIPELINES[model_name] = pipeline("question-answering", model=model_name, batch_size=512)
    return _QA_PIPELINES[model_name]


def rag_generate_answer(query, real_answer, faiss_index, chunked_texts, k=1):
    """Answer ``query`` from retrieved chunks and score it against ``real_answer``.

    Args:
        query: The question to answer.
        real_answer: The reference answer used only for scoring.
        faiss_index: Index searched by ``retrieve_documents_faiss``.
        chunked_texts: Chunk strings aligned with the vectors in ``faiss_index``.
        k: Number of chunks to retrieve and concatenate as context.

    Returns:
        A ``(generated_answer, precision, recall, f1)`` tuple, where the three
        metrics come from ``compute_f1(generated_answer, real_answer)``.
    """
    # Step 1: Retrieve the chunks closest to the question.
    retrieved_docs = retrieve_documents_faiss(query, faiss_index, chunked_texts, k=k)

    # Step 2: Concatenate the retrieved chunks to form the context.
    context = ' '.join(doc for doc, _score in retrieved_docs)

    # Step 3: Extract an answer from the context. The pipeline is cached so the
    # model is loaded once rather than on every call. An empty context is
    # answered with an empty string because the question-answering pipeline
    # rejects empty input.
    if context:
        generated_answer = _get_qa_pipeline()(question=query, context=context)['answer']
    else:
        generated_answer = ''

    # Step 4: Compute token-overlap precision, recall and F1.
    precision, recall, f1 = compute_f1(generated_answer, real_answer)

    return generated_answer, precision, recall, f1

# --- Main Execution ---
from transformers import logging
logging.set_verbosity_error()
QA = read_data()

# Build an empty FAISS index sized to the BERT hidden dimension.
embedding_dim = bert_model.config.hidden_size
faiss_index = create_faiss_index(embedding_dim)

# Preprocess, chunk and index every question together with its "text" value.
# NOTE(review): the same rows are used as evaluation queries below, and the
# "text" value indexed here is also the reference answer, so retrieval can
# see the answer it is scored against -- confirm this is intended.
questions = QA["question"]
contexts = QA["text"]
chunked_texts = save_embeddings_to_faiss(questions, contexts, faiss_index)

# Per-example answers and metrics collected during evaluation.
real_answers, generated_answers = [], []
precisions, recalls, f1_scores = [], [], []

# Run the RAG pipeline on every question and score it against its reference.
for query, real_answer in zip(QA['question'], QA['text']):
    generated_answer, precision, recall, f1 = rag_generate_answer(
        query, real_answer, faiss_index, chunked_texts, k=1
    )

    real_answers.append(real_answer)
    generated_answers.append(generated_answer)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Average each metric over the whole dataset.
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1_scores) / len(f1_scores)

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1 Score: {avg_f1}")




Average Precision: 0.5945079365079365
Average Recall: 0.5901
Average F1 Score: 0.5899999000999001


In [26]:
# Draw one random question/answer row and run it through the RAG pipeline.
sample_row = QA.sample(n=1).iloc[0]
query, real_answer = sample_row['question'], sample_row['text']
generated_answer, precision, recall, f1 = rag_generate_answer(
    query, real_answer, faiss_index, chunked_texts, k=1
)

# Show the example next to its scores.
for label, value in (
    ("Query:", query),
    ("Real Answer:", real_answer),
    ("Generated Answer:", generated_answer),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


Query: Beyonce confirmed what after performing one of her songs?
Real Answer: her pregnancy
Generated Answer: her pregnancy
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
