## Load Questions

In [1]:
# Mount Google Drive inside the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder; later cells use the
# relative path "questions.csv" and import the local `ingest` / `run_rag`
# modules (presumably both live in this folder -- confirm).
%cd /content/drive/MyDrive/RAG_Optimizer/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/RAG_Optimizer


In [2]:
# Install / upgrade the third-party packages used by this notebook.
# NOTE(review): versions are not pinned and --ignore-installed reinstalls
# every package, so results may differ between runs -- consider pinning.
!pip install -U --ignore-installed \
  torch \
  transformers \
  sentence-transformers \
  langchain \
  langchain-community \
  faiss-cpu \
  rank-bm25 \
  pandas \
  numpy \
  scikit-learn \
  spacy \
  tqdm

Collecting torch
  Using cached torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain
  Using cached langchain-1.2.0-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-community
  Using cached langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting rank-bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting scikit-learn
  Using cached scik

In [3]:
import json
import pandas as pd
import re
from itertools import product
import spacy
from collections import Counter
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from ingest import ingest_data
from run_rag import run_rag_pipeline



In [4]:
# Evaluation set; the dataset-generation loop reads the "question", "answer"
# and "context" columns of every row.
samples = pd.read_csv("questions.csv")

## Core Combinations

In [3]:
# Option lists whose cartesian product forms the experiment grid.
RETRIEVERS = ["faiss", "bm25", "hybrid"]  # passed as `retriever` to ingest_data / run_rag_pipeline
CHUNK_SIZE = [400, 800]                   # passed as `chunk_size` to ingest_data (unit not visible here -- confirm in ingest.py)
ANS_MODEL = ["qa", "llm"]                 # "qa" selects qa_model, anything else selects llm
K = [3, 5]                                # passed as `k` to run_rag_pipeline (presumably number of retrieved chunks)
RERANKER = [True, False]                  # passed as `need_rerank` to run_rag_pipeline

In [5]:
# Build the experiment grid: one dict per combination of the option lists,
# minus the combinations filtered out below. `product` comes from the
# notebook's import cell.
configs = []
config_id = 0

# The answer-model loop variable is named `ans_model`, not `model`: loop
# variables leak into the notebook namespace, and `model` is the module-level
# SentenceTransformer used by sementic_similarity(). Re-running this cell with
# a loop variable called `model` would overwrite it with a plain string.
for ret, chk_size, ans_model, k, rerank in product(
    RETRIEVERS,
    CHUNK_SIZE,
    ANS_MODEL,
    K,
    RERANKER,
):
    # QA models don't perform well with large k values.
    # (With the current K list this guard never fires; it only matters if
    # values above 5 are added.)
    if ans_model == "qa" and k > 5:
        continue
    # Reranking is never combined with the BM25 retriever.
    if ret == "bm25" and rerank:
        continue
    # Reranking is only evaluated with k >= 5.
    if rerank and k < 5:
        continue
    # The QA model is not evaluated with hybrid retrieval plus reranking.
    if ans_model == "qa" and ret == "hybrid" and rerank:
        continue

    configs.append({
        "config_id": config_id,
        "Retriever": ret,
        "Chunk_size": chk_size,
        "Chunk_overlap": 100,  # fixed for every configuration
        "k": k,
        "Answer_model": ans_model,
        "Reranker": rerank,
    })
    config_id += 1

In [6]:
# Number of configurations that survived the grid filters.
len(configs)

30

## Functions for question feature extraction

In [7]:
def normalize(s):
    """Normalize text for token-level comparison.

    Lowercases the string, removes the articles "a", "an" and "the", drops
    every character that is neither a word character nor whitespace, and
    collapses runs of whitespace into single spaces.

    Args:
        s: Input string.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    # Lowercase first: the article pattern is case-sensitive, so removing
    # articles before lowercasing would leave "The", "A" and "An" in place.
    s = s.lower()
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    s = re.sub(r"[^\w\s]", "", s)
    s = re.sub(r"\s+", " ", s)

    return s.strip()

In [8]:
def question_type(question):
    """Classify a question into a coarse type using keyword rules.

    Rules are checked in order and the first match wins:
      1. contains the word "explain", "describe", "why" or "how"
         -> "explanatory"
      2. contains the word "list", "all", "authors" or "members"
         -> "factoid_list"
      3. starts with "who", "when" or "what" -> "factoid_single"
      4. otherwise -> "other"

    Args:
        question: The question text.

    Returns:
        One of "explanatory", "factoid_list", "factoid_single", "other".
    """
    q = question.lower().strip()
    # Match whole words rather than substrings; substring matching would
    # treat "shows" as containing "how" and "smallest" as containing "all".
    words = set(re.findall(r"\w+", q))

    if words & {"explain", "describe", "why", "how"}:
        return "explanatory"

    if words & {"list", "all", "authors", "members"}:
        return "factoid_list"

    if q.startswith(("who", "when", "what")):
        return "factoid_single"

    return "other"

In [9]:
# spaCy pipeline "en_core_web_sm", loaded once and reused by entity_count().
nlp = spacy.load("en_core_web_sm")


def entity_count(question):
    """Return how many entities the spaCy pipeline finds in ``question``."""
    entities = nlp(question).ents
    return len(entities)

In [10]:
def get_features(question):
    """Build the feature dictionary describing a single question.

    Returns a dict with:
        "tokens_length": number of whitespace-separated tokens in the
            normalized question,
        "isdigit": True when any character of the raw question is a digit,
        "num_entities": result of entity_count() on the raw question,
        "question_type": result of question_type() on the raw question.
    """
    normalized_tokens = normalize(question).split()
    has_digit = any(ch.isdigit() for ch in question)

    features = {}
    features["tokens_length"] = len(normalized_tokens)
    features["isdigit"] = has_digit
    features["num_entities"] = entity_count(question)
    features["question_type"] = question_type(question)
    return features

## Dataset Generation

In [11]:
# F1 score for the plain QA models
def f1(truth, pred):
    """Token-level F1 between a ground-truth answer and a prediction.

    Both strings are passed through normalize() and split on whitespace.
    Precision and recall are computed over the multiset of shared tokens
    and combined as their harmonic mean.

    Args:
        truth: Ground-truth answer text.
        pred: Predicted answer text.

    Returns:
        A float in [0.0, 1.0]. If either side has no tokens after
        normalization, the score is 1.0 when both are empty and 0.0
        otherwise.
    """
    pred_tokens = normalize(pred).split()
    truth_tokens = normalize(truth).split()

    # Handle empty inputs up front and return a float, not a bool, so the
    # "f1" column keeps a single numeric type.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Counter intersection keeps the minimum count of each shared token.
    commons = Counter(pred_tokens) & Counter(truth_tokens)
    num_commons = sum(commons.values())

    if num_commons == 0:
        return 0.0

    precision = num_commons / len(pred_tokens)
    recall = num_commons / len(truth_tokens)

    # Harmonic mean: the denominator is precision + recall, not the token counts.
    return 2 * precision * recall / (precision + recall)

In [12]:
# Sentence-embedding model shared by sementic_similarity().
model = SentenceTransformer("all-MiniLM-L6-v2")


# Semantic similarity, used to score LLM answers
def sementic_similarity(truth, pred):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level ``model`` and the
    single similarity value is returned as a Python float.
    """
    truth_vec, pred_vec = (
        model.encode(text, convert_to_tensor=True) for text in (truth, pred)
    )
    similarity = util.cos_sim(truth_vec, pred_vec)
    return similarity.item()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
# Cross-encoder handed to run_rag_pipeline as `reranker`.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

# Embedding model handed to run_rag_pipeline as `embedding`.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Generative answer model ("llm" configs): a text2text pipeline wrapped for LangChain.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=512,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Extractive answer model ("qa" configs).
qa_model = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
)

Device set to use cpu
Device set to use cpu


In [18]:
# Key of the most recent ingestion: [retriever, chunk_size, chunk_overlap, context].
# The dataset-generation loop compares against it to decide whether ingest_data must run again.
metadata = None

In [19]:
# Run every configuration against every sample and collect one result row per pair.
rows = []

# Start from a clean ingestion key so a re-run of this cell (for example after
# an interrupted run) never trusts an index left over from a previous run.
metadata = None

# Question features depend only on the question text, not on the
# configuration, so compute them once per sample instead of once per
# (config, sample) pair.
sample_records = [
    (row["question"], row["answer"], row["context"], get_features(row["question"]))
    for _, row in samples.iterrows()
]

for cfg in configs:
    for question, truth, context, q_features in sample_records:
        ingest_key = [cfg["Retriever"], cfg["Chunk_size"], cfg["Chunk_overlap"], context]

        # Re-ingest only when the retriever, chunking settings or context changed.
        if metadata != ingest_key:
            ingest_data(
                context = context,
                chunk_size = cfg["Chunk_size"],
                chunk_overlap = cfg["Chunk_overlap"],
                retriever = cfg["Retriever"]
            )
            # Record the key only after ingestion returned, so an interrupted
            # or failed ingest is retried rather than skipped.
            metadata = ingest_key

        pred = run_rag_pipeline(
            retriever = cfg["Retriever"],
            k = cfg["k"],
            answer_model = qa_model if cfg["Answer_model"] == "qa" else llm,
            answer_type = "qa" if cfg["Answer_model"] == "qa" else "llm",
            reranker = reranker,
            need_rerank = cfg["Reranker"],
            embedding = embedding,
            question = question
        )

        # Both metrics are computed for every row, whichever answer model produced `pred`.
        f1_score = f1(truth, pred)
        # Arguments follow the (truth, pred) order of the function signature.
        sementic_similarity_score = sementic_similarity(truth, pred)

        rows.append({
            "question" : question,
            "truth" : truth,
            "prediction" : pred,
            "f1" : f1_score,
            "sementic_similarity" : sementic_similarity_score,
            **q_features,
            **cfg
        })


Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 