## Load Questions

In [1]:
import json

In [2]:
def load(max_q, path="dataset/squad.json"):
    """Load at most ``max_q`` question samples from a SQuAD-style JSON file.

    Parameters
    ----------
    max_q : int
        Maximum number of samples to return. Values <= 0 yield an empty list.
    path : str, optional
        Location of the JSON file. Defaults to ``"dataset/squad.json"``.

    Returns
    -------
    list[dict]
        One dict per question with the keys ``"question"``, ``"answer"`` and
        ``"context"``. ``"answer"`` is the text of the first listed answer, or
        ``None`` when the question has an empty answer list. The list is
        shorter than ``max_q`` when the file contains fewer questions.
    """
    samples = []

    # Parse the whole file up front so the handle is released before iterating.
    # The encoding is explicit so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]

    for article in data:
        for para in article["paragraphs"]:
            context = para["context"]

            for qa in para["qas"]:
                # Check the limit before appending so max_q == 0 returns nothing.
                if len(samples) >= max_q:
                    return samples

                answers = qa["answers"]
                samples.append({
                    "question": qa["question"],
                    "answer": answers[0]["text"] if answers else None,
                    "context": context,
                })

    # Reached when the file holds fewer than max_q questions; without this
    # return the function would silently hand back None.
    return samples

In [3]:
# Build the working sample set by asking `load` for 70 questions.
samples = load(70)

## Core Combinations

In [4]:
# Search space for the configuration sweep: the config-generation cell below
# takes the cartesian product of these six lists.
# Retrieval backends to compare.
RETRIEVERS = ["faiss", "bm25", "hybrid"]
# Chunk sizes to try (unit is not shown in this notebook section; confirm
# whether these are characters or tokens).
CHUNK_SIZE = [200, 400, 800]
# NOTE(review): an overlap of 200 equals the smallest CHUNK_SIZE; confirm the
# chunker accepts overlap == size before running that combination.
CHUNK_OVERLAP = [100, 200]
# Answer-generation back ends; "qa" combined with k > 5 is skipped when the
# configs are built.
ANS_MODEL = ["qa", "llm"]
# Values of k to sweep (presumably the number of retrieved chunks; confirm).
K = [3, 5, 10]
# Whether a reranker is enabled for the configuration.
RERANKER = [True, False]

In [17]:
from itertools import product

# Enumerate every pipeline configuration in the search space. `configs` ends up
# holding one dict per combination and `config_id` ends up equal to len(configs).
configs = []
config_id = 0

# The loop variables are deliberately NOT named `re` or `model`: this cell runs
# at module level, so those names would rebind the `re` regex module and the
# SentenceTransformer `model` used by other cells to plain strings, breaking
# `normalize` and `sementic_similarity` whenever this cell is executed after them.
for retriever, chunk_size, chunk_overlap, answer_model, top_k, use_reranker in product(
    RETRIEVERS,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    ANS_MODEL,
    K,
    RERANKER
):
    # QA models don't perform well with large k values, so skip those combinations.
    if answer_model == "qa" and top_k > 5:
        continue

    configs.append({
        "config_id" : config_id,
        "Retriever" : retriever,
        "Chunk_size" : chunk_size,
        "Chunk_overlap" : chunk_overlap,
        "k" : top_k,
        "Answer_model" : answer_model,
        "Reranker" : use_reranker
    })
    config_id += 1

## Functions for question feature extraction

In [7]:
import re

def normalize(s):
    """Normalize text for token-level comparison.

    Steps, in order: lowercase, drop punctuation (anything that is neither a
    word character nor whitespace), remove the English articles "a", "an" and
    "the" as whole words, then collapse runs of whitespace and trim the ends.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The normalized text; may be empty.
    """
    # Lowercase first so capitalized articles ("The", "An") are removed too.
    s = s.lower()
    s = re.sub(r"[^\w\s]", "", s)
    # No spaces inside the alternation: the word boundaries already isolate the
    # article, and literal spaces would miss articles at the start or end of
    # the string.
    s = re.sub(r"\b(a|an|the)\b", " ", s)

    return " ".join(s.split())

In [11]:
def question_type(question):
    """Classify a question with simple keyword heuristics.

    Rules are checked in order and the first match wins:

    1. ``"explanatory"``: contains the word explain, describe, why or how.
    2. ``"factoid_list"``: contains the word list, all, authors or members.
    3. ``"factoid_single"``: starts with "who", "when", "what year" or
       "what is the id".
    4. ``"other"``: anything else.

    Keywords in rules 1 and 2 are matched as whole words, so "called" does not
    count as "all" and "shows" does not count as "how".

    Parameters
    ----------
    question : str
        The raw question text.

    Returns
    -------
    str
        One of ``"explanatory"``, ``"factoid_list"``, ``"factoid_single"``,
        ``"other"``.
    """
    q = question.lower().strip()

    # Turn every non-alphanumeric character into a separator so punctuation
    # ("why?", "list:") does not stick to the keyword.
    words = set("".join(ch if ch.isalnum() else " " for ch in q).split())

    if words & {"explain", "describe", "why", "how"}:
        return "explanatory"

    if words & {"list", "all", "authors", "members"}:
        return "factoid_list"

    if q.startswith(("who", "when", "what year", "what is the id")):
        return "factoid_single"

    return "other"

In [14]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline once at cell level; entity_count
# reuses this `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def entity_count(question):
    """Return the number of named entities the `nlp` pipeline finds in a question.

    Parameters
    ----------
    question : str
        The raw question text.

    Returns
    -------
    int
        Length of ``doc.ents`` for the processed question.
    """
    doc = nlp(question)

    # `ents` is an attribute holding the entity spans, not a method; calling
    # it raises TypeError.
    return len(doc.ents)

In [15]:
def get_features(question):
    """Extract the feature dict for a single question.

    Parameters
    ----------
    question : str
        The raw question text.

    Returns
    -------
    dict
        ``"tokens_length"``: number of whitespace-separated tokens after
        ``normalize``;
        ``"isdigit"``: True if any character of the raw question is a digit;
        ``"num_entities"``: result of ``entity_count`` for the question;
        ``"question_type"``: result of ``question_type`` for the question.
    """
    tokens = normalize(question).split()

    return {
        "tokens_length" : len(tokens),
        "isdigit" : any(c.isdigit() for c in question),
        # entity_count needs the question text; calling it with no argument
        # raises TypeError.
        "num_entities" : entity_count(question),
        "question_type" : question_type(question)
    }

## Dataset Generation

In [9]:
from collections import Counter

# f1 score for normal qa models
def f1(truth, pred):
    """Token-level F1 between a reference answer and a predicted answer.

    Both strings are passed through ``normalize`` and split on whitespace. The
    overlap is counted as a multiset intersection, so repeated tokens are only
    credited as often as they appear on both sides.

    Parameters
    ----------
    truth : str or None
        Reference answer. ``None`` (produced by ``load`` for questions with no
        answers) is treated as an empty string.
    pred : str or None
        Predicted answer. ``None`` is treated as an empty string.

    Returns
    -------
    float
        F1 in [0, 1]. If either side has no tokens, the score is 1.0 when both
        are empty and 0.0 otherwise.
    """
    pred_tokens = normalize(pred or "").split()
    truth_tokens = normalize(truth or "").split()

    # With an empty side, precision/recall are undefined; score exact agreement.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    # Sum the counts, not the keys: iterating a Counter yields its tokens.
    num_commons = sum(overlap.values())

    if num_commons == 0:
        return 0.0

    precision = num_commons / len(pred_tokens)
    recall = num_commons / len(truth_tokens)

    # Harmonic mean of precision and recall.
    return 2 * precision * recall / (precision + recall)

In [10]:
from sentence_transformers import SentenceTransformer, util

# Instantiate the "all-MiniLM-L6-v2" sentence-embedding model once at cell
# level; sementic_similarity reads this module-level `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")

# semantic similarity for LLMs
def sementic_similarity(truth, pred):
    """Score how close a prediction is to the reference via embedding cosine similarity.

    Each text is encoded separately with the module-level ``model`` (reference
    first, then prediction), the two embeddings are compared with
    ``util.cos_sim``, and the result is unwrapped with ``.item()``.

    Parameters
    ----------
    truth : str
        Reference answer.
    pred : str
        Predicted answer.

    Returns
    -------
    The value produced by ``.item()`` on the ``cos_sim`` result (presumably a
    Python float; confirm against the sentence-transformers version in use).
    """
    embeddings = [
        model.encode(text, convert_to_tensor = True)
        for text in (truth, pred)
    ]
    score = util.cos_sim(*embeddings)

    return score.item()