In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


In [2]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001-9df3a936e1f63191.parquet', 'test': 'data/test-00000-of-00001-af2a9f454ad1b8a3.parquet'}
df = pd.read_parquet("hf://datasets/neural-bridge/rag-dataset-12000/" + splits["train"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.columns
df.head()


Unnamed: 0,context,question,answer
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...
3,"How unequal is India? The question is simple, ...",What is the main difference between the Nation...,The main difference between the NSS and the IH...
4,Gunnar Nelson took his time on the feet agains...,How did Gunnar Nelson win the fight against Za...,Gunnar Nelson won the fight against Zak Cummin...


In [4]:
df_small = df.sample(300, random_state=42).reset_index(drop=True)


In [5]:
# Drop rows where question or context is missing
df_small = df_small.dropna(subset=["question", "context", "answer"]).reset_index(drop=True)


In [7]:
vectorizer = TfidfVectorizer(stop_words="english")


In [8]:
# Fit on context
context_tfidf = vectorizer.fit_transform(df_small["context"])

# Transform questions
question_tfidf = vectorizer.transform(df_small["question"])


In [9]:
similarity_matrix = cosine_similarity(question_tfidf, context_tfidf)


In [10]:
top_indices = similarity_matrix.argmax(axis=1)
retrieved_contexts = df_small["context"].iloc[top_indices].values


In [11]:
found_answer = [
    df_small["answer"].iloc[i].lower() in retrieved_contexts[i].lower()
    for i in range(len(df_small))
]


In [12]:
import pandas as pd

results = pd.DataFrame({
    "question": df_small["question"],
    "true_answer": df_small["answer"],
    "retrieved_context": retrieved_contexts,
    "answer_found": found_answer
})

accuracy = sum(found_answer) / len(found_answer)
print(f"TF-IDF QA Accuracy (answer in retrieved context): {accuracy:.2%}")


TF-IDF QA Accuracy (answer in retrieved context): 4.68%


In [13]:
top_k = 3
top_k_indices = similarity_matrix.argsort(axis=1)[:, -top_k:]

found_in_top_k = []
for i, row in enumerate(top_k_indices):
    contexts = df_small["context"].iloc[row].str.lower().tolist()
    answer = df_small["answer"].iloc[i].lower()
    found = any(answer in context for context in contexts)
    found_in_top_k.append(found)

top_k_accuracy = sum(found_in_top_k) / len(found_in_top_k)
print(f"Top-{top_k} Accuracy: {top_k_accuracy:.2%}")


Top-3 Accuracy: 5.35%


In [14]:
false_positives = results[~results["answer_found"]]
false_positives.head(5)


Unnamed: 0,question,true_answer,retrieved_context,answer_found
0,What is the reaction of the user upon discover...,The user is very happy to discover the site.,"And so, it’s finally happening. Far too late f...",False
1,What is the purpose of split testing in market...,The purpose of split testing in marketing camp...,Subscription growth hack (by PayKickstart)\nFa...,False
2,What measures are being taken by major Asian e...,"Major Asian economies like India, China, South...",Asia's major oil consuming nations have decide...,False
3,"What is the central setting of the movie ""BREA...",The central setting is a military court room w...,>>:Kenneth G. Ross (play)\nJonathan Hardy (scr...,False
4,What was the main issue in the 2006 lawsuit fi...,The main issue in the 2006 lawsuit was that th...,9th Circuit judge urges settlement in campus-s...,False


In [15]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi

tokenized_corpus = [context.split() for context in df_small["context"]]
bm25 = BM25Okapi(tokenized_corpus)

scores = [bm25.get_scores(q.split()) for q in df_small["question"]]
top_bm25 = [np.argmax(s) for s in scores]
retrieved = df_small["context"].iloc[top_bm25].values

found_bm25 = [df_small["answer"].iloc[i].lower() in retrieved[i].lower() for i in range(len(df_small))]
print(f"BM25 Top-1 Accuracy: {sum(found_bm25)/len(found_bm25):.2%}")


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
BM25 Top-1 Accuracy: 5.02%


In [16]:
from rank_bm25 import BM25Okapi

# Tokenize the contexts
tokenized_corpus = [doc.split() for doc in df_small["context"]]
bm25 = BM25Okapi(tokenized_corpus)

# Score questions
scores = [bm25.get_scores(q.split()) for q in df_small["question"]]
top_indices_bm25 = [np.argmax(s) for s in scores]

# Check answer presence
retrieved_contexts_bm25 = df_small["context"].iloc[top_indices_bm25].values
found_bm25 = [
    df_small["answer"].iloc[i].lower() in retrieved_contexts_bm25[i].lower()
    for i in range(len(df_small))
]

# Accuracy
accuracy_bm25 = sum(found_bm25) / len(found_bm25)
print(f"BM25 Top-1 Accuracy: {accuracy_bm25:.2%}")


BM25 Top-1 Accuracy: 5.02%


In [17]:
import difflib

def is_answer_fuzzy_match(answer, context, threshold=0.8):
    answer = answer.lower()
    context_words = context.lower().split()
    return any(difflib.SequenceMatcher(None, answer, word).ratio() > threshold for word in context_words)

found_fuzzy = [
    is_answer_fuzzy_match(df_small["answer"].iloc[i], retrieved_contexts[i])
    for i in range(len(df_small))
]
print(f"TF-IDF Accuracy (fuzzy match): {sum(found_fuzzy) / len(found_fuzzy):.2%}")


TF-IDF Accuracy (fuzzy match): 0.67%


In [18]:
comparison = pd.DataFrame({
    "TF-IDF Top-1": found_answer,
    "TF-IDF Top-3": found_in_top_k,
    "BM25 Top-1": found_bm25,
    "TF-IDF Fuzzy": found_fuzzy
})
print("Method Comparison:")
print(comparison.mean())


Method Comparison:
TF-IDF Top-1    0.046823
TF-IDF Top-3    0.053512
BM25 Top-1      0.050167
TF-IDF Fuzzy    0.006689
dtype: float64
