In [1]:

# %pip install llama-index-embeddings-huggingface
# %pip install llama-index-embeddings-instructor

In [2]:
# %pip install llama-index
# %pip install git+https://github.com/csebuetnlp/normalizer
# %pip install datasets

In [3]:
import os
# Set before the heavier imports below -- presumably so MKL uses the GNU
# threading layer; TODO confirm this is still required.
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import openai

# needed to synthesize responses later
# Only fall back to the placeholder when no key is already exported, so a real
# key present in the environment is never clobbered by this cell.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
from datasets import load_dataset
from normalizer import normalize
from llama_index.core.readers.string_iterable import StringIterableReader
from llama_index.core.evaluation.retrieval.evaluator import RetrieverEvaluator
from llama_index.legacy.finetuning.embeddings.common import EmbeddingQAFinetuneDataset
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class CustomSentenceSplitter(SentenceSplitter):
    """Sentence splitter whose text splitting cuts only at ``paragraph_separator``."""

    def _split_text(self, text: str, chunk_size: int):
        """Return the pieces of ``text`` found between paragraph separators.

        Args:
            text: The string to split.
            chunk_size: Accepted but not used by this implementation.

        Returns:
            The list produced by ``text.split(self.paragraph_separator)``.
        """
        return text.split(self.paragraph_separator)

In [6]:
# Model identifier handed to HuggingFaceEmbedding when the embedding model is built.
EMBED_MODEL_NAME = "FacebookAI/xlm-roberta-base"

In [7]:
def _normalize_row(row):
    """Return the normalized ``context`` and ``question`` fields of one dataset row."""
    return {
        "context": normalize(row["context"]),
        "question": normalize(row["question"]),
    }


# Load the test split of `csebuetnlp/squad_bn` and normalize both text columns.
dataset = load_dataset("csebuetnlp/squad_bn", split="test")
# dataset = dataset.select(range(10))
dataset = dataset.map(_normalize_row)

KeyboardInterrupt: 

In [None]:
# De-duplicate the passages while keeping first-occurrence order. Iteration
# order of a ``set`` of strings can change between interpreter runs, which
# would shuffle the ``node_{idx}`` ids assigned later and make runs
# non-reproducible.
collection = list(dict.fromkeys(dataset["context"]))
documents = StringIterableReader().load_data(texts=collection)

In [None]:
delimiter = "andolon"  # Define your delimiter here

# Fail fast if the delimiter occurs inside a passage: splitting on it later
# would cut that passage in two, and the pieces would no longer match the
# original context text.
assert all(delimiter not in passage for passage in collection), (
    f"delimiter {delimiter!r} occurs inside at least one passage; choose another"
)

# Concatenate every unique passage into one string separated by the delimiter.
joined_collection = delimiter.join(collection)

In [None]:
# Create the output directory (and parents) if missing; pure Python so the
# cell does not depend on a POSIX shell.
os.makedirs("data/squad_bn/", exist_ok=True)

In [None]:
file_path = "data/squad_bn/joined_collections.txt"  # Specify the file path here

# Write the joined string to the file. The encoding is pinned to UTF-8 so the
# text is written the same way regardless of the platform's locale default.
with open(file_path, "w", encoding="utf-8") as out_file:
    out_file.write(joined_collection)

In [None]:
# Read whatever files are under the corpus directory into llama-index documents.
documents = SimpleDirectoryReader(input_dir="./data/squad_bn/").load_data()

In [None]:
# Build a splitter that uses the join delimiter as its paragraph separator,
# then turn the loaded documents into nodes with it.
node_parser = CustomSentenceSplitter(paragraph_separator= delimiter)
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Give every node a predictable id of the form "node_<position>".
for position, chunk in enumerate(nodes):
    chunk.id_ = "node_" + str(position)

In [None]:
# Instantiate the embedding model and register it on the global llama-index
# Settings object.
embed_model = HuggingFaceEmbedding(model_name= EMBED_MODEL_NAME)
Settings.embed_model = embed_model


No sentence-transformers model found with name FacebookAI/xlm-roberta-base. Creating a new one with mean pooling.
Generating embeddings:  28%|██▊       | 579/2048 [02:00<05:04,  4.82it/s]


In [None]:
# Build the vector index over the nodes, showing a progress bar while it runs.
vector_index = VectorStoreIndex(nodes=nodes, show_progress=True)

Generating embeddings: 100%|██████████| 2048/2048 [05:31<00:00,  6.19it/s]
Generating embeddings: 100%|██████████| 201/201 [00:32<00:00,  6.13it/s]


In [None]:
# Retriever over the vector index, configured with similarity_top_k=3.
retriever = vector_index.as_retriever(similarity_top_k=3)

In [None]:


# Names of the retrieval metrics the evaluator is asked to compute.
metrics = [
    "mrr",
    "hit_rate",
    "precision",
]

# Evaluator bound to the retriever under test.
retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)

In [None]:
# Presumably needed so the awaited evaluation further down can run inside the
# notebook's already-running event loop -- TODO confirm it is still required.
import nest_asyncio
nest_asyncio.apply()

In [None]:

# One query per dataset row; ids are the row positions rendered as strings.
query_texts = dataset["question"]  # List of query texts
query_ids = [str(i) for i in range(len(query_texts))]
query_dict = dict(zip(query_ids, query_texts))

In [None]:
# node id -> node text, taken from the index's docstore.
doc_dict = {
    node_id: stored_node.text
    for node_id, stored_node in vector_index.docstore.docs.items()
}
# node text -> node id (for repeated text the last id seen wins).
reverse_doc_dict = {node_text: node_id for node_id, node_text in doc_dict.items()}

In [None]:
# Map each query id to the single node whose text equals that row's context.
# The context column is read once up front; indexing ``dataset["context"]``
# inside the comprehension would re-read the whole column for every query.
contexts = dataset["context"]
relevent_doc_dict = {
    query_id: [reverse_doc_dict[context]]
    for query_id, context in zip(query_ids, contexts)
}


In [None]:
# Bundle queries, corpus and query -> relevant-node mapping for the evaluator.
qa_dataset = EmbeddingQAFinetuneDataset(
    queries=query_dict,
    corpus=doc_dict,
    relevant_docs=relevent_doc_dict,
)

In [None]:
# Evaluate the retriever over the whole QA dataset; the top-level ``await``
# relies on the notebook kernel's event loop.
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [None]:
import pandas as pd


def display_results(name, eval_results):
    """Summarise per-query evaluation results as a one-row DataFrame.

    Args:
        name: Label placed in the ``retrievers`` column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains at least ``hit_rate`` and ``mrr``.

    Returns:
        A DataFrame with columns ``retrievers``, ``hit_rate`` and ``mrr``,
        where the two metric columns hold the mean over all results.
    """
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {"retrievers": [name]}
    for metric_name in ("hit_rate", "mrr"):
        summary[metric_name] = [per_query[metric_name].mean()]

    return pd.DataFrame(summary)

In [None]:

# NOTE(review): the label says "top-50" but the retriever in this notebook is
# created with similarity_top_k=3 -- confirm which one is intended.
display_results(name="top-50 eval", eval_results=eval_results)

Unnamed: 0,retrievers,hit_rate,mrr
0,top-50 eval,0.055911,0.00818
