This experiments with ragas and how to use it in our context. Multiple problems arise from the fact that in their example contexts is a list of a single element and it should be a list of more than one element, but then the code does not work.

In [34]:
from datasets import load_dataset

amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
amnesty_qa

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})

In [35]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama

model_name = "NeuML/pubmedbert-base-embeddings"
device = 'cuda:0'
model_id = "llama2:latest" 

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs={'device': device}
)


langchain_llm = Ollama(model = "llama2") # any langchain LLM instance
langchain_embeddings = embeddings # any langchain Embeddings instance


In [36]:
## evaluate single context
from ragas import evaluate

result = evaluate(
    amnesty_qa['eval'],
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=langchain_llm,
    embeddings=langchain_embeddings,
)

result

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

In [8]:
def element_to_list(element):
    return element[0].split('\n-')

In [21]:
pandas_dataset = amnesty_qa['eval'].to_pandas()
pandas_dataset['contexts'] = pandas_dataset['contexts'].apply(element_to_list)
pandas_dataset

Unnamed: 0,question,ground_truth,answer,contexts
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ..."
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri..."
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The private companies responsible for the mos...
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,[Amnesty International called on its vast netw...
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,[Amnesty International recommends that the Spe...
5,Who are the target audience of the two books c...,The target audience of the two books created b...,The target audience of the two books created b...,[Amnesty International has therefore created t...
6,Which right guarantees access to comprehensive...,The right that guarantees access to comprehens...,The right that guarantees access to comprehens...,[26. The Act raises serious questions about it...
7,Who has the right to be fully informed about h...,The victims of gross human rights violations a...,Everyone has the right to be fully informed ab...,[- The victims of gross human rights violation...
8,When can individuals be found guilty under Art...,Individuals can be found guilty under Article ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...
9,When does the prosecution consider statements ...,The prosecution considers statements contrary ...,Under Article 207.3 of the Russian Criminal Co...,[- As long as their statements are contrary to...


In [33]:
print(len(pandas_dataset.iloc[15]['contexts']))

from datasets import Dataset
from pprint import pprint
amnesty_qa = Dataset.from_pandas(pandas_dataset)
pprint(amnesty_qa.info)

8
DatasetInfo(description='', citation='', homepage='', license='', features={'question': Value(dtype='string', id=None), 'ground_truth': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None), 'contexts': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)


In [2]:
## evaluate multiple contexts
from ragas import evaluate

result = evaluate(
    amnesty_qa,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=langchain_llm,
    embeddings=langchain_embeddings,
)

result

In [20]:
df = result.to_pandas()
df.head()


Unnamed: 0,question,ground_truth,answer,contexts,context_precision,answer_relevancy,context_recall
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",,,1.0
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[- Fossil fuel companies, whether state or pri...",0.0,,
