In [1]:
from src.adapter.chromadb import ChromaDB
from src.service.text_embedder.sentence_transformer import TextEmbedding
from src.service.text_generation.local_transformers import LocalTransformerTextGeneration
from src.service.classification.local_transformers import LocalTransformerClassification
from src.service.reranking.local_transformers import LocalTransformerReranking
from src.service.document_retrieval.chromadb import ChromaDocumentRetrievalService

text_embedding_service = TextEmbedding(model_id="BAAI/bge-m3")
chromadb = ChromaDB(embedding_service=text_embedding_service)
text_generation_service = LocalTransformerTextGeneration(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    device="cuda",
    attn_implementation="sdpa"
)
classification_service = LocalTransformerClassification(
    model_id="MoritzLaurer/deberta-v3-large-zeroshot-v2.0"
)
reranking_service = LocalTransformerReranking(
    model_id='BAAI/bge-reranker-v2-m3'
)
document_retrieval_service = ChromaDocumentRetrievalService(
    client=chromadb.get_client(),
    embedding_service=chromadb.get_embedding_service()
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.39it/s]


Loaded Text Generator model: meta-llama/Meta-Llama-3-8B-Instruct
Loaded Cross-Encoder model: BAAI/bge-reranker-v2-m3


In [2]:
import importlib
from src import rag_system, utils, task_manager

# Reload the modules
importlib.reload(rag_system)
importlib.reload(utils.config_manager)
importlib.reload(task_manager)

from src.rag_system import RAGSystem
from src.utils.config_manager import ConfigManager
from src.task_manager import TaskManager

config = ConfigManager(env='dev')

rag_system = RAGSystem(
  config=config, 
  text_generation_service=text_generation_service,
  classification_service=classification_service,
  reranking_service=reranking_service,
  document_retrieval_service=document_retrieval_service
)

GLOBAL ConfigManager init...
. . . Loading config from /home/buddy/Study-Buddy/config/dev.yaml



In [1]:
dataset = [
  {
    "query": "What is the name of the muscular tube that transports food from the mouth to the stomach, and does any chemical digestion occur there?",
    "passages": "ns_21",
    "explanation": """This question requires specific information about the esophagus and its role in digestion. Passage 21 directly addresses both parts of the question.""",
    "ground_truth": "The muscular tube that transports food from the mouth to the stomach is called the **esophagus**. No chemical digestion occurs in the esophagus.",
  },
  {
    "query": "What are the two main types of starch, and how do they differ in their structure?",
    "passages": "ns_56",
    "explanation": """This question requires understanding the two types of starch and their chemical structures. Passage 56 defines amylose and amylopectin and explains how their structures differ.""",
    "ground_truth": "The two main types of starch are **amylose** and **amylopectin**. **Amylose** is made up of straight chains of glucose molecules, while **amylopectin** is made up of branched chains of glucose molecules.",
  },
  {
    "query": "What is the name of the hormone secreted by adipose tissue that helps regulate appetite, and how does it work?",
    "passages": "ns_119, ns_232",
    "explanation": """This question focuses on the specific hormone leptin and requires understanding its role in signaling energy balance.  Passage 119 mentions leptin's role in regulating appetite, and passage 232 explains how it works and why it may not be an effective weight loss drug.""",
    "ground_truth": "The hormone secreted by adipose tissue that helps regulate appetite is called **leptin**. As fat stores increase, more leptin is produced. Leptin acts on the brain to suppress hunger and increase energy expenditure. However, people who are overweight or obese are often resistant to leptin, meaning that their brains don't respond to it as effectively.",
  },
  {
    "query": "Explain the three mechanisms by which fluoride combats tooth decay.",
    "passages": "ns_382",
    "explanation": """This question requires specific knowledge about the action of fluoride on teeth. Passage 382 directly addresses this by listing and explaining the three mechanisms.""",
    "ground_truth": "Fluoride combats tooth decay by: \n\n1. **Blocking acid formation by bacteria**\n2. **Preventing demineralization of teeth**\n3. **Enhancing remineralization of destroyed enamel**",
  },
  {
    "query": "What is the recommended daily intake of protein for athletes, and how does this compare to the RDA for sedentary adults?",
    "passages": "ns_193, ns_444",
    "explanation": """This question focuses on the specific protein recommendations for athletes and requires comparing them to the standard RDA. Passage 193 provides the RDA for sedentary adults, and passage 444 provides the recommendations for athletes.""",
    "ground_truth": "The recommended protein intake for athletes is **1.2 to 2.0 grams per kilogram of body weight per day**, depending on the type and intensity of training. This is higher than the RDA for sedentary adults, which is **0.8 grams per kilogram of body weight per day**.",
  },
  {
    "query": "What is the name of the enzyme produced in the mouth that begins the chemical breakdown of triglycerides?",
    "passages": "ns_20",
    "explanation": "Passage 20 details the beginning of lipid digestion in the mouth and names the specific enzyme involved.",
    "ground_truth": "The enzyme produced in the mouth that begins the chemical breakdown of triglycerides is called **lingual lipase**.",
  },
  {
    "query": "What is the difference between a monounsaturated fatty acid and a polyunsaturated fatty acid?",
    "passages": "ns_134",
    "explanation": "Passage 134 defines both monounsaturated and polyunsaturated fatty acids, focusing on their structural differences.",
    "ground_truth": "A **monounsaturated fatty acid** has **one double bond** between carbon atoms in its chain, while a **polyunsaturated fatty acid** has **two or more double bonds**.",
  },
  {
    "query": "Explain the process of emulsification and why it is important for lipid digestion.",
    "passages": "ns_146",
    "explanation": "Passage 146 explains the role of bile in emulsifying fat and why this is necessary for lipid digestion in the small intestine.",
    "ground_truth": "Emulsification is the process of breaking down large fat globules into smaller droplets. This is important for lipid digestion because it increases the surface area of the fat, allowing digestive enzymes called lipases to more effectively break down the triglycerides. Bile, produced in the liver and stored in the gallbladder, is released into the small intestine and acts as an emulsifier.",
  },
  {
    "query": "What are the names of the two hormones involved in blood calcium regulation, and which gland produces each?",
    "passages": "ns_369",
    "explanation": "Passage 369 describes the process of blood calcium regulation, naming the two key hormones and identifying the glands that produce them.",
    "ground_truth": "The two hormones involved in blood calcium regulation are: \n\n1. **Parathyroid hormone (PTH)**, produced by the **parathyroid glands**. PTH increases blood calcium levels.\n2. **Calcitonin**, produced by the **thyroid gland**. Calcitonin decreases blood calcium levels.",
  },
  {
    "query": "Why is it important to introduce potentially allergenic foods to babies during their first year of life?",
    "passages": "ns_477, ns_478",
    "explanation": "Passages 477 and 478 address the rationale for introducing allergenic foods in infancy, including the evidence that this practice can prevent the development of food allergies.",
    "ground_truth": "Introducing potentially allergenic foods to babies during their first year of life can help prevent food allergies from developing. Studies show that early introduction of foods like peanut and egg can significantly reduce the risk of developing allergies to those foods.",
  }
]

In [5]:
from src.models.context import Context
results = []

for instance in dataset:
  query = instance['query']
  ground_truth = instance.get('ground_truth', None)
  response = await rag_system.process_query(query, eval = True)
  results.append(response)

Route: question
Query Doc Pairs:
10
QUERY: what is the name of the muscular tube that transports food from the mouth to the stomach and does any chemical digestion occur there
DOCUMENT: ns_21
The esophagus is a muscular tube that transports food from the mouth to the stomach. No chemical digestion occurs while the bolus is mechanically propelled through this tube by peristalsis.
GRADE: yes
DOCUMENT: ns_18
The process of digestion includes five main activities: ingestion, mechanical digestion, chemical digestion, absorption, and excretion.
The first of these processes, ingestion, refers to the entry of food into the GI tract through the mouth. There, the food is chewed and mixed with saliva, which contains enzymes that begin breaking down the carbohydrates and lipids in food. Mastication (chewing) increases the surface area of the food and allows for food to be broken into small enough pieces to be swallowed safely.
Food (now called a bolus since it has been chewed and moistened) leaves

In [34]:
query_passages= []
for instance, result in zip(dataset, results):
  query = instance['query']
  ground_truth = instance.get('ground_truth', None)
  rag_response = result.response.text
  ground_truth_passages = []
  rag_response_passages = []
  for doc in result.retrieved_documents.documents:
    if doc.id in instance['passages']:
      ground_truth_passages.append({
        "id": doc.id,
        "passage": doc.document
        })
    else:
      rag_response_passages.append({
        "id": doc.id,
        "passage": doc.document
      })
  query_passages.append({
    "query": query,
    "ground_truth": ground_truth,
    "ground_truth_passages": ground_truth_passages,
    "rag_response": rag_response,
    "other_passages": rag_response_passages
  })

  """
  {
    "query": "The original query",
    "ground_truth": "The ground truth answer",
    "ground_truth_passages": [
      {
        "id": "ns_21", "passage": "Passed that was used to answer the query and generate the ground truth"
      },
      {
        "id": "ns_56", "passage": "Another passage that was used to answer the query and generate the ground truth"
      },
      ...
    ],
    "rag_response": "The response generated by the RAG system",
    "other_passages": [
      {
        "id": "ns_119", "passage": "Another passage that was retrieved by the RAG system that MIGHT have been used to generate the RAG response"
      },
      {
        "id": "ns_232", "passage": "Another passage that was retrieved by the RAG system that MIGHT have been used to generate the RAG response"
      },
      ...
  }
  """

query_passages


[{'query': 'What is the name of the muscular tube that transports food from the mouth to the stomach, and does any chemical digestion occur there?',
  'ground_truth': 'The muscular tube that transports food from the mouth to the stomach is called the **esophagus**. No chemical digestion occurs in the esophagus.',
  'ground_truth_passages': [{'id': 'ns_21',
    'passage': 'The esophagus\xa0is a muscular tube that transports food from the mouth to the stomach. No chemical digestion occurs while the bolus is mechanically propelled through this tube by peristalsis.'}],
  'rag_response': 'The muscular tube that transports food from the mouth to the stomach is the esophagus [ns_21]. As for chemical digestion, it does not occur in the esophagus [ns_21, ns_18].',
  'other_passages': [{'id': 'ns_18',
    'passage': 'The process of digestion includes five main activities: ingestion, mechanical digestion, chemical digestion, absorption, and excretion.\nThe first of these processes, ingestion, ref

In [33]:
import json

with open('evaluation/df_responses.json', 'r') as f:
  data = json.load(f)
data


[{'query': 'what is the name of the muscular tube that transports food from the mouth to the stomach and does any chemical digestion occur there',
  'response': 'The muscular tube that transports food from the mouth to the stomach is the esophagus [ns_21]. As for chemical digestion, it does not occur in the esophagus [ns_21, ns_18].'},
 {'query': 'what are the two main types of starch and how do they differ in their structure',
  'response': 'Starch is a complex carbohydrate that serves as a primary source of energy for many organisms. It is composed of two main types: amylose and amylopectin. These two types of starch differ significantly in their structure, which is crucial for their function and properties.\n\nAmylose is a straight chain of glucose units, whereas amylopectin is a branched chain. The main difference between the two is the presence of branches in amylopectin, which is not found in amylose [ns_56]. This structural difference has significant implications for the propert

In [20]:
import json

output = []

for result, instance in zip(results, dataset):
    output.append({
        "query": result.query.text,
        "response": result.response.text,
        "ground_truth": instance['ground_truth'],
        "retrieved_documents": [doc.id + " " + doc.transformed_document for doc in result.retrieved_documents.documents]
    })

json_output = json.dumps(output, indent=2)
print(json_output)

with open('eval1.json', 'w') as json_file:
    json_file.write(json_output)

[
  {
    "query": "what is the name of the muscular tube that transports food from the mouth to the stomach and does any chemical digestion occur there",
    "response": "The muscular tube that transports food from the mouth to the stomach is the esophagus [ns_21]. As for chemical digestion, it does not occur in the esophagus [ns_21, ns_18].",
    "ground_truth": "The muscular tube that transports food from the mouth to the stomach is called the **esophagus**. No chemical digestion occurs in the esophagus.",
    "retrieved_documents": [
      "ns_21 The muscular tube that transports food from the mouth to the stomach is the esophagus. No chemical digestion occurs in the esophagus.",
      "ns_18 The muscular tube that transports food from the mouth to the stomach is the esophagus. Chemical digestion does not occur in the esophagus, but rather in the small intestine, where digestive secretions containing enzymes break down macronutrients into their chemical building blocks.",
      "ns

In [11]:
import json

queries = []
responses = []
ground_truths = []

# Populate lists
for result, instance in zip(results, dataset):
    queries.append(result.query.text)
    responses.append(result.response.text)
    ground_truths.append(instance['ground_truth'])

output = {
    "queries": queries,
    "responses": responses,
    "ground_truths": ground_truths
}

json_output = json.dumps(output, indent=2)

print(json_output)

with open('eval_list.json', 'w') as json_file:
    json_file.write(json_output)

{
  "queries": [
    "what is the name of the muscular tube that transports food from the mouth to the stomach and does any chemical digestion occur there",
    "what are the two main types of starch and how do they differ in their structure",
    "what is the name of the hormone secreted by adipose tissue that helps regulate appetite and how does it work",
    "explain the three mechanisms by which fluoride combats tooth decay",
    "what is the recommended daily intake of protein for athletes and how does this compare to the rda for sedentary adults",
    "what is the name of the enzyme produced in the mouth that begins the chemical breakdown of triglycerides",
    "what is the difference between a monounsaturated fatty acid and a polyunsaturated fatty acid",
    "explain the process of emulsification and why it is important for lipid digestion",
    "what are the names of the two hormones involved in blood calcium regulation and which gland produces each",
    "why is it important t

In [17]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score
from rouge import Rouge
import nltk
nltk.download('wordnet')

rouge = Rouge()

rouge_l_scores = []
bleu_scores = []
meteor_scores = []


# Example data (replace with your actual data)
P, R, F1 = score(responses, ground_truths, lang="en", verbose=True)

print(f"Average BERTScore F1: {F1.mean()}")

for query, ground_truth, response in zip(queries, ground_truths, responses):
    rouge_scores = rouge.get_scores(response, ground_truth)
    rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])

    bleu_scores.append(sentence_bleu([ground_truth.split()], response.split()))

    meteor_scores.append(single_meteor_score(ground_truth.split(), response.split()))

    
for rouge_l, bleu, meteor in zip(rouge_l_scores, bleu_scores, meteor_scores):
    print(f"ROUGE-L: {rouge_l}, BLEU: {bleu}, METEOR: {meteor}")


avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_meteor = sum(meteor_scores) / len(meteor_scores)

print(f"Average ROUGE-L: {avg_rouge_l}")
print(f"Average BLEU: {avg_bleu}")
print(f"Average METEOR: {avg_meteor}")

ROUGE-L: 0.6666666617283951, BLEU: 0.45443623637127284, METEOR: 0.7585268884703913
ROUGE-L: 0.25287355955608404, BLEU: 0.028419549417753945, METEOR: 0.2147971360381862
ROUGE-L: 0.4516128984131114, BLEU: 0.1870441036625162, METEOR: 0.4820615931101504
ROUGE-L: 0.15942028727473223, BLEU: 8.348472855689752e-79, METEOR: 0.17361111111111113
ROUGE-L: 0.4952380907936509, BLEU: 0.10711388838062348, METEOR: 0.5319647946833956
ROUGE-L: 0.6046511581179016, BLEU: 0.37184214350816, METEOR: 0.645760963960052
ROUGE-L: 0.14285714050177095, BLEU: 6.076398446407336e-79, METEOR: 0.1449275362318841
ROUGE-L: 0.33540372252305084, BLEU: 0.04903382977862074, METEOR: 0.3859640831758034
ROUGE-L: 0.2752293540712062, BLEU: 0.05259899630112923, METEOR: 0.41902314622529824
ROUGE-L: 0.35443037644207664, BLEU: 0.09686207813569611, METEOR: 0.45767062960838056
Average ROUGE-L: 0.373838324942198
Average BLEU: 0.13473508255557726
Average METEOR: 0.4214307882614653


[nltk_data] Downloading package wordnet to /home/buddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [36]:
import json
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score
from rouge import Rouge
import nltk
nltk.download('wordnet')

rouge = Rouge()

rouge_l_scores = []
bleu_scores = []
meteor_scores = []
bert_scores = []

# Example data (replace with your actual data)
P, R, F1 = score(responses, ground_truths, lang="en", verbose=True)

print(f"Average BERTScore F1: {F1.mean()}")

query_scores = []

for query, ground_truth, response, bert_f1 in zip(queries, ground_truths, responses, F1):
    rouge_scores = rouge.get_scores(response, ground_truth)
    rouge_l = rouge_scores[0]['rouge-l']['f']
    rouge_l_scores.append(rouge_l)

    bleu = sentence_bleu([ground_truth.split()], response.split())
    bleu_scores.append(bleu)

    meteor = single_meteor_score(ground_truth.split(), response.split())
    meteor_scores.append(meteor)

    bert_scores.append(bert_f1.item())

    query_scores.append({
        "query": query,
        "rouge_l": rouge_l,
        "bleu": bleu,
        "meteor": meteor,
        "bert_score": bert_f1.item()
    })

avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_meteor = sum(meteor_scores) / len(meteor_scores)
avg_bert = sum(bert_scores) / len(bert_scores)

print(f"Average ROUGE-L: {avg_rouge_l}")
print(f"Average BLEU: {avg_bleu}")
print(f"Average METEOR: {avg_meteor}")
print(f"Average BERTScore: {avg_bert}")

# Add average scores to the JSON output
query_scores.append({
    "query": "average",
    "rouge_l": avg_rouge_l,
    "bleu": avg_bleu,
    "meteor": avg_meteor,
    "bert_score": avg_bert
})

# Write the scores to a JSON file
with open('query_scores.json', 'w') as f:
    json.dump(query_scores, f, indent=4)

[nltk_data] Downloading package wordnet to /home/buddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  8.02it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 398.93it/s]

done in 0.13 seconds, 77.18 sentences/sec
Average BERTScore F1: 0.8843156695365906
Average ROUGE-L: 0.373838324942198
Average BLEU: 0.13473508255557726
Average METEOR: 0.4214307882614653
Average BERTScore: 0.8843156397342682





In [61]:
import json

# Load the JSON file
with open('llm_eval.json', 'r') as f:
    data = json.load(f)

# Extract keys from the different objects
keys = set()
for obj in data:
    keys.update(obj.keys())

# Print the keys
print("Keys in the JSON objects:", keys)
eval_auto_llm = []
for query, score, eval in zip(queries, query_scores, data):
  eval_auto_llm.append({
    "query": query,
    "automatic_evaluation": score,
    "llm_eval": eval
    })
    
with open('llm_eval_q.json', 'w') as f:
  json.dump(eval_auto_llm, f, indent=4)

Keys in the JSON objects: {'appropriateness_explanation', 'appropriateness_score', 'information_usage_score', 'relevance_score', 'accuracy_explanation', 'relevance_explanation', 'depth_and_elaboration_score', 'accuracy_score', 'improvement_suggestions', 'coherence_score', 'information_usage_explanation', 'coherence_explanation', 'essential_completeness_explanation', 'overall_score', 'overall_assessment', 'depth_explanation', 'essential_completeness_score'}


In [63]:
n_keys = set()

for obj in data:
  for key in obj.keys():
    if "_score" in key:
      
      n_keys.update([key.split('_score')[0] for key in obj.keys()])

n_keys

{'accuracy',
 'accuracy_explanation',
 'appropriateness',
 'appropriateness_explanation',
 'coherence',
 'coherence_explanation',
 'depth_and_elaboration',
 'depth_explanation',
 'essential_completeness',
 'essential_completeness_explanation',
 'improvement_suggestions',
 'information_usage',
 'information_usage_explanation',
 'overall',
 'overall_assessment',
 'relevance',
 'relevance_explanation'}

In [47]:
key_lists_text = {key: [] for key in keys if "_score" not in key}
key_lists_score = {key: [] for key in keys if "_score" in key}
# Populate the lists with values from the JSON objects
for obj in data:
    for key in keys:
      k = key.split("_")
      if k[1] == "score":
      if "_score" in key:
        key_lists_score[key].append(obj.get(key, None))
      else:
        key_lists_text[key].append(obj.get(key, None))

avg_scores = []
# Calculate the average scores
for key, values in key_lists_score.items():
    avg_score = sum(values) / len(values)
    avg_scores.append({
        "key": key,
        "average_score": avg_score
    })
    #key_lists_score[key] = avg_score

# Write the lists to a new JSON file
with open('key_lists_text.json', 'w') as f:
    json.dump(key_lists_text, f, indent=4)
with open('key_lists_score.json', 'w') as f:
    json.dump(key_lists_score, f, indent=4)
with open('avg_scores.json', 'w') as f:
    json.dump(avg_scores, f, indent=4)
# Print the keys
print("Keys in the JSON objects:", keys)

Keys in the JSON objects: {'appropriateness_explanation', 'appropriateness_score', 'information_usage_score', 'relevance_score', 'accuracy_explanation', 'relevance_explanation', 'depth_and_elaboration_score', 'accuracy_score', 'improvement_suggestions', 'coherence_score', 'information_usage_explanation', 'coherence_explanation', 'essential_completeness_explanation', 'overall_score', 'overall_assessment', 'depth_explanation', 'essential_completeness_score'}


In [7]:
import json




for i, result in enumerate(results):
  query_text = result.query.text
  documents = result.retrieved_documents.documents
  documents_list = [
    {
        'id': doc.id,
        'document': doc.document,
    }
    for doc in documents
  ]
  query_and_documents = {
    "query": query_text,
    "documents": documents_list
  }

  with open(f'query_{i}.json', 'w') as json_file:
    json.dump(query_and_documents, json_file, indent=4)
  

In [8]:
import json

for i, result in enumerate(results):
  query_text = result.query.text
  documents = result.retrieved_documents.documents
  documents_list = [
    {
        'id': doc.id,
        'summarized_document': doc.transformed_document,
    }
    for doc in documents
  ]
  query_and_documents = {
    "query": query_text,
    "summarized_documents": documents_list
  }

  with open(f'query_t_{i}.json', 'w') as json_file:
    json.dump(query_and_documents, json_file, indent=2)

In [5]:
import json

for i, result in enumerate(results):
  query_text = result.query.text
  response = result.response.text
  documents = result.retrieved_documents.documents
  documents_list = [
    {
        'id': doc.id,
        'summarized_document': doc.transformed_document,
    }
    for doc in documents
  ]
  query_and_documents = {
    "query": query_text,
    "summarized_documents": documents_list,
    "response": response
  }

  with open(f'query_t_{i}.json', 'w') as json_file:
    json.dump(query_and_documents, json_file, indent=2)

In [7]:
import json

for i, result in enumerate(results):
  query_text = result.query.text
  documents = result.retrieved_documents.documents
  documents_list = [
    {
        'id': doc.id,
        'full_document': doc.document,
        'summarized_document': doc.transformed_document,
    }
    for doc in documents
  ]
  query_and_documents = {
    "query": query_text,
    "documents": documents_list
  }

  with open(f'query__ft_{i}.json', 'w') as json_file:
    json.dump(query_and_documents, json_file, indent=2)

In [13]:
documents = results[0].retrieved_documents.documents

documents_list = [
    {
        'id': doc.id,
        'document': doc.document,
    }
    for doc in documents
]

with open('documents.json', 'w') as json_file:
    json.dump(documents_list, json_file, indent=4)

In [7]:
from src.task_manager import TaskManager

services = {
  "text_generation_service": text_generation_service,
  "classification_service": classification_service,
  "reranking_service": reranking_service,
  "document_retrieval_service": document_retrieval_service
}

task_manager = TaskManager(config._config, services)

In [20]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from rouge import Rouge

rouge = Rouge()

queries = ["What is the name...", ...]
ground_truths = ["The muscular tube...", ...]
rag_responses = ["The muscular tube...", ...]

rouge_l_scores = []
bleu_scores = []
meteor_scores = []

for query, ground_truth, rag_response in zip(queries, ground_truths, rag_responses):
    rouge_scores = rouge.get_scores(rag_response, ground_truth)
    rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])

    bleu_scores.append(sentence_bleu([ground_truth.split()], rag_response.split()))

    meteor_scores.append(single_meteor_score(ground_truth, rag_response))

avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_meteor = sum(meteor_scores) / len(meteor_scores)

print(f"Average ROUGE-L: {avg_rouge_l}")
print(f"Average BLEU: {avg_bleu}")
print(f"Average METEOR: {avg_meteor}")

{'PreprocessQueryTask': <src.tasks.preprocess_query.PreprocessQueryTask object at 0x79d79039aec0>, 'ReformulateQueryTask': <src.tasks.reformulate_query.ReformulateQueryTask object at 0x79d7903987f0>, 'DecomposeQueryTask': <src.tasks.decompose_query.DecomposeQueryTask object at 0x79d79039a110>, 'ClassifyQueryTask': <src.tasks.classify_query.ClassifyQueryTask object at 0x79d79039af20>, 'ClassifyMultistepQueryTask': <src.tasks.classify_multistep_query.ClassifyMultistepQueryTask object at 0x79d790398ee0>, 'RetrieveDocumentsTask': <src.tasks.retrieve_documents.RetrieveDocumentsTask object at 0x79d790398e50>, 'FilterDocumentsTask': <src.tasks.filter_documents.FilterDocumentsTask object at 0x79d790398a60>, 'ReRankingTask': <src.tasks.rerank_documents.ReRankingTask object at 0x79d79039bd90>, 'GenerateResponseTask': <src.tasks.generate_response.GenerateResponseTask object at 0x79d79039abf0>, 'GradeResponseTask': <src.tasks.grade_response.GradeResponseTask object at 0x79d79039b880>, 'ComplexQuer

In [27]:

from src.models.context import Context
from src.models.query import Query

results = []
for instance in dataset:

  query = instance['query']
  context = Context(query=Query(text=query))
  
  response = await text_generation_service.generate_text(user_prompt=query, system_prompt="", temperature=0.7)
  results.append(response)

print(results)

import json

with open('results.json', 'w') as json_file:
  json.dump(results, json_file, indent=4)

['The muscular tube that transports food from the mouth to the stomach is called the esophagus. The esophagus is a muscular tube that uses peristalsis, a wave-like motion, to propel food into the stomach.\n\nAs for chemical digestion, some limited chemical digestion does occur in the esophagus, but it is not a significant site of digestion. The enzymes present in the saliva, such as amylase and lipase, continue to break down carbohydrates and fats during the passage of food through the esophagus. However, the main site of chemical digestion occurs in the mouth, where saliva enzymes break down food into smaller molecules, and in the small intestine, where pancreatic enzymes and intestinal enzymes further break down carbohydrates, proteins, and fats.\n\nThe stomach, which receives the food from the esophagus, is a major site of mechanical digestion, where food is mixed with stomach acid and digestive enzymes to break down proteins and fats into smaller molecules.', 'The two main types of

In [28]:

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score
from rouge import Rouge
import nltk
nltk.download('wordnet')

rouge = Rouge()

rouge_l_scores = []
bleu_scores = []
meteor_scores = []

def automatic_evaluation(query, response, ground_truth):
    rouge_scores = rouge.get_scores(response, ground_truth)
    bleu = sentence_bleu([ground_truth.split()], response.split())
    meteor = single_meteor_score(ground_truth.split(), response.split())
    P, R, F1 = score([response], [ground_truth], lang="en", verbose=True)

    return {
        "rouge_l": rouge_scores[0]['rouge-l']['f'],
        "bleu": bleu,
        "meteor": meteor,
        "bert_score": F1
    }
    
scores = []

for instance, result in zip(dataset, results):
    query = instance["query"]
    response = result
    ground_truth = instance['ground_truth']
    scores.append(automatic_evaluation(query, response, ground_truth))

[nltk_data] Downloading package wordnet to /home/buddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 91.91it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1143.17it/s]


done in 0.01 seconds, 73.11 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 58.96it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1113.14it/s]


done in 0.02 seconds, 50.01 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 63.36it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1087.17it/s]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


done in 0.02 seconds, 54.64 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 73.66it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1069.70it/s]


done in 0.02 seconds, 61.81 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 60.56it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1008.00it/s]


done in 0.02 seconds, 52.10 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 169.49it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1124.18it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


done in 0.01 seconds, 120.07 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 64.07it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1038.19it/s]


done in 0.02 seconds, 54.35 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 50.35it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 943.81it/s]


done in 0.02 seconds, 43.54 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 111.61it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1216.09it/s]


done in 0.01 seconds, 85.49 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 50.33it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 1025.50it/s]

done in 0.02 seconds, 43.85 sentences/sec





In [29]:
for score in scores:
    print(score)


sum_rouge_l = 0
sum_bleu = 0
sum_meteor = 0
sum_bert = 0

# Calculate the sum for each score type
for score in scores:
    sum_rouge_l += score['rouge_l']
    sum_bleu += score['bleu']
    sum_meteor += score['meteor']
    sum_bert += score['bert_score']

# Calculate the mean for each score type
num_scores = len(scores)
mean_rouge_l = sum_rouge_l / num_scores
mean_bleu = sum_bleu / num_scores
mean_meteor = sum_meteor / num_scores
mean_bert = sum_bert / num_scores

# Print the mean scores
print(f"Mean ROUGE-L: {mean_rouge_l}")
print(f"Mean BLEU: {mean_bleu}")
print(f"Mean METEOR: {mean_meteor}")
print(f"Mean BERT Score: {mean_bert}")

{'rouge_l': 0.35999999680000005, 'bleu': 0.11264407914737562, 'meteor': 0.42623495070219625, 'bert_score': tensor([0.8943])}
{'rouge_l': 0.23809523531746032, 'bleu': 0.03967366136148311, 'meteor': 0.24965564738292018, 'bert_score': tensor([0.8651])}
{'rouge_l': 0.26600984850396764, 'bleu': 0.056563164522335674, 'meteor': 0.29163921826965306, 'bert_score': tensor([0.8838])}
{'rouge_l': 0.15384615113727815, 'bleu': 1.8050399273508783e-155, 'meteor': 0.15294117647058827, 'bert_score': tensor([0.8529])}
{'rouge_l': 0.2696629181889913, 'bleu': 0.04950079826119334, 'meteor': 0.3312005940273669, 'bert_score': tensor([0.8720])}
{'rouge_l': 0.5909090862809917, 'bleu': 0.3075189572962212, 'meteor': 0.49188906331763477, 'bert_score': tensor([0.9384])}
{'rouge_l': 0.07741935249614991, 'bleu': 7.912046311067353e-232, 'meteor': 0.09414225941422596, 'bert_score': tensor([0.8252])}
{'rouge_l': 0.27966101370870444, 'bleu': 0.0434988839554199, 'meteor': 0.30200593471810094, 'bert_score': tensor([0.8530]