In [3]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample reference and model responses
reference = """Regular exercise offers a wide range of benefits for both physical and mental well-being. Here's a comprehensive breakdown:

- **Improved Mood & Stress Reduction**: Exercise boosts endorphin levels, which not only improves mood but also helps alleviate stress, making you feel energized and positive.
- **Stronger Body & Bones**: Engaging in regular physical activity helps build muscle mass and increases bone density, protecting against age-related decline and promoting overall physical strength.
- **Boosted Immunity**: Physical activity strengthens the immune system, making it more effective in fighting off illness and keeping you healthier.
- **Better Sleep**: Consistent exercise helps regulate sleep patterns, leading to deeper, more restorative sleep cycles.
- **Weight Management**: Exercise burns calories, boosts metabolism, and helps in maintaining a healthy weight.

In essence, exercise fuels both your body and mind, contributing to a healthier and more balanced lifestyle. Whether you're looking for better sleep, enhanced immunity, or improved mood, regular physical activity is key."""


model_responses = {
    "Model 1": """Regular exercise delivers a wealth of benefits! Think:

Improved mood & reduced stress: Endorphins flood your brain, giving you a natural energy boost and positive feelings.
Stronger body & bones: Builds muscle mass & bone density, combating age-related decline.
Boosted immunity: Exercise helps the immune system fight off illness, keeping you healthier overall.
Better sleep: Regular exercise can regulate your sleep cycle for deeper, more restorative nights.
Weight management: Burns calories and boosts metabolism, making it easier to maintain a healthy weight.
Essentially, moving regularly fuels you to live better! 💪
""",

    "Model 2": """As a Logical Reasoning Expert, I've analyzed the context provided by Gemma and identified the key benefits of regular exercise:

The benefits of regular exercise can be summarized as follows:

Improved mood and reduced stress
Stronger body and bones
Boosted immunity
Better sleep
Weight management
These benefits are supported by evidence from various scientific studies, which consistently demonstrate the positive impact of physical activity on overall health and well-being.
""",

    "Model 3": """As an Engagement Expert with a keen interest in promoting physical activity for overall well-being, 
    I can confidently state that engaging regularly in exercises offers numerous advantages such as: 
    enhanced mood and stress reduction; fortified body strength and bone density to prevent age-related decline; 
    heightened immunity against illnesses; regulated sleep cycles leading to deeper restoration at night. 
    Additionally, it aids in maintaining an ideal weight through calorie burning and metabolism increase 💪 Exercise truly fuels us for better living!
""",

    "Model 4": """As a Creative Expert, I can frame the benefits of regular exercise with vivid storytelling:

Recharge: Endorphins energize, boosting mood and combating stress.
Sturdier You: Muscles get stronger, bones denser; aging gracefully becomes easier.
Health Shield: Immune system gets stronger against sickness.
Sweet Dreams: Regular workouts help you sleep better at night.
Leaner Me: Burns calories and boosts metabolism for maintaining a healthy weight.
""",

    "CoRE": """The Benefits of Regular Exercise: A Holistic Approach to Overall Well-being

Regular exercise is a powerful tool for achieving overall health and well-being. As various experts have highlighted, incorporating physical activity into your lifestyle can bring about numerous benefits, which can be summarized as follows:
Improved Mood and Reduced Stress: Regular exercise has been shown to increase the production of endorphins, also known as "feel-good" hormones, which can help alleviate stress and anxiety.
Stronger Body and Bones: Exercise helps build muscle mass and bone density, combating age-related decline and reducing the risk of osteoporosis.
Boosted Immunity: Physical activity can strengthen the immune system, helping to fight off illness and disease.
Better Sleep: Regular exercise can regulate sleep patterns, leading to deeper, more restorative nights' sleep.
Weight Management: Exercise burns calories and boosts metabolism, making it easier to maintain a healthy weight.
In addition to these physical benefits, regular exercise has been shown to have mental health benefits as well. It can help reduce symptoms of depression and anxiety, improve cognitive function, and even increase self-esteem.
Overall, incorporating regular exercise into your lifestyle can have a profound impact on both physical and mental health. Whether you're looking to boost your mood, build stronger muscles, or simply get better sleep, the benefits of regular exercise are undeniable.
As Gemma, the Conversational Expert, so aptly put it: "Regular exercise fuels you to live better!""",

}

In [5]:
# ROUGE Score (only recall and F1, no precision)
scorer = rouge_scorer.RougeScorer(['rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {}
for model, response in model_responses.items():
    rouge_scores[model] = scorer.score(reference, response)
    # Only keep recall and F1 for comparison
    rouge_scores[model] = {
        'rouge2': {'recall': rouge_scores[model]['rouge2'].recall, 'fmeasure': rouge_scores[model]['rouge2'].fmeasure},
        'rougeL': {'recall': rouge_scores[model]['rougeL'].recall, 'fmeasure': rouge_scores[model]['rougeL'].fmeasure}
    }

# Calculate averages of single models
rouge2_recall_avg = np.mean([rouge_scores[f'Model {i}']['rouge2']['recall'] for i in range(1, 5)])
rouge2_fmeasure_avg = np.mean([rouge_scores[f'Model {i}']['rouge2']['fmeasure'] for i in range(1, 5)])
rougeL_recall_avg = np.mean([rouge_scores[f'Model {i}']['rougeL']['recall'] for i in range(1, 5)])
rougeL_fmeasure_avg = np.mean([rouge_scores[f'Model {i}']['rougeL']['fmeasure'] for i in range(1, 5)])

# Compare MoRE with average single model scores
CoRE_rouge2_recall = rouge_scores['CoRE']['rouge2']['recall']
CoRE_rouge2_fmeasure = rouge_scores['CoRE']['rouge2']['fmeasure']
CoRE_rougeL_recall = rouge_scores['CoRE']['rougeL']['recall']
CoRE_rougeL_fmeasure = rouge_scores['CoRE']['rougeL']['fmeasure']


# BLEU Score
bleu_scores = {}
for model, response in model_responses.items():
    bleu_scores[model] = sentence_bleu([reference.split()], response.split())

# Calculate average BLEU for single models
bleu_avg = np.mean([bleu_scores[f'Model {i}'] for i in range(1, 5)])

# BERTScore
bert_scores = {}
for model, response in model_responses.items():
    P, R, F1 = score([response], [reference], lang="en")
    bert_scores[model] = F1.mean().item()

# Calculate average BERTScore for single models
bert_score_avg = np.mean([bert_scores[f'Model {i}'] for i in range(1, 5)])

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [6]:
# Output the results
print("ROUGE Scores (Single Models Average vs CoRE):")
print(f"\nAverage ROUGE-2 Recall: {rouge2_recall_avg:.4f}\nCoRE ROUGE-2 Recall: {CoRE_rouge2_recall:.4f}")
print(f"\nAverage ROUGE-2 F1: {rouge2_fmeasure_avg:.4f}\nCoRE ROUGE-2 F1: {CoRE_rouge2_fmeasure:.4f}")
print(f"\nAverage ROUGE-L Recall: {rougeL_recall_avg:.4f} \nCoRE ROUGE-L Recall: {CoRE_rougeL_recall:.4f}")
print(f"\nAverage ROUGE-L F1: {rougeL_fmeasure_avg:.4f} \nCoRE ROUGE-L F1: {CoRE_rougeL_fmeasure:.4f}")
print("---------------------------------------------------")
print("\nBLEU Scores (Single Models Average vs CoRE):")
print(f"\nAverage BLEU: {bleu_avg:.4f} \nCoRE BLEU: {bleu_scores['CoRE']:.4f}")
print("---------------------------------------------------")
print("\nBERTScore F1 (Single Models Average vs CoRE):")
print(f"\nAverage BERTScore: {bert_score_avg:.4f}\nCoRE BERTScore: {bert_scores['CoRE']:.4f}")

ROUGE Scores (Single Models Average vs CoRE):

Average ROUGE-2 Recall: 0.0970
CoRE ROUGE-2 Recall: 0.3421

Average ROUGE-2 F1: 0.1274
CoRE ROUGE-2 F1: 0.2680

Average ROUGE-L Recall: 0.2042 
CoRE ROUGE-L Recall: 0.5098

Average ROUGE-L F1: 0.2694 
CoRE ROUGE-L F1: 0.4000
---------------------------------------------------

BLEU Scores (Single Models Average vs CoRE):

Average BLEU: 0.0031 
CoRE BLEU: 0.1100
---------------------------------------------------

BERTScore F1 (Single Models Average vs CoRE):

Average BERTScore: 0.8748
CoRE BERTScore: 0.9018


In [8]:
# Calculate improvement percentage for MoRE compared to average single models
def calculate_improvement(new_value, avg_value):
    return ((new_value - avg_value) / avg_value) * 100 if avg_value != 0 else 0

# Improvement calculations
improvement_rouge2_recall = calculate_improvement(CoRE_rouge2_recall, rouge2_recall_avg)
improvement_rouge2_fmeasure = calculate_improvement(CoRE_rouge2_fmeasure, rouge2_fmeasure_avg)
improvement_rougeL_recall = calculate_improvement(CoRE_rougeL_recall, rougeL_recall_avg)
improvement_rougeL_fmeasure = calculate_improvement(CoRE_rougeL_fmeasure, rougeL_fmeasure_avg)
improvement_bleu = calculate_improvement(bleu_scores['CoRE'], bleu_avg)
improvement_bert = calculate_improvement(bert_scores['CoRE'], bert_score_avg)

# Results as a DataFrame
data = {
    'Metric': ['Human Eval','ROUGE-2 Recall', 'ROUGE-2 F1', 'ROUGE-L Recall', 'ROUGE-L F1', 'BLEU', 'BERTScore F1'],
    'Average of Single Models': [
        4.44, rouge2_recall_avg, rouge2_fmeasure_avg, 
        rougeL_recall_avg, rougeL_fmeasure_avg, 
        bleu_avg, bert_score_avg
    ],
    'CoRE': [
        7.30,CoRE_rouge2_recall, CoRE_rouge2_fmeasure, 
        CoRE_rougeL_recall, CoRE_rougeL_fmeasure, 
        bleu_scores['CoRE'], bert_scores['CoRE']
    ],
    'Improvement (%)': [
        64.41,improvement_rouge2_recall, improvement_rouge2_fmeasure,
        improvement_rougeL_recall, improvement_rougeL_fmeasure,
        improvement_bleu, improvement_bert
    ]
}

# Creating a DataFrame
df_results = pd.DataFrame(data)

# Display the results
print(df_results)

           Metric  Average of Single Models      CoRE  Improvement (%)
0      Human Eval                  4.440000  7.300000        64.410000
1  ROUGE-2 Recall                  0.097039  0.342105       252.542373
2      ROUGE-2 F1                  0.127449  0.268041       110.311928
3  ROUGE-L Recall                  0.204248  0.509804       149.600000
4      ROUGE-L F1                  0.269442  0.400000        48.455174
5            BLEU                  0.003145  0.109974      3396.730896
6    BERTScore F1                  0.874820  0.901769         3.080594
