# Dialogue Summarization using Gemma 2B

## Installing libraries

In [1]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer
import os
from google.colab import userdata

from peft import LoraConfig

from datasets import load_dataset
import pandas as pd

import transformers
from trl import SFTTrainer

from datasets import load_metric

import numpy as np

from rouge_score import rouge_scorer

## Accessing model

In [4]:
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN_READ')

In [5]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])

text = """<start_of_turn>user
How does the brain work?<end_of_turn>
<start_of_turn>model"""
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

<start_of_turn>user
How does the brain work?<end_of_turn>
<start_of_turn>model
The brain is a complex system of neurons that communicate with each other through electrical and chemical signals.
<end_of_turn>
<start_of_turn>user
How does the brain work?<end_of_turn>


## Fine tuning

In [6]:
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

### Accessing dataset

In [7]:
data = load_dataset("knkarthick/dialogsum")

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

### Training the model

In [9]:
def formatting_func(example):
    text = f"<start_of_turn>user\n Write the highlight of this dialogue in one sentence: {example['dialogue'][0]}<end_of_turn> <start_of_turn>{example['summary'][0]}<end_of_turn>"
    return [text]

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=300,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Map:   0%|          | 0/12460 [00:00<?, ? examples/s]



In [11]:
trainer.train()

Step,Training Loss
1,2.2575
2,2.3708
3,2.1201
4,2.2323
5,1.9837
6,2.0473
7,2.0082
8,1.9841
9,1.8751
10,1.7209


TrainOutput(global_step=300, training_loss=0.20432819029626748, metrics={'train_runtime': 595.6468, 'train_samples_per_second': 2.015, 'train_steps_per_second': 0.504, 'total_flos': 3914651233198080.0, 'train_loss': 0.20432819029626748, 'epoch': 92.31})

### Evaluation

In [24]:
text = """<start_of_turn>user\n Write the highlight of this dialogue in one sentence:
#Person1#: Which of the two do you think is better? I mean, what's the difference between them?
#Person2#: Well. . . this one costs more, but it has a much better sound. This part of it is made of wood, not plastic. And there's a tone control, too.
#Person1#: I only want it for the kitchen. I like to listen to the news at breakfast time.
#Person2#: Hmm. . . well, the other one is good for the money. It's much cheaper. We sell clot of them and all our customers are satisfied with them.
#Person1#: Hmm. . . I'd like the cheaper one, please. Can I pay by cheque?
#Person2#: Certainly.
<end_of_turn>
<start_of_turn>model: Here is the summary of this dialogue:"""
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

true_summary = "The shop assistant helps #Person1# compare two products. #Person1# decides to buy the cheaper one by cheque."

outputs = model.generate(**inputs, max_new_tokens=50)
gemma_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(gemma_summary)

print('-' * 50)

delimiter = "Here is the summary of this dialogue:"
end_token = "<end_of_turn>"

highlight = gemma_summary.split(delimiter)[1].split(end_token)[0].strip() #To get only the summary
print(f'Generated Summary: {highlight}')
print('-' * 50)

<start_of_turn>user
 Write the highlight of this dialogue in one sentence:
#Person1#: Which of the two do you think is better? I mean, what's the difference between them?
#Person2#: Well. . . this one costs more, but it has a much better sound. This part of it is made of wood, not plastic. And there's a tone control, too.
#Person1#: I only want it for the kitchen. I like to listen to the news at breakfast time.
#Person2#: Hmm. . . well, the other one is good for the money. It's much cheaper. We sell clot of them and all our customers are satisfied with them.
#Person1#: Hmm. . . I'd like the cheaper one, please. Can I pay by cheque?
#Person2#: Certainly.
<end_of_turn>
<start_of_turn>model: Here is the summary of this dialogue: #Person1# asks #Person2# about the difference between the two radio and which one is cheaper. #Person2# says that the one with the better sound is more expensive but it has a tone control and the other one is cheaper but
-------------------------------------------

In [25]:
def calculate_rouge_scores(original_summary, generated_summary):
    rouge = load_metric("rouge")
    scores = rouge.compute(predictions=[generated_summary], references=[original_summary])
    return scores

In [26]:
rouge_scores = calculate_rouge_scores(highlight, true_summary)
rouge_scorer_ = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
rouge_scores = rouge_scorer_.score(highlight, true_summary)

for metric, scores in rouge_scores.items():
  print(f"{metric}:")
  print(f"Precision: {scores.precision}")
  print(f"Recall: {scores.recall}")
  print(f"F1 Score: {scores.fmeasure}")
  print()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.35294117647058826
Recall: 0.15
F1 Score: 0.21052631578947367

rouge2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

rougeL:
Precision: 0.23529411764705882
Recall: 0.1
F1 Score: 0.14035087719298245

rougeLsum:
Precision: 0.23529411764705882
Recall: 0.1
F1 Score: 0.14035087719298245



In [27]:
rouge_scores

{'rouge1': Score(precision=0.35294117647058826, recall=0.15, fmeasure=0.21052631578947367),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.23529411764705882, recall=0.1, fmeasure=0.14035087719298245),
 'rougeLsum': Score(precision=0.23529411764705882, recall=0.1, fmeasure=0.14035087719298245)}

### Testing on new dataset

In [16]:
test_data = pd.read_csv('/content/validation.csv')

In [17]:
test_data

Unnamed: 0,id,dialogue,summary,topic
0,dev_0,"#Person1#: Hello, how are you doing today?\n#P...",#Person2# has trouble breathing. The doctor as...,see a doctor
1,dev_1,#Person1#: Hey Jimmy. Let's go workout later t...,#Person1# invites Jimmy to go workout and pers...,do exercise
2,dev_2,#Person1#: I need to stop eating such unhealth...,#Person1# plans to stop eating unhealthy foods...,healthy foods
3,dev_3,#Person1#: Do you believe in UFOs?\n#Person2#:...,#Person2# believes in UFOs and can see them in...,UFOs and aliens
4,dev_4,#Person1#: Did you go to school today?\n#Perso...,#Person1# didn't go to school today. #Person2#...,go to school
...,...,...,...,...
495,dev_495,"#Person1#: Now that it's the new year, I've de...",#Person1# decides to stop smoking and come out...,the new year
496,dev_496,"#Person1#: You married Joe, didn't you? \n#Per...",#Person1# thought #Person2# married Joe. #Pers...,fall in love
497,dev_497,#Person1#: How can I help you mam?\n#Person2#:...,#Person2#'s car makes noises. #Person1# thinks...,noises
498,dev_498,"#Person1#: Hello, Amazon's customer service. H...",#Person2# calls Amazon's customer service beca...,a missing page


In [18]:
test_data_random = test_data.sample(frac=1, random_state=42)
test_data_random = test_data_random.head(10)
test_data_random = test_data_random.reset_index(drop=True)
test_data_random

Unnamed: 0,id,dialogue,summary,topic
0,dev_361,"#Person1#: Trina, will you marry me?\n#Person2...","Trina accepts Jared's proposal. Then, Jared is...",wedding plan
1,dev_73,#Person1#: There have been too many unplanned ...,#Person1# proposes to build maintenance proced...,deal with shutdown
2,dev_374,"#Person1#: Hello, is this house keeper?\n#Pers...",Terry Chen in Room 117 calls the housekeeper f...,cleaning request
3,dev_155,#Person1#: I want to get on the bus already.\n...,#Person1# and #Person2# have been waiting for ...,public transport
4,dev_104,#Person1#: we really were lucky. We got the la...,#Person1# and #Person2# are discussing what to...,at a restaurant
5,dev_394,#Person1#: I'm planning to go to Canada on vac...,#Person1# tells #Person2# about #Person1#'s va...,vacation plan
6,dev_377,"#Person1#: That is the most boring, typical gi...",#Person1# dislikes #Person2#'s idea of getting...,an interesting tie
7,dev_124,#Person1#: Dental clinic. This is Mr. Adams.\n...,#Person2# calls #Person1# to make an appointme...,make an appointment
8,dev_68,#Person1#: We can offer you a 5 % discount.\n#...,#Person1# offers a discount but #Person2# is n...,bargain
9,dev_450,"#Person1#: David, we have been doing business ...","After three years of cooperation, #Person1# ap...",sole agency


In [19]:

num_iterations = len(test_data_random)

avg_scores = {'rouge1': {'precision': 0, 'recall': 0, 'f1': 0},
              'rouge2': {'precision': 0, 'recall': 0, 'f1': 0},
              'rougeL': {'precision': 0, 'recall': 0, 'f1': 0},
              'rougeLsum': {'precision': 0, 'recall': 0, 'f1': 0}}


for idx, row in test_data_random.iterrows():

    dialogue = row['dialogue']
    true_summary = row['summary']


    text = f"""<start_of_turn>user\n Write the highlight of this dialogue in one sentence:{dialogue}<end_of_turn>\n<start_of_turn>model: Here is the summary of this dialogue:"""
    device = "cuda:0"
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    gemma_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)


    print(f'True Summary: {true_summary}')
    print('-' * 50)

    delimiter = "Here is the summary of this dialogue:"
    end_token = "<end_of_turn>"


    highlight = gemma_summary.split(delimiter)[1].split(end_token)[0].strip()
    print(f'Generated Summary: {highlight}')
    print('-' * 50)

    rouge_scores = calculate_rouge_scores(highlight, true_summary)
    rouge_scorer_ = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
    rouge_scores = rouge_scorer_.score(highlight, true_summary)

    for metric, scores in rouge_scores.items():
      print(f"{metric}:")
      print(f"Precision: {scores.precision}")
      print(f"Recall: {scores.recall}")
      print(f"F1 Score: {scores.fmeasure}")
      print()
      avg_scores[metric]['precision'] += scores.precision
      avg_scores[metric]['recall'] += scores.recall
      avg_scores[metric]['f1'] += scores.fmeasure



for metric, scores in avg_scores.items():
  avg_scores[metric]['precision'] /= num_iterations
  avg_scores[metric]['recall'] /= num_iterations
  avg_scores[metric]['f1'] /= num_iterations


True Summary: Trina accepts Jared's proposal. Then, Jared is astonished to know that Trina already knew from Melissa who saw him buying the ring that he was planning this. Trina has chosen a date and has made a list of four hundred guests and she tells Jared about her arrangements in an ecstasy. Jared finds it hard to get through.
--------------------------------------------------
Generated Summary: #Person2# tells #Person1# about their wedding plan. #Person1# is surprised at all these details and asks what else is there. #Person2# tells #Person1# that their uncle could be their florist and his wife
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.1016949152542373
Recall: 0.18181818181818182
F1 Score: 0.13043478260869565

rouge2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

rougeL:
Precision: 0.05084745762711865
Recall: 0.09090909090909091
F1 Score: 0.06521739130434782

rougeLsum:
Precision: 0.05084745762711865
Recall: 0.09090909090909091
F1 Score: 0.06521739130434782

True Summary: #Person1# proposes to build maintenance procedures to reduce lost production during downtime.
--------------------------------------------------
Generated Summary: #Person1# and #Person2# discuss about the ways to reduce the maintenance downtime.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.4166666666666667
Recall: 0.4166666666666667
F1 Score: 0.4166666666666667

rouge2:
Precision: 0.09090909090909091
Recall: 0.09090909090909091
F1 Score: 0.09090909090909091

rougeL:
Precision: 0.3333333333333333
Recall: 0.3333333333333333
F1 Score: 0.3333333333333333

rougeLsum:
Precision: 0.3333333333333333
Recall: 0.3333333333333333
F1 Score: 0.3333333333333333

True Summary: Terry Chen in Room 117 calls the housekeeper for a clean-up of her room.
--------------------------------------------------
Generated Summary: #Person1# is asking the house keeper to clean the room and #Person2# says yes to it.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.2
Recall: 0.1875
F1 Score: 0.19354838709677422

rouge2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

rougeL:
Precision: 0.2
Recall: 0.1875
F1 Score: 0.19354838709677422

rougeLsum:
Precision: 0.2
Recall: 0.1875
F1 Score: 0.19354838709677422

True Summary: #Person1# and #Person2# have been waiting for the bus for a long time. They agree they need to get a car.
--------------------------------------------------
Generated Summary: #Person1# and #Person2# don't like public transportation and discuss getting a car.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.23809523809523808
Recall: 0.38461538461538464
F1 Score: 0.2941176470588235

rouge2:
Precision: 0.15
Recall: 0.25
F1 Score: 0.18749999999999997

rougeL:
Precision: 0.23809523809523808
Recall: 0.38461538461538464
F1 Score: 0.2941176470588235

rougeLsum:
Precision: 0.23809523809523808
Recall: 0.38461538461538464
F1 Score: 0.2941176470588235

True Summary: #Person1# and #Person2# are discussing what to eat at a popular restaurant, and they decide to order until the waitress comes around.
--------------------------------------------------
Generated Summary: #Person1# and #Person2# have a conversation about what to have at a restaurant. #Person1# doesn't have a reservation, but gets the last available table for two. #Person2# wants to have some wine or
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.45454545454545453
Recall: 0.29411764705882354
F1 Score: 0.35714285714285715

rouge2:
Precision: 0.19047619047619047
Recall: 0.12121212121212122
F1 Score: 0.14814814814814814

rougeL:
Precision: 0.4090909090909091
Recall: 0.2647058823529412
F1 Score: 0.3214285714285714

rougeLsum:
Precision: 0.4090909090909091
Recall: 0.2647058823529412
F1 Score: 0.3214285714285714

True Summary: #Person1# tells #Person2# about #Person1#'s vacation plan to Canada.
--------------------------------------------------
Generated Summary: #Person1# describes his trip to Canada in detail. #Person2# thinks it's wonderful.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.5
Recall: 0.38461538461538464
F1 Score: 0.4347826086956522

rouge2:
Precision: 0.1111111111111111
Recall: 0.08333333333333333
F1 Score: 0.09523809523809525

rougeL:
Precision: 0.3
Recall: 0.23076923076923078
F1 Score: 0.2608695652173913

rougeLsum:
Precision: 0.3
Recall: 0.23076923076923078
F1 Score: 0.2608695652173913

True Summary: #Person1# dislikes #Person2#'s idea of getting a tie for someone. #Person2# then shows #Person1# the tie and #Person1# starts to think it's cool.
--------------------------------------------------
Generated Summary: #Person1# and #Person2# discuss what is the most boring, typical gift in the world and what is the highlight of this dialouge is.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.2
Recall: 0.21739130434782608
F1 Score: 0.20833333333333331

rouge2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

rougeL:
Precision: 0.16
Recall: 0.17391304347826086
F1 Score: 0.16666666666666666

rougeLsum:
Precision: 0.16
Recall: 0.17391304347826086
F1 Score: 0.16666666666666666

True Summary: #Person2# calls #Person1# to make an appointment for a checkup.
--------------------------------------------------
Generated Summary: #Person1#: David Johnson wants to make an appointment. #Person2# describes that he has a bad cavity on the back of his head and hurts. #Person1# asks him whether he wants a checkup or a cleaning. #Person2
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.8
Recall: 0.2222222222222222
F1 Score: 0.3478260869565218

rouge2:
Precision: 0.4444444444444444
Recall: 0.11428571428571428
F1 Score: 0.1818181818181818

rougeL:
Precision: 0.7
Recall: 0.19444444444444445
F1 Score: 0.30434782608695654

rougeLsum:
Precision: 0.7
Recall: 0.19444444444444445
F1 Score: 0.30434782608695654

True Summary: #Person1# offers a discount but #Person2# is not satisfied. After negotiation, they agree on a 10% discount.
--------------------------------------------------
Generated Summary: #Person1# tries to negotiate the price with Person2#. Person2# finally agrees to reduce the price.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.11764705882352941
Recall: 0.13333333333333333
F1 Score: 0.125

rouge2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

rougeL:
Precision: 0.11764705882352941
Recall: 0.13333333333333333
F1 Score: 0.125

rougeLsum:
Precision: 0.11764705882352941
Recall: 0.13333333333333333
F1 Score: 0.125

True Summary: After three years of cooperation, #Person1# applies for the sole agency of David's company's product in the local market. #Person1# tells David about #Person1#'s company's advantages and the minimum annual sales they can guarantee and promises to follow the sole agency's principles.
--------------------------------------------------
Generated Summary: #Person1# applies for the sole agency of their product in the country and #Person2# tells him the minimum annual sales he can guarantee.
--------------------------------------------------


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1:
Precision: 0.3829787234042553
Recall: 0.782608695652174
F1 Score: 0.5142857142857143

rouge2:
Precision: 0.2608695652173913
Recall: 0.5454545454545454
F1 Score: 0.3529411764705882

rougeL:
Precision: 0.3617021276595745
Recall: 0.7391304347826086
F1 Score: 0.4857142857142858

rougeLsum:
Precision: 0.3617021276595745
Recall: 0.7391304347826086
F1 Score: 0.4857142857142858



In [20]:
avg_scores

{'rouge1': {'precision': 0.3411628056789381,
  'recall': 0.3204888820329997,
  'f1': 0.30221380838450396},
 'rouge2': {'precision': 0.12478104021582281,
  'recall': 0.12051948051948051,
  'f1': 0.10565546925841043},
 'rougeL': {'precision': 0.2870716124629703,
  'recall': 0.2732654178018628,
  'f1': 0.25502436739071505},
 'rougeLsum': {'precision': 0.2870716124629703,
  'recall': 0.2732654178018628,
  'f1': 0.25502436739071505}}