In [2]:
import torch
import warnings
import pandas as pd
import numpy as np
import nltk


from datasets import Dataset, DatasetDict, load_metric, load_dataset

from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
from evaluate import load


In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
nltk.download('punkt')
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Data WOT

In [4]:
df = pd.read_excel('../WOT_internal_refined_6.xlsx')

df.to_csv('temp.csv')
df = pd.read_csv('temp.csv').rename(columns={'Unnamed: 0': 'id'}).drop(columns=['Unnamed: 0.1', 'Unnamed: 0.2', 'Unnamed: 0.3'])
df['id'] = df['id'].astype(str)

df['output'] = df['output'].fillna('unanswerable')

df_comp = df[['id', 'labels', 'input', 'output']].fillna('')

df_val = df_comp.sample(150)
train_test_ind = list(set(df_comp['id']) - set(df_val['id']))
df_train_test = df_comp[df_comp['id'].isin(train_test_ind)]

df_train_1_shot = df_train_test.sample(1)
df_train_64_shot =  df_train_test.sample(64)
df_train_128_shot = df_train_test.sample(128)
df_train_256_shot = df_train_test.sample(256)

test_ind_1_shot = list(set(df_train_test['id']) - set(df_train_1_shot['id']))
test_ind_64_shot = list(set(df_train_test['id']) - set(df_train_64_shot['id']))
test_ind_128_shot = list(set(df_train_test['id']) - set(df_train_128_shot['id']))
test_ind_256_shot = list(set(df_train_test['id']) - set(df_train_256_shot['id']))

df_test_1_shot = df_train_test[df_train_test['id'].isin(test_ind_1_shot)]
df_test_64_shot = df_train_test[df_train_test['id'].isin(test_ind_64_shot)]
df_test_128_shot = df_train_test[df_train_test['id'].isin(test_ind_128_shot)]
df_test_256_shot = df_train_test[df_train_test['id'].isin(test_ind_256_shot)]

ds_val = Dataset.from_pandas(df_val)

ds_test_0_shot = Dataset.from_pandas(df_train_test)
ds_0_shot = DatasetDict({'test': ds_test_0_shot, 'validation': ds_val})

ds_train_1_shot = Dataset.from_pandas(df_train_1_shot)
ds_test_1_shot = Dataset.from_pandas(df_test_1_shot)
ds_1_shot = DatasetDict({'train': ds_train_1_shot, 'test': ds_test_1_shot, 'validation': ds_val})

ds_train_64_shot = Dataset.from_pandas(df_train_64_shot)
ds_test_64_shot = Dataset.from_pandas(df_test_64_shot)
ds_64_shot = DatasetDict({'train': ds_train_64_shot, 'test': ds_test_64_shot, 'validation': ds_val})

ds_train_128_shot = Dataset.from_pandas(df_train_128_shot)
ds_test_128_shot = Dataset.from_pandas(df_test_128_shot)
ds_128_shot = DatasetDict({'train': ds_train_128_shot, 'test': ds_test_128_shot, 'validation': ds_val})

ds_train_256_shot = Dataset.from_pandas(df_train_256_shot)
ds_test_256_shot = Dataset.from_pandas(df_test_256_shot)
ds_256_shot = DatasetDict({'train': ds_train_256_shot, 'test': ds_test_256_shot, 'validation': ds_val})

In [8]:
df_comp.loc[0, 'input']

'How much cream cheese and other ingredients will I need?SPLITInstructions: Title:\ncream cheese and cashew dip\n\n|Description:\n smooth cashew butter blended with softened cream cheese and spiced with a bit of cinnamon and nutmeg makes a great dip for fresh fruit. add vanilla, honey or other spices, if you like. serve with your favorite fruits or crunchy vegetables.\n\n|Ingredients:\n1 (8.0-ounce) package cream cheese, softened\n1/2 cup smooth cashew butter\n2 tablespoons agave nectar, optional\nground cinnamon and/or grated nutmeg, to taste\n\n|Steps:\nblend all ingredients together in a blender or food processor.;\ntransfer to a bowl and serve with fruits or vegetables of your choice.\n\n\nHistory: nan\n'

## Load pre fine tune Datasets

### DoQA

In [4]:
df_doqa = pd.read_excel('../Data/doqa_wot_format.xlsx').drop(columns=['Unnamed: 0'])
df_doqa = df_doqa.fillna('')

ds_doqa_train = Dataset.from_pandas(df_doqa[df_doqa['split']=='train'].reset_index())
ds_doqa_eval = Dataset.from_pandas(df_doqa[df_doqa['split']=='validation'].reset_index())
ds_doqa_test = Dataset.from_pandas(df_doqa[df_doqa['split']=='test'].reset_index())
ds_doqa = DatasetDict({'train': ds_doqa_train, 'test': ds_doqa_test, 'validation': ds_doqa_eval})

### CoQA

In [5]:
ds_coqa = load_dataset("coqa")
df_coqa_train = pd.DataFrame(ds_coqa['train'])
df_coqa_train['data_split'] = ['train'] * len(df_coqa_train)
df_coqa_eval = pd.DataFrame(ds_coqa['validation'])
df_coqa_eval['data_split'] = ['validation'] * len(df_coqa_eval)

df_coqa = pd.concat([df_coqa_train, df_coqa_eval]).reset_index()

source_id = []
source = []
context = []
questions = []
answers = []
history = []
ans_start = []
ans_end = []
data_split = []

prev_utterances = 4


def create_conv(questions, answers):
    out = [''] * 2 * len(questions)
    out[::2] = questions
    out[1::2] = answers
    return out

df_coqa['conversation'] = df_coqa.apply(lambda row: create_conv(row['questions'], row['answers']['input_text']), axis=1)

for i, row in df_coqa.iterrows():
    for j, question in enumerate(row['questions']):
        context.append(row['story'])
        questions.append(question)
        ans_start = row['answers']['answer_start'][j]
        ans_end = row['answers']['answer_end'][j]
        answers.append(row['story'][ans_start:ans_end])
        history.append(row['conversation'][max(0, j*2-4):j*2])
        source_id.append(i)
        source.append(row['source'])
        data_split.append(row['data_split'])
     
df_coqa_wot = pd.DataFrame(data={'source_id': source_id, 'data_split': data_split, 'source': source, 'context': context, 'question': questions, 'answer':answers, 'history': history})

df_coqa_wot['input'] = df_coqa_wot.apply(lambda row: f"{row['question']}SPLITContext: {row['context']}\nHistory: {row['history']}\n", axis=1)
df_coqa_wot['output'] = df_coqa_wot['answer']

df_coqa_wot = df_coqa_wot[(df_coqa_wot['output'].apply(lambda x: len(x)) > 0) & (df_coqa_wot['output'].apply(lambda x: len(x.split(' '))) < 20)]

ds_coqa_train = Dataset.from_pandas(df_coqa_wot[df_coqa_wot['data_split']=='train'].reset_index())
ds_coqa_eval = Dataset.from_pandas(df_coqa_wot[df_coqa_wot['data_split']=='validation'].reset_index())

ds_coqa = DatasetDict({'train': ds_coqa_train,
                     'validation': ds_coqa_eval})

Found cached dataset coqa (/home/ubuntu/.cache/huggingface/datasets/coqa/default/1.0.0/1b03a32914e882ed315577005c472665e542419f910bab445815ad1929a7958f)


  0%|          | 0/2 [00:00<?, ?it/s]

### QuAC

In [6]:
ds_quac = load_dataset("quac")
df_quac_train = pd.DataFrame(ds_quac['train'])
df_quac_train['data_split'] = ['train'] * len(df_quac_train)
df_quac_eval = pd.DataFrame(ds_quac['validation'])
df_quac_eval['data_split'] = ['validation'] * len(df_quac_eval)
df_quac = pd.concat([df_quac_train, df_quac_eval]).reset_index()

source_id = []
source = []
context = []
titles = []
questions = []
answers = []
answer_starts = []
history = []
ans_start = []
ans_end = []
data_split = []

prev_utterances = 4

def create_conv(questions, answers):
    out = [''] * 2 * len(questions)
    out[::2] = questions
    out[1::2] = [a[0] for a in answers]
    return out

df_quac['conversation'] = df_quac.apply(lambda row: create_conv(row['questions'], row['answers']['texts']), axis=1)

for i, row in df_quac.iterrows():
    for j, question in enumerate(row['questions']):
        context.append(row['context'])
        titles.append(row['section_title'])
        questions.append(question)
        ans_start = row['answers']['answer_starts'][j][0]
        answer_starts.append(ans_start)
        answers.append(row['answers']['texts'][j][0])
        history.append(row['conversation'][max(0, j*2-4):j*2])
        source_id.append(row['dialogue_id'])
        source.append(row['wikipedia_page_title'])
        data_split.append(row['data_split'])

df_quac_wot = pd.DataFrame(data={'source_id': source_id,
                                 'data_split': data_split, 
                                 'source': source, 
                                 'title': titles,
                                 'context': context, 
                                 'question': questions,
                                 'answer':answers, 
                                 'answer_start': answer_starts,
                                 'history': history})

df_quac_wot['input'] = df_quac_wot.apply(lambda row: f"{row['question']}SPLITTitle: {row['title']}\nContext: {row['context']}\nHistory: {row['history']}\n", axis=1)
df_quac_wot['output'] = df_quac_wot['answer'].apply(lambda x: x.replace('CANNOTANSWER', 'unanswerable'))

ds_quac_train = Dataset.from_pandas(df_quac_wot[df_quac_wot['data_split']=='train'].reset_index())
ds_quac_eval = Dataset.from_pandas(df_quac_wot[df_quac_wot['data_split']=='validation'].reset_index())

ds_quac = DatasetDict({'train': ds_quac_train,
                     'validation': ds_quac_eval})   

Found cached dataset quac (/home/ubuntu/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6)


  0%|          | 0/2 [00:00<?, ?it/s]

## Load Model & other Functions

In [4]:
model_name = "allenai/unifiedqa-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model_dir = f"Models/{model_name}/Data_Strat/"

In [5]:
prefix = "question: "
max_input_length = 512
max_target_length = 50

def clean_text(text):
    question = nltk.sent_tokenize(text.split('SPLIT')[0])
    sentences = nltk.sent_tokenize(text.split('SPLIT')[1])
    text_cleaned = "\n ".join([" ".join(question).lower(), " ".join(sentences).lower()])
    return text_cleaned

def preprocess_data(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def preprocess_test(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                            padding="max_length")
    return model_inputs

metric = load_metric('rouge')
exact_match_metric = load("exact_match")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels_raw = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels_raw]
    decoded_labels_list = [["\n".join(nltk.sent_tokenize(label.strip()))] 
                      for label in decoded_labels_raw]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    
    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    #result['bleu'] = bleu.compute(predictions=decoded_preds, references=decoded_labels_list)['bleu']
    result['exact_match'] = exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels)['exact_match']
    #result_bert_score = bert_score.compute(predictions=decoded_preds, references=decoded_labels_list, lang="en")['f1']
    #result['bert_score (avg. F1)'] = sum(result_bert_score) / len(result_bert_score)
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    with open('/home/ubuntu/QuestionAnswering/logging.txt', 'a') as log:
        log.write("\n" + str({k: round(v, 4) for k, v in result.items()}))
        
    return {k: round(v, 4) for k, v in result.items()}

## Training Parameters

In [6]:
per_device_eval_batch_size = 64     #64
per_device_train_batch_size = 16     #16
gradient_accumulation_steps = 4     # per_device_train_batch_size * gradient_accumulation_steps=64 change here to be 64

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
    max_steps=5000,
    optim='adafactor'
)

data_collator = DataCollatorForSeq2Seq(tokenizer)

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

## Pre Fine Tune

In [10]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
model_checkpoint = model_name
model_dir = model_dir
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
tokenized_train = ds_doqa['train'].map(preprocess_data, batched=True)
tokenized_dev = ds_doqa['validation'].map(preprocess_data, batched=True)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

print(f"MODEL_DIR: {model_dir}\nMODEL_CHECKPOINT:{model_checkpoint}")

Map:   0%|          | 0/4612 [00:00<?, ? examples/s]



Map:   0%|          | 0/911 [00:00<?, ? examples/s]

MODEL_DIR: Models/allenai/unifiedqa-t5-base/Data_Strat/
MODEL_CHECKPOINT:allenai/unifiedqa-t5-base


In [11]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_leng

Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: history, output, context, question, answer_start, input, index, split, answer. If history, output, context, question, answer_start, input, index, split, answer are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 911
  Batch size = 64
***** Running Evaluation *****
  Num examples = 911
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config Generati

KeyboardInterrupt: 

## Fine Tune

In [7]:
# Set to wherever finetuning continues and where it is supposed 
# to be stored

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
model_checkpoint = "./Models/allenai/unifiedqa-t5-base/Data_Strat/DoQA_PreTuned"
model_dir = model_dir
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
tokenized_train = ds_128_shot['train'].map(preprocess_data, batched=True)
tokenized_dev = ds_128_shot['validation'].map(preprocess_data, batched=True)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

print(f"MODEL_DIR: {model_dir}\nMODEL_CHECKPOINT:{model_checkpoint}")

Map:   0%|          | 0/128 [00:00<?, ? examples/s]



Map:   0%|          | 0/150 [00:00<?, ? examples/s]

MODEL_DIR: Models/allenai/unifiedqa-t5-base/Data_Strat/
MODEL_CHECKPOINT:./Models/allenai/unifiedqa-t5-base/Data_Strat/DoQA_PreTuned


In [8]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

loading configuration file ./Models/allenai/unifiedqa-t5-base/Data_Strat/DoQA_PreTuned/config.json
Model config T5Config {
  "_name_or_path": "./Models/allenai/unifiedqa-t5-base/Data_Strat/DoQA_PreTuned",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_b

Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, output, __index_level_0__, input. If id, output, __index_level_0__, input are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Saving model checkpoint to Models/allenai/unifiedqa-t5-base/Data_Strat/checkpoint-100
Configuration saved 

KeyboardInterrupt: 

## Evaluate

In [22]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
model_checkpoint = "./Models/allenai/unifiedqa-t5-base/Data_Strat/CoQA_PreTuned"

model_checkpoint = "allenai/unifiedqa-t5-base"
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

test_tokenized_dataset = ds_val.map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=64)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_leng

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'rouge1': 15.6519,
 'rouge2': 9.4988,
 'rougeL': 14.9918,
 'rougeLsum': 15.1247,
 'exact_match': 0.0267,
 'gen_len': 6.3533}