# DD2417 Text Summarizer

## Imports

In [1]:
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, DatasetDict
import torch
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Setup

### Global Variables

In [2]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint directory that both the tokenizer and the model are loaded from.
model_path = "./base/checkpoint-16500"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)

### Dataset

In [3]:
# Validation split of the "sep" configuration of WikiHow, read from ./data and
# narrowed to the two columns the preprocessing step reads.
dataset = load_dataset(
    path="wikihow",
    name="sep",
    data_dir="./data",
    split="validation",
    trust_remote_code=True,
).select_columns(["text", "headline"])

# Text prepended to every input before tokenization.
prefix = "summarize: "
# Truncation limits (in tokens) for the inputs and for the target headlines.
max_input_length = 512
max_target_length = 64


def preprocess_function(dataset):
    """Tokenize one batch of examples for the seq2seq model.

    Args:
        dataset: batch mapping with a "text" column (inputs) and a
            "headline" column (targets), each an iterable of strings.

    Returns:
        The tokenizer output for the prefixed inputs, with the target token
        ids stored under the "labels" key.
    """
    prompts = []
    for text in dataset["text"]:
        prompts.append(prefix + text)

    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)
    encoded_targets = tokenizer(
        text_target=dataset["headline"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded


# Tokenize the whole split in batches.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

# Collator used by the trainer to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### Evaluation

In [4]:
# ROUGE metric object used by compute_metrics below.
rouge = evaluate.load(path="rouge")


def compute_metrics(eval_pred):
    """Score generated summaries against the reference headlines with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is taken as the
            generated ids.

    Returns:
        dict holding every value returned by ``rouge.compute`` plus
        ``gen_len``, the mean count of non-padding tokens per prediction.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding sentinel, not a vocabulary id. Labels always need it
    # mapped back to the pad token; generated ids are given the same treatment
    # because they can be padded with -100 when batches of different lengths
    # are gathered. Leaving it in would break decoding and would be counted as
    # a real token in gen_len. This is a no-op when no -100 is present.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return dict(result)

In [5]:
# Request bf16 unless a CUDA device is present that cannot do bfloat16; in that
# one case building the arguments with bf16=True would fail, so fall back to
# full precision. Without CUDA the original bf16=True request is kept.
use_bf16 = not torch.cuda.is_available() or torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir="baseEval",
    per_device_eval_batch_size=16,
    # Evaluate on generated sequences so compute_metrics receives token ids.
    predict_with_generate=True,
    bf16=use_bf16,
)

# Evaluation-only trainer: no train_dataset is supplied.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

results = trainer.evaluate()
print(results)



{'eval_loss': 1.7703744173049927, 'eval_rouge1': 0.3512824206395214, 'eval_rouge2': 0.16550273373368588, 'eval_rougeL': 0.3403182050233964, 'eval_rougeLsum': 0.3402681436000655, 'eval_gen_len': 7.53627544026152, 'eval_runtime': 982.9138, 'eval_samples_per_second': 38.591, 'eval_steps_per_second': 2.412}
