# DD2417 Text Summarizer

## Imports

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from datasets import load_dataset
import torch
import numpy as np
import json
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
# Load the fine-tuned seq2seq checkpoint and its tokenizer from a local path,
# then place the model on the GPU when one is available.
model_path = "./base/checkpoint-16500"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

# Pass the device explicitly: the model has already been moved to `device`,
# and how a pipeline built without `device` picks its own device depends on
# the installed transformers version. Being explicit keeps the pipeline's
# input tensors on the same device as the model.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Validation split of the "wikihow" dataset ("sep" configuration), read from
# the local ./data directory.
dataset = load_dataset(
    "wikihow",
    "sep",
    data_dir="./data",
    split="validation",
    trust_remote_code=True,
)

# Draw a fixed-seed random sample of n distinct rows so the same examples are
# selected on every run.
np.random.seed(1)
samples = len(dataset)
n = 20
idx = np.random.choice(samples, size=n, replace=False)

# Keep only the two columns used below, restrict to the sampled rows, and
# convert to a plain dict of column -> list.
dataset = dataset.select_columns(["text", "headline"])
dataset = dataset.select(idx)
dataset_dict = dataset.to_dict()

### Dataset: generate summaries for manual evaluation

In [3]:
# Summarize each sampled article one at a time (greedy decoding, 3-50 output
# tokens) and collect the generated text in the same order as the inputs.
generated = []
for text in tqdm(dataset_dict["text"]):
    # truncation=True clips inputs that exceed the tokenizer's maximum length
    # instead of feeding over-long sequences to the model.
    summary = summarizer(
        text,
        max_length=50,
        min_length=3,
        do_sample=False,
        truncation=True,
    )
    summary_text = summary[0]["summary_text"]
    generated.append(summary_text)

# Store the generated summaries next to the source text and reference
# headline, then write everything to disk for manual inspection.
dataset_dict["generated"] = generated
# The file is only written, so plain "w" is enough; the encoding is pinned so
# the result does not depend on the platform's default locale.
with open("manual_eval.json", "w", encoding="utf-8") as f:
    json.dump(dataset_dict, f, indent=4)

  0%|          | 0/50 [00:00<?, ?it/s]Your max_length is set to 50, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
 30%|███       | 15/50 [00:45<01:58,  3.39s/it]Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
 40%|████      | 20/50 [00:59<01:34,  3.15s/it]Your max_length is set to 50, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
 44%|████▍     | 22/50 [01:04<01:15,  2.70s/it]Your max_length is set to 50, but your input_length is only 30. Since this is a

### Evaluation