# T5 - Book title generation

### Imports

In [1]:
import pandas as pd
import numpy as np
import transformers
import string
import nltk
import re
import evaluate
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import (
    load_dataset,
    load_metric,
    Dataset
)

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tommo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load data

In [2]:
df = pd.read_csv('./booksummaries.txt',
                sep='\t',
                names=[
                    'ID', 'Link', 'Title',
                    'Author', 'PubDate', 'Genre', 'Summary'],
                header=None,
                usecols=['Title', 'Summary'])


print(df.shape)
df.head(3)

(16559, 2)


Unnamed: 0,Title,Summary
0,Animal Farm,"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"Alex, a teenager living in near-future Englan..."
2,The Plague,The text of The Plague is divided into five p...


### Data preprocessing, engineering

The data is already cleaned (downloaded from <a href="https://www.kaggle.com/datasets/athu1105/book-genre-prediction">Kaggle</a>):

In [3]:
df.isnull().sum()

Title      0
Summary    0
dtype: int64

Create dataset and splitting data:

In [4]:
ds = Dataset.from_pandas(df[['Title', 'Summary']])
ds = ds.shuffle(seed=42)
ds = ds.train_test_split(0.2)

Load tokenizer:

In [5]:
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,
                                          model_max_length=512)

Filter dataset based on summary length (<20 would be too low and >500 would be too much for the T5 model):

In [6]:
ds['train'] = ds['train'].filter(
    lambda example: (len(example['Summary']) >= 500) and
    (len(example['Summary']) >= 20)
)
ds['test'] = ds['test'].filter(
    lambda example: (len(example['Summary']) >= 500) and
    (len(example['Summary']) >= 20)
)

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Encode inputs and outputs:

In [7]:
max_input_length = 512
max_target_length = 64

def clean_text(text):
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned
                                   if len(sent) > 0 and
                                   sent[-1] in string.punctuation]
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned

def preprocess_data(examples):
    inputs = [f'generate title: {clean_text(text)}' for text in examples["Summary"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(examples["Title"],
                       max_length=max_target_length,
                       truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ds = ds.map(preprocess_data, batched=True)
tokenized_ds = tokenized_ds.remove_columns(['Summary', 'Title'])

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

### Fine tuning

In [8]:
batch_size = 4 # GPU limitation
model_name = "t5-base-book-title-generation-V1"
model_dir = f"models/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

data_collator = DataCollatorForSeq2Seq(tokenizer)
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    result = {key: value * 100 for key, value in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file config.json from cache at C:\Users\tommo/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

In [9]:
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

In [10]:
trainer.train()

loading configuration file config.json from cache at C:\Users\tommo/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,3.3919,2.651303,31.6982,17.1045,31.5075,31.5526,5.1954
200,2.8881,2.602286,32.4981,17.3788,32.2152,32.275,5.2018
300,2.7356,2.601072,32.298,17.4716,32.0654,32.0931,5.2438
400,2.8373,2.547202,32.798,17.5216,32.5768,32.6331,5.0246
500,2.7215,2.533113,32.7508,17.3038,32.4839,32.5266,5.1986
600,2.5724,2.522356,32.9241,17.4144,32.5644,32.6057,5.521
700,2.7677,2.531556,33.6225,17.759,33.3624,33.4032,5.2249
800,2.6345,2.521943,32.9882,17.8454,32.7522,32.8295,5.3431
900,2.6382,2.507135,33.6243,17.8089,33.2959,33.318,5.7057
1000,2.6826,2.503758,33.1797,17.8172,32.9081,32.9493,5.4185


***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-100
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-100\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-100\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-title-generation-V1\checkpoint-100\tokenizer_config.json
Special tokens file saved in models/t5-base-book-title-generation-V1\checkpoint-100\special_tokens_map.json
Copy vocab file to models/t5-base-book-title-generation-V1\checkpoint-100\spiece.model
***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-200
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-200\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-200\pytorch_model.bin
tokenizer config file

Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-1200\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-title-generation-V1\checkpoint-1200\tokenizer_config.json
Special tokens file saved in models/t5-base-book-title-generation-V1\checkpoint-1200\special_tokens_map.json
Copy vocab file to models/t5-base-book-title-generation-V1\checkpoint-1200\spiece.model
Deleting older checkpoint [models\t5-base-book-title-generation-V1\checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-1300
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-1300\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-1300\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-title-generation-V1\checkpoint-1300\tokenizer_config.json
Special tokens file saved in model

Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-2300\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-2300\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-title-generation-V1\checkpoint-2300\tokenizer_config.json
Special tokens file saved in models/t5-base-book-title-generation-V1\checkpoint-2300\special_tokens_map.json
Copy vocab file to models/t5-base-book-title-generation-V1\checkpoint-2300\spiece.model
Deleting older checkpoint [models\t5-base-book-title-generation-V1\checkpoint-2100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-2400
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-2400\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-2400\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-t

***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-3400
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-3400\config.json
Model weights saved in models/t5-base-book-title-generation-V1\checkpoint-3400\pytorch_model.bin
tokenizer config file saved in models/t5-base-book-title-generation-V1\checkpoint-3400\tokenizer_config.json
Special tokens file saved in models/t5-base-book-title-generation-V1\checkpoint-3400\special_tokens_map.json
Copy vocab file to models/t5-base-book-title-generation-V1\checkpoint-3400\spiece.model
Deleting older checkpoint [models\t5-base-book-title-generation-V1\checkpoint-3100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2810
  Batch size = 4
Saving model checkpoint to models/t5-base-book-title-generation-V1\checkpoint-3500
Configuration saved in models/t5-base-book-title-generation-V1\checkpoint-3500\config.j

KeyboardInterrupt: 

### Test custom title generations

In [None]:
tokenizer = AutoTokenizer.from_pretrained('./models/t5-base-book-title-generation-V1/checkpoint-3700/')
model = AutoModelForSeq2SeqLM.from_pretrained('./models/t5-base-book-title-generation-V1/checkpoint-3700/').to('cuda')

In [11]:
def preprocess_input(text):
    text = text.strip()
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}', ' ', text)
    return [f'generate title: {text}']

def generate_title_from_summary(summary, top_p):
    inputs = preprocess_input(summary)
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt").to('cuda')
    output = model.generate(**inputs, do_sample=True, max_length=50, top_p=top_p, top_k=0)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
    return predicted_title

summary = """
The story begins with Geralt of Rivia, Crown Princess Ciri of Cintra,
and the sorceress Yennefer of Vengerberg at different points of time,
exploring formative events that shape their characters throughout the first season,
before eventually merging into a single timeline.
Geralt and Ciri are linked by destiny since before she was born
when he unknowingly demanded her as a reward for his services by invoking the Law of Surprise.
After the two finally meet, Geralt becomes the princess's protector
and must help her and fight against her various pursuers to prevent her Elder Blood
and powerful magic from being used for malevolent purposes and keep Ciri and their world safe.
"""

demo = gr.Interface(
    fn=generate_title_from_summary,
    inputs=[gr.Textbox(value=summary.strip(), lines=3), gr.Slider(0, 1)],
    outputs=["text"],
)
demo.launch()

Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x1bf800851c0>, 'http://127.0.0.1:7869/', None)