In [2]:
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import pandas as pd
from datasets import Dataset
import random

In [3]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-small')
base_tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
reviews = pd.read_csv('../../data/reviews.csv')

def add_punc(s):
    if s[-1] not in ('.', '!', '?'):
        s = s + '.'
    return s

reviews.dropna(inplace=True)

reviews['Summary'] = reviews['Summary'].map(add_punc)

print(reviews.shape)

reviews.head()

(96486, 3)


Unnamed: 0,Text,Summary,Score
0,Great taffy at a great price. There was a wid...,Great taffy.,5
1,This taffy is so good. It is very soft and ch...,"Wonderful, tasty taffy.",5
2,Right now I'm mostly just sprouting this so my...,Yay Barley.,5
3,This is a very healthy dog food. Good for thei...,Healthy Dog Food.,5
4,good flavor! these came securely packed... the...,fresh and greasy!,4


In [6]:
reviews = reviews[(reviews['Summary'].str.len() < 100) & (reviews['Summary'].str.len() >=30)]

reviews.shape

(13073, 3)

In [7]:
random.seed(0)

reviews_dataset = Dataset.from_pandas(reviews.astype(str).sample(5000))

In [8]:
prefix = "summarize: "

# we will manually add our own labels because unlike GPT, we cannot assume the labels are based on the inputs
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["Text"]]
    model_inputs = base_tokenizer(inputs, max_length=1024, truncation=True)

    labels = base_tokenizer(examples["Summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
tokenized_reviews_dataset = reviews_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 5000/5000 [00:00<00:00, 10448.75 examples/s]


In [23]:
tokenized_reviews_dataset # if you check the input_ids, you will see all the ids staring with 21603 which is teh id for Sumamrize as we are adding prefix summarize to all data which helps T5 understand the task at hand

Dataset({
    features: ['Text', 'Summary', 'Score', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [24]:
tokenized_reviews_dataset = tokenized_reviews_dataset.train_test_split(test_size=.1)

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer=base_tokenizer, model=base_model)

In [13]:
training_args = TrainingArguments(
    output_dir="./t5_summary_results",
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    load_best_model_at_end=True,
    logging_steps=50,
    save_strategy='epoch'
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_reviews_dataset["train"],
    eval_dataset=tokenized_reviews_dataset["test"],
    data_collator=data_collator,
)

trainer.evaluate()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


[34m[1mwandb[0m: Currently logged in as: [33mniketgirdhar2004[0m ([33mniketgirdhar2004-vit-chennai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 4.5269246101379395,
 'eval_model_preparation_time': 0.0009,
 'eval_runtime': 3.3077,
 'eval_samples_per_second': 151.162,
 'eval_steps_per_second': 4.837}

In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss,Model Preparation Time
1,3.7485,3.43019,0.0009
2,3.5297,3.348232,0.0009
3,3.4351,3.28368,0.0009
4,3.3286,3.240723,0.0009
5,3.2781,3.201202,0.0009
6,3.2903,3.174802,0.0009
7,3.2328,3.147642,0.0009
8,3.1591,3.128328,0.0009
9,3.1374,3.112822,0.0009
10,3.1014,3.098534,0.0009


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2820, training_loss=3.151630442193214, metrics={'train_runtime': 1244.5448, 'train_samples_per_second': 72.316, 'train_steps_per_second': 2.266, 'total_flos': 1240979641073664.0, 'train_loss': 3.151630442193214, 'epoch': 20.0})

In [15]:
trainer.evaluate()



{'eval_loss': 3.0376880168914795,
 'eval_model_preparation_time': 0.0009,
 'eval_runtime': 1.5303,
 'eval_samples_per_second': 326.728,
 'eval_steps_per_second': 10.455,
 'epoch': 20.0}

In [16]:
trainer.save_model()

In [17]:
loaded_model = T5ForConditionalGeneration.from_pretrained('./t5_summary_results')

# summarization pipeline prepends a default prefix of summarize: 
generator = pipeline(
    'summarization', model=loaded_model, tokenizer=base_tokenizer
)

Device set to use mps:0


In [18]:
sam = reviews.sample(1)

print(sam['Summary'])

text = sam['Text'].tolist()[0]
text

74105    Great Coffee at a great value.
Name: Summary, dtype: object


'I am a coffee fanatic.  This coffee is delicious!  As good if not better than other brands, and the price is reasonable.'

In [19]:
# Generating a summary
generator(text, min_length=3, max_length=15, early_stopping=True, num_beams=2)

Both `max_new_tokens` (=256) and `max_length`(=15) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'summary_text': 'Great coffee for a great price.'}]

In [20]:
# Try the base t5 on the same text
base_generator = pipeline(
    'summarization', model='t5-small', tokenizer='t5-small'
)

# Summary is a bit more extractive than our fine-tuned version and style isn't quite the same as our dataset
base_generator(text, min_length=3, max_length=15, early_stopping=True, num_beams=2)

Device set to use mps:0
Both `max_new_tokens` (=256) and `max_length`(=15) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'summary_text': 'a coffee fanatic, this coffee is delicious!'}]