In [8]:
!pip install transformers torch datasets accelerate



In [9]:
import pandas as pd
from datasets import load_dataset,DatasetDict

In [12]:
dataset = load_dataset("EdinburghNLP/xsum")

new_dataset = DatasetDict({
    "train": dataset['train'].shuffle(seed = 42).select(range(30000)),
    "validation": dataset['validation'].shuffle(seed = 43).select(range(2000)),
    "test":dataset['test'].shuffle(seed = 43).select(range(2000)),
})


In [25]:
from transformers import AutoTokenizer,T5ForConditionalGeneration

checkpoint = "google-t5/t5-small"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [27]:
tokenized_xsum = new_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [28]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [32]:
from transformers import TrainingArguments,Trainer

training_args = TrainingArguments(
    output_dir="./t5_xsum_pretrained_1.0",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=10_000,
    fp16=True,
)



In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_xsum['train'],
    eval_dataset=tokenized_xsum['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [34]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
10000,2.7696,2.520031
20000,2.7845,2.476628
30000,2.7204,2.461113


TrainOutput(global_step=30000, training_loss=2.7509970703125, metrics={'train_runtime': 2346.2451, 'train_samples_per_second': 12.786, 'train_steps_per_second': 12.786, 'total_flos': 3799347300335616.0, 'train_loss': 2.7509970703125, 'epoch': 1.0})

In [35]:
model.save_pretrained("./T5_xsum_pretrained1")
tokenizer.save_pretrained("./T5_xsum_pretrained1")

('./T5_xsum_pretrained1/tokenizer_config.json',
 './T5_xsum_pretrained1/special_tokens_map.json',
 './T5_xsum_pretrained1/spiece.model',
 './T5_xsum_pretrained1/added_tokens.json',
 './T5_xsum_pretrained1/tokenizer.json')

In [36]:
import shutil
import os
os.makedirs("./models", exist_ok=True)
shutil.make_archive("T5_model", 'zip', "./T5_xsum_pretrained1")

'/content/T5_model.zip'

In [37]:
checkpoint = "./T5_xsum_pretrained1"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples["document"],
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    labels = tokenizer(
        examples["summary"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

In [43]:
tokenized_validation = new_dataset['validation'].select(range(10)).map(preprocess_function, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [44]:
predictions = []
references = []
model.eval()
for example in tokenized_validation:
    inputs = tokenizer(
        example["document"], return_tensors="pt", max_length=4096, truncation=True
    )
    outputs = model.generate(inputs["input_ids"], max_length=128, num_beams=4)
    predicted_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(predicted_summary)
    references.append(example["summary"])  # Replace with your reference column


In [45]:
!pip install rouge_score
!pip install evaluate
from rouge_score import rouge_scorer
import evaluate

rouge = evaluate.load("rouge")

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ac2fdc1791fac32f6bdcba933ed0629b5d8fc81ff5134e79dc02679ecaa06145
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [46]:
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.28883674112026325, 'rouge2': 0.09951231143397607, 'rougeL': 0.23737622613938147, 'rougeLsum': 0.2388364719556295}
