In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
import datasets

ModuleNotFoundError: No module named 'datasets'

In [3]:
from datasets import load_dataset

# Working corpus: the leading 20% slice of the XSum train split concatenated
# with the leading 20% slice of its test split.
split_spec = "train[:20%]+test[:20%]"
xsum = load_dataset("xsum", split=split_spec)
print(xsum)

ModuleNotFoundError: No module named 'datasets'

In [5]:
# Hold out 25% of the records for evaluation. The split shuffles the data, so
# pin the seed to get the same train/test membership on every run.
xsum = xsum.train_test_split(test_size=0.25, seed=42)

In [6]:
# Peek at one raw training record to see the available fields.
sample_record = xsum["train"][0]
sample_record

{'document': 'A specialist animal rescue advisor and firefighters released the horse, named Spencer, after his leg broke through the floor of the trailer in Droxford, Hampshire, on Sunday afternoon.\nThe horse was anesthetised before the trailer wheels were removed.\nHampshire Fire and Rescue said it was "complicated rescue" but Spencer had suffered no lasting damage.',
 'summary': 'A horse has been freed from a transporter after his hind leg was trapped between the rear wheels.',
 'id': '35628962'}

In [7]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the "t5-small" checkpoint.
checkpoint_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# Task prefix prepended to every source document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of records into model inputs and labels.

    Args:
        examples: Batch mapping with a "document" list (source texts) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    # Turn each newline into a space. Deleting newlines outright glues the
    # sentence before and after it together ("afternoon.The horse"), which
    # corrupts the text at every sentence boundary.
    inputs = [prefix + doc.strip().replace("\n", " ") for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target` makes the tokenizer encode the summaries as targets.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Run the preprocessing over the dataset, handing it batches of records
# instead of one record at a time.
tokenized_xsum = xsum.map(
    preprocess_function,
    batched=True,
)

  0%|          | 0/33 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained "t5-small" checkpoint, then report which concrete class
# the Auto* factory resolved to.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
print('\nModel type:', type(model))

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]


Model type: <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>


In [11]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tuning hyper-parameters. Evaluation runs once per epoch, and
# save_total_limit caps how many checkpoints are kept under output_dir.
finetune_config = dict(
    output_dir="/content/drive/MyDrive/NLP Project/Summarize/results-summarisation",
    evaluation_strategy="epoch",
    num_train_epochs=7,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=3,
    fp16=True,
)
training_args = Seq2SeqTrainingArguments(**finetune_config)

# Wire the model, the tokenized splits and the collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_xsum["train"],
    eval_dataset=tokenized_xsum["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,2.8135,2.565406


Saving model checkpoint to /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-2500
Configuration saved in /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-2500/config.json
Model weights saved in /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-2500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-2500/special_tokens_map.json
Deleting older checkpoint [/content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-3000
Configuration saved in /content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-3000/config.json
Model we

In [None]:
# Show the training split.
train_split = xsum["train"]
train_split

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq

In [None]:
# Reload the tokenizer and the model from the same saved training checkpoint.
checkpoint_dir = "/content/drive/MyDrive/NLP Project/Summarize/results-summarisation/checkpoint-28000"
tokenizer_cus = AutoTokenizer.from_pretrained(checkpoint_dir)
model_cus = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

In [None]:
# Summarization pipeline backed by the reloaded checkpoint; max_length=85 is
# passed through as the pipeline's length setting.
summarize = pipeline(
    "summarization",
    model=model_cus,
    tokenizer=tokenizer_cus,
    max_length=85,
)

In [None]:
# Sample passage (a short biographical paragraph) fed to the summarizer below.
text="""Musk was born and grew up in Pretoria, South Africa. He attended the University of Pretoria before moving to Canada at age 17, acquiring citizenship through his Canadian-born mother. Two years later, he matriculated at Queen's University and transferred to the University of Pennsylvania, where he received bachelor's degrees in economics and physics. He moved to California in 1995 to attend Stanford University but decided to instead pursue a business career, co-founding the web software company Zip2 with his brother, Kimbal. The startup was acquired by Compaq for $307 million in 1999. The same year, Musk co-founded the online bank X.com, which merged with Confinity in 2000 to form PayPal. eBay bought PayPal in 2002 for $1.5 billion. """

In [None]:
# Run the summarization pipeline on the sample passage.
summary = summarize(text)

In [None]:
# The pipeline result is indexed by position; show the text of the first entry.
first_result = summary[0]
first_result["summary_text"]