<a href="https://colab.research.google.com/github/ShraddhaSharma24/Natural-Language-Processing/blob/main/Text_Summarization_using_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install transformers datasets
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import torch





In [13]:
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data = dataset['train'].select(range(1000))
val_data = dataset['validation'].select(range(200))


In [14]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(batch):
    input_texts = ["summarize: " + doc for doc in batch["article"]]
    target_texts = batch["highlights"]

    model_inputs = tokenizer(
        input_texts, max_length=512, truncation=True, padding="max_length"
    )

    labels = tokenizer(
        target_texts, max_length=128, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)


In [15]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')


In [16]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=100,
    weight_decay=0.01,
    fp16=True if torch.cuda.is_available() else False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,1.1064,0.822894


In [None]:
def summarize(text):
    input_ids = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).input_ids
    summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Try it
test_text = dataset['test'][0]['article']
print("Original:", test_text[:300])
print("\nSummary:", summarize(test_text))
