In [1]:
!pip install pandas transformers
!pip install pandas torch
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.6 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import random

In [3]:
# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load your full CSV data
data = pd.read_csv('/content/drive/MyDrive/Datasets/train.csv')
data.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [6]:
# Display basic information about the dataset
rows, cols = data.shape
print(f"There are {rows} rows and {cols} columns in the dataset")
print(f"There are {data.duplicated().sum()} duplicate values")
data.describe().transpose()
data.info()
data.isna().sum()

There are 287113 rows and 3 columns in the dataset
There are 0 duplicate values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287113 entries, 0 to 287112
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          287113 non-null  object
 1   article     287113 non-null  object
 2   highlights  287113 non-null  object
dtypes: object(3)
memory usage: 6.6+ MB


id            0
article       0
highlights    0
dtype: int64

In [7]:
# Randomly sample 500 rows from dataset
sampled_data = data.sample(n=500, random_state=42)  # Adjust random_state for reproducibility

In [8]:
# Preprocess data
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Tokenize articles and highlights while truncating/padding
max_article_length = 512  # can be adjust this as needed
max_summary_length = 128  # can be adjust this as needed

tokenized_articles = []
attention_masks = []

for article in sampled_data['article']:
    tokens = tokenizer.encode("summarize: " + article, add_special_tokens=True, truncation=True, max_length=max_article_length, padding='max_length', return_tensors="pt")
    tokenized_articles.append(tokens.to(device))
    attention_mask = (tokens != tokenizer.pad_token_id).long().to(device)
    attention_masks.append(attention_mask)

tokenized_summaries = []
for highlight in sampled_data['highlights']:
    tokens = tokenizer.encode(highlight, add_special_tokens=True, truncation=True, max_length=max_summary_length, padding='max_length', return_tensors="pt")
    tokenized_summaries.append(tokens.to(device))

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Prepare input data
input_ids = torch.cat(tokenized_articles, dim=0)
labels = torch.cat(tokenized_summaries, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Define a custom dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Define a DataLoader with the desired batch size
batch_size = 8  # Adjust the batch size
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Fine-tune a T5 model for summarization on the GPU
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [11]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_perplexity = 0  # Initialize perplexity
    num_batches = 0

    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Calculate perplexity (cross-entropy per word)
        perplexity = torch.exp(loss)
        total_perplexity += perplexity.item()
        num_batches += 1

    avg_loss = total_loss / len(dataloader)
    avg_perplexity = total_perplexity / num_batches

    print(f"Epoch {epoch + 1} - Avg Loss: {avg_loss}, Avg Perplexity: {avg_perplexity:.2f}")

Epoch 1 - Avg Loss: 7.489215964362735, Avg Perplexity: 5403.28
Epoch 2 - Avg Loss: 3.985093037287394, Avg Perplexity: 92.53
Epoch 3 - Avg Loss: 2.4192748618504356, Avg Perplexity: 11.95
Epoch 4 - Avg Loss: 2.0420859351990717, Avg Perplexity: 7.91
Epoch 5 - Avg Loss: 1.9089927351663982, Avg Perplexity: 6.98
Epoch 6 - Avg Loss: 1.8322837314908467, Avg Perplexity: 6.44
Epoch 7 - Avg Loss: 1.7658332870120095, Avg Perplexity: 6.00
Epoch 8 - Avg Loss: 1.6835504789201041, Avg Perplexity: 5.49
Epoch 9 - Avg Loss: 1.6137797681112138, Avg Perplexity: 5.16
Epoch 10 - Avg Loss: 1.5779474368171087, Avg Perplexity: 4.96


In [12]:
# Save the fine-tuned model
model.save_pretrained('fine_tuned_t5_summarizer')

In [13]:
# Inference using the fine-tuned model (GPU)
model = T5ForConditionalGeneration.from_pretrained('fine_tuned_t5_summarizer').to(device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [14]:
# Get user input
user_input = input("Enter the article or text you want to summarize: ")

# Tokenize and preprocess the user input
tokenized_input = tokenizer.encode("summarize: " + user_input, add_special_tokens=True, truncation=True, max_length=max_article_length, padding='max_length', return_tensors="pt").to(device)
attention_mask = (tokenized_input != tokenizer.pad_token_id).long().to(device)

# Generate the summary
with torch.no_grad():
    summary_ids = model.generate(tokenized_input, attention_mask=attention_mask, max_length=max_summary_length, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode the predicted summary
predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summarized response
print("\nSummarized Response:")
print(predicted_summary)

Enter the article or text you want to summarize: A state in southern India is taking measures to contain an outbreak of the Nipah virus after two people died from the rare and often deadly disease, shutting schools and testing hundreds to prevent its spread.  Kerala’s chief minister Pinarayi Vijayan said the virus has been detected in the state’s Kozhikode district, urging residents to exercise caution and follow the health department’s safety guidelines.

Summarized Response:
Kerala’s chief minister Pinarayi Vijayan said the virus has been detected in the state’s Kozhikode district. urging residents to exercise caution and follow the health department’s safety guidelines.
