**Mounting Google Drive**

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


**importing  libraries**

In [2]:
# Import pandas library for data manipulation
import pandas as pd

# Import torch library for PyTorch functionalities
import torch

# Importing DataLoader and Dataset from torch.utils.data enables creating my datasets and data loaders for efficient batch processing in PyTorch training loops.
from torch.utils.data import DataLoader, Dataset

# Import classes from transformers library for using T5 model and AdamW optimizer
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Import tqdm for displaying progress bars during training
from tqdm import tqdm


**Loading the train and validation data**

In [3]:
# Load the train and validation data
train_data = pd.read_csv('/content/drive/My Drive/train_data1.csv')
val_data = pd.read_csv('/content/drive/My Drive/validate_data.csv')


**Defining the Mydataset class**

In [4]:

class MyDataset(Dataset):

    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=150):#constructor of the MyDataset class, responsible for dataset handling and preparation for model training.
        self.data = data  # Initialize with the provided data
        self.tokenizer = tokenizer  # Set the tokenizer for encoding text
        self.max_input_length = max_input_length  # Maximum length for input sequences
        self.max_output_length = max_output_length  # Maximum length for output sequences

    def __len__(self):
        return len(self.data)  # Return the total number of samples in the dataset

    def __getitem__(self, idx): # responsible for retrieving a specific sample from your dataset at the given index (idx).
        article = self.data.iloc[idx]['article']  # Get the article text at the specified index
        highlight = self.data.iloc[idx]['highlight']  # Get the highlight (target summary) text at the specified index

        # Encode the article text using the tokenizer, ensuring it fits within max_input_length
        inputs = self.tokenizer.encode_plus(
            article,
            max_length=self.max_input_length,
            padding='max_length',  # Pad to ensure all inputs are the same length
            truncation=True,  # Truncate if the text exceeds max_length
            return_tensors="pt"  # Return PyTorch tensors
        )

        # Encode the highlight text using the tokenizer, ensuring it fits within max_output_length
        targets = self.tokenizer.encode_plus(
            highlight,
            max_length=self.max_output_length,
            padding='max_length',  # Pad to ensure all targets are the same length
            truncation=True,  # Truncate if the text exceeds max_length
            return_tensors="pt"  # Return PyTorch tensors
        )

        # Return a dictionary containing the input_ids, attention_mask, and labels
        #flatten()to ensure that the tensors representing input_ids,attention_mask and labels are in a suitable format for training
        return {
            'input_ids': inputs.input_ids.flatten(),  # Flattened input_ids tensor
            'attention_mask': inputs.attention_mask.flatten(),  # Flattened attention_mask tensor
            'labels': targets.input_ids.flatten()  # Flattened labels (target sequences) tensor
        }

    def display_data_size(self):
        print(f"Dataset size: {len(self.data)}")

    def get_sample(self, idx):
        return self.__getitem__(idx)


**Initializing the tokenizer and model**

In [5]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/fine_tuning')
model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/fine_tuning')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Creating the dataset and dataloaders**

In [6]:
# Create the dataset using the myDataset class for training data
train_dataset = MyDataset(train_data, tokenizer)

# Create the dataset using the myDataset class for validation data
val_dataset = MyDataset(val_data, tokenizer)

# Create a DataLoader for training dataset with batch size 8 and shuffle enabled (for random sampling)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Create a DataLoader for validation dataset with batch size 8 and shuffle disabled (for sequential sampling)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


**Defining the optimizer and setting device**

In [7]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=6e-5)

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)




T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

**Initializing early stopping parameters**

In [8]:
# Early stopping parameters
patience = 3  # Number of epochs with no improvement after which training will be stopped
best_val_loss = float('inf')  # Initialize the best validation loss to infinity
epochs_no_improve = 0  # Counter for epochs with no improvement


# TRAINING LOOP

In [9]:
# Training loop for a single epoch
epochs = 1  # Maximum number of epochs to train (here only 1 epoch)
for epoch in range(epochs):

    # Set the model in training mode
    model.train()

    # Initialize the total training loss for this epoch
    train_loss = 0

    # Initialize tqdm loop to display progress bar during training
    loop = tqdm(train_loader, leave=True)

    # Iterate through each batch in the training DataLoader
    for batch in loop:
        # Move batch tensors to the appropriate device (CPU or GPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear previously calculated gradients
        optimizer.zero_grad()

        # Forward pass: compute model outputs and loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass: compute gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Accumulate the training loss
        train_loss += loss.item()

        # Update tqdm progress bar description and postfix with current loss
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(train_loss=loss.item())

    # Calculate average training loss for this epoch
    avg_train_loss = train_loss / len(train_loader)

    # Print average training loss for this epoch
    print(f'Average training loss: {avg_train_loss}')


Epoch 1: 100%|██████████| 1437/1437 [09:24<00:00,  2.54it/s, train_loss=1.14]

Average training loss: 0.7568697169288298





***display training and validatin loss and save the model and tokenizer***

In [10]:
# Training loop
epochs = 1  # Maximum number of epochs to train
for epoch in range(epochs):
    model.train()
    train_loss = 0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(train_loss=loss.item())

    avg_train_loss = train_loss / len(train_loader)
    print(f'Average training loss: {avg_train_loss}')

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Validation loss: {avg_val_loss}')

    # Check if the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save the model and tokenizer
        model.save_pretrained('/content/drive/My Drive/fine_tuning')
        tokenizer.save_pretrained('/content/drive/My Drive/fine_tuning')
        print("Model improved. Saving the model.")
    else:
        epochs_no_improve += 1
        print(f'No improvement for {epochs_no_improve} epochs.')


    # Check if early stopping should be triggered
    if epochs_no_improve == patience:
        print("Early stopping triggered. Stopping training.")
        break

print("Training completed.")


Epoch 1: 100%|██████████| 1437/1437 [09:18<00:00,  2.57it/s, train_loss=0.72]


Average training loss: 0.7505269895308367
Validation loss: 0.6532170535237701
Model improved. Saving the model.
Training completed.


increasing epochs

In [11]:
# Training loop
epochs = 2  # Maximum number of epochs to train
for epoch in range(epochs):
    model.train()
    train_loss = 0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(train_loss=loss.item())

    avg_train_loss = train_loss / len(train_loader)
    print(f'Average training loss: {avg_train_loss}')

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Validation loss: {avg_val_loss}')

    # Check if the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save the model and tokenizer
        model.save_pretrained('/content/drive/My Drive/fine_tuning')
        tokenizer.save_pretrained('/content/drive/My Drive/fine_tuning')
        print("Model improved. Saving the model.")
    else:
        epochs_no_improve += 1
        print(f'No improvement for {epochs_no_improve} epochs.')


    # Check if early stopping should be triggered
    if epochs_no_improve == patience:
        print("Early stopping triggered. Stopping training.")
        break

print("Training completed.")


Epoch 1: 100%|██████████| 1437/1437 [09:23<00:00,  2.55it/s, train_loss=0.934]


Average training loss: 0.7435482833106731
Validation loss: 0.6416343261522276
Model improved. Saving the model.


Epoch 2: 100%|██████████| 1437/1437 [09:17<00:00,  2.58it/s, train_loss=0.607]


Average training loss: 0.7372110169134625
Validation loss: 0.6327376671963267
Model improved. Saving the model.
Training completed.


# **EVALUATION**

**Install the rouge-score package**

In [12]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=166adcc202227fd8d5996d906aa9de6450901b036b521066073987dd834cd6ee
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


## **ImportLibraries**

In [13]:


import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
from rouge_score import rouge_scorer

# Define a custom dataset class
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = self.data.iloc[idx]['article']
        highlight = self.data.iloc[idx]['highlight']

        inputs = self.tokenizer.encode_plus(
            article,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer.encode_plus(
            highlight,
            max_length=self.max_output_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs.input_ids.flatten(),
            'attention_mask': inputs.attention_mask.flatten(),
            'labels': targets.input_ids.flatten()
        }

# Function to calculate ROUGE scores
def calculate_rouge_scores(hypotheses, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for hyp, ref in zip(hypotheses, references):
        scores = scorer.score(hyp, ref)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

    return avg_rouge1, avg_rouge2, avg_rougeL

# Load the validation data
val_data = pd.read_csv('/content/drive/My Drive/validate_data.csv')

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/fine_tuning')
model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/fine_tuning')

# Create the dataset and dataloader
val_dataset = MyDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set model to evaluation mode
model.eval()
# Initialize empty lists to store generated summaries and target summaries
hypotheses = []
references = []
# Disable gradient calculation for inference
with torch.no_grad():
  # Iterate over batches in the validation data loader with progress bar
    for batch in tqdm(val_loader):
      # Move batch tensors to the appropriate device (GPU if available)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Generate summaries
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=2, early_stopping=True)
         # Decode generated summaries and target summaries to text
        preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        targets = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=True) for l in labels]
 # Extend the hypotheses and references lists with the generated and target summaries
        hypotheses.extend(preds)
        references.extend(targets)

# Calculate ROUGE scores
rouge1, rouge2, rougeL = calculate_rouge_scores(hypotheses, references)

print(f'ROUGE-1: {rouge1:.4f} ROUGE-2: {rouge2:.4f} ROUGE-L: {rougeL:.4f}')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 216/216 [06:09<00:00,  1.71s/it]


ROUGE-1: 0.3970 ROUGE-2: 0.2024 ROUGE-L: 0.3125


**example**

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load validation data (assuming 'val_data.csv' contains 'article' and 'highlight' columns)
val_data = pd.read_csv('/content/drive/My Drive/validate_data.csv')

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/fine_tuning')
model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/fine_tuning')

# Set device to GPU if available, otherwise to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to generate summaries
def generate_summary(article_text, tokenizer, model):
    inputs = tokenizer.encode_plus(
        article_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Generate summary
    summary_ids = model.generate(
        inputs.input_ids.to(device),
        attention_mask=inputs.attention_mask.to(device),
        max_length=150,
        num_beams=2,
        early_stopping=True
    )

    # Decode the summary tokens to text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return summary

# Select two records for demonstration
record1 = val_data.iloc[0]
record2 = val_data.iloc[1]

# Generate summaries
summary1 = generate_summary(record1['article'], tokenizer, model)
summary2 = generate_summary(record2['article'], tokenizer, model)

# Display the results
print("Record 1:")
print("Article:")
print(record1['article'])
print("\nHighlight:")
print(record1['highlight'])
print("\nGenerated Summary:")
print(summary1)

print("\n-------------------------------------------------\n")

print("Record 2:")
print("Article:")
print(record2['article'])
print("\nHighlight:")
print(record2['highlight'])
print("\nGenerated Summary:")
print(summary2)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Record 1:
Article:
brendan rodgers deliver weekly briefing media friday afternoon previewing showdown arsenal saturday discussing future raheem sterling young england star moved step closer anfield exit revealing tv interview ready sign new contract anfield sterling 20 offered new 100000aweek contract stay club admitted flattered interest arsenal follow press conference happens 2pm host commentator brendan rodgers brought close broadcast section press conference rest quotes embargoed tonight summary rodgers insisted raheem sterling going anywhere summer focus moment purely football manager said relaxed contract situation liverpool superpower world football dont sell best players admit however sterlings interview conducted without prior consent club took surprise concentration weeks back purely focus football help continue development place hes made great strides last couple years last time sat plays representatives intention raheem come made feelings clear hopefully continue focus foot