In [1]:
# Import necessary libraries
import pandas as pd  # pandas for data manipulation
import torch  # PyTorch for deep learning
from torch.utils.data import DataLoader, Dataset  # DataLoader and Dataset for handling data
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW  # T5 model and AdamW Optimizer from Hugging Face
from tqdm import tqdm  # tqdm for progress bars

# Load the train and validation data
train_data = pd.read_csv('train_data.csv')  # Load training data 
val_data = pd.read_csv('validate_data.csv')  # Load validation data 

In [2]:
# Define a  dataset class
class MyDataset(Dataset):  #Mydataset class inheriting from torch.utils.data.Dataset
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=150):  # Constructor
        self.data = data  # Store the data
        self.tokenizer = tokenizer  # Store the tokenizer
        self.max_input_length = max_input_length  # Maximum length of input sequences
        self.max_output_length = max_output_length  # Maximum length of output sequences

    def __len__(self):  # Length method for the dataset
        return len(self.data)  # Return the length of the data

    def __getitem__(self, idx):  # Get item method for the dataset
        article = self.data.iloc[idx]['article']  # Get the article at the specified index
        highlight = self.data.iloc[idx]['highlight']  # Get the highlight at the specified index

        inputs = self.tokenizer.encode_plus(  # Tokenize the article
            article,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer.encode_plus(  # Tokenize the highlight
            highlight,
            max_length=self.max_output_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
             #The flatten() method is used to convert a tensor into a 1-dimensional tensor.
            'input_ids': inputs.input_ids.flatten(),  # Flatten the input IDs tensor 
            'attention_mask': inputs.attention_mask.flatten(),  # Flatten the attention mask tensor
            'labels': targets.input_ids.flatten()  # Flatten the target labels tensor
        }


In [3]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('fine_tuning')  # reLoad the tokenizer
model = T5ForConditionalGeneration.from_pretrained('fine_tuning')  # reLoad the model

# Create the dataset and dataloader with  batch size
train_dataset = MyDataset(train_data, tokenizer)  # Create a dataset for training data
val_dataset = MyDataset(val_data, tokenizer)  # Create a dataset for validation data
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)  # DataLoader for training data
val_loader = DataLoader(val_dataset, batch_size=20, shuffle=False)  # DataLoader for validation data

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Define the optimizer with the  learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)  # AdamW optimizer with a learning rate of 5e-5 for updating the weights of the model during training.

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available, else use CPU
model.to(device)  # Move the model to the selected device




T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [5]:
# Training loop
epochs = 1  # Number of epochs
for epoch in range(epochs):  # Loop over each epoch
    model.train()  # Set model to training mode
    train_loss = 0  # Initialize training loss
    loop = tqdm(train_loader, leave=True)  # tqdm progress bar for training data
    for batch in loop:  # Loop over each batch in the training data
        input_ids = batch['input_ids'].to(device)  # Move input IDs to the selected device
        attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the selected device
        labels = batch['labels'].to(device)  # Move labels to the selected device

        optimizer.zero_grad()  # Zero the gradients
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs.loss  # Get the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update the model parameters

        train_loss += loss.item()  # Accumulate the training loss
        loop.set_description(f'Epoch {epoch+1}')  # Update the progress bar description
        loop.set_postfix(train_loss=loss.item())  # Update the progress bar with the current loss

    avg_train_loss = train_loss / len(train_loader)  # Calculate average training loss
    print(f'Average training loss: {avg_train_loss}')  # Print average training loss

    model.eval()  # Set model to evaluation mode
    val_loss = 0  # Initialize validation loss
    with torch.no_grad():  # Disable gradient calculations for validation
        for batch in val_loader:  # Loop over each batch in the validation data
            input_ids = batch['input_ids'].to(device)  # Move input IDs to the selected device
            attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the selected device
            labels = batch['labels'].to(device)  # Move labels to the selected device

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
            loss = outputs.loss  # Get the loss
            val_loss += loss.item()  # Accumulate the validation loss

    avg_val_loss = val_loss / len(val_loader)  # Calculate average validation loss
    print(f'Validation loss: {avg_val_loss}')  # Print average validation loss

# Save the model and tokenizer
model.save_pretrained('fine_tuning')  # Save the model
tokenizer.save_pretrained('fine_tuning')  # Save the tokenizer

print("Training completed and model saved to fine_tuning.")  # Print completion message


Epoch 1: 100%|███████████████████████████████████████████████████████████████| 403/403 [10:07:40<00:00, 90.47s/it, train_loss=0.639]


Average training loss: 1.0212344710643475
Validation loss: 0.9379693399900677
Training completed and model saved to fine_tuning.
