In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Load the train_split and validation_split datasets
train_split = pd.read_csv('train_split.csv')
validation_split = pd.read_csv('validate_split.csv')

# Display the first few rows of the train_split dataset
print("First few rows of the train_split dataset:")
print(train_split.head())

# Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = str(self.data.iloc[idx]['cleaned_article'])
        target_text = str(self.data.iloc[idx]['cleaned_highlights'])

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten(),
            'decoder_attention_mask': target['attention_mask'].flatten()
        }




First few rows of the train_split dataset:
                                         id  \
0  2cec2d38927631aa4816ccbed4e9247db4a2df9a   
1  d00aafa6ec89ff519343e72aed8ebf19aac7ca99   
2  751f6154994bf71e246ee9197ba7d099f2005dac   
3  4c0a8ea9ad876034fd5811f51d0ca8898ed61e7f   
4  44fff8cb04369c017e712f812706d4cfa9d46d27   

                                     cleaned_article  \
0  britain buck trend western nation increas spen...   
1  dr dorin scladan remov 11lb 5kg growth woman b...   
2  unpreced move russia cancel releas hollywood t...   
3  water main break new york citi caus even commu...   
4  cancer patient told larg tumour underneath lef...   

                                  cleaned_highlights  
0  half member group industri nation reduc aid br...  
1  warn graphic content madalina neagu 42 arriv r...  
2  russian author cancel releas hollywood child 4...  
3  12inch main broke manhattan 7pm send water cas...  
4  doctor memori sloan ketter cancer center new y...  


In [3]:
# Load the fine-tuned model and TokeniZer
model_path = 'fine_tuned_t5_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model And tokenizer initialized.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model And tokenizer initialized.


In [4]:
# Initialize the dataset and data loader
train_dataset = TextDataset(train_split, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

validation_dataset = TextDataset(validation_split, tokenizer)
validation_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

# Display a sample batch of tokenized data
sample_batch = next(iter(train_loader))
print("Sample batch of tokenized data:")
print("Input IDs:", sample_batch['input_ids'][0])
print("Attention Mask:", sample_batch['attention_mask'][0])
print("Labels:", sample_batch['labels'][0])
print("Decoder Attention Mask:", sample_batch['decoder_attention_mask'][0])




Sample batch of tokenized data:
Input IDs: tensor([ 1798,   765,    49,  3476,    23,  1614, 24416,  6949, 30139,  9953,
         1798,     3, 14312,   122,  2832,   333, 14193,     3, 16812,     3,
        20868,  3136,  6957, 16545,     3,   189,  3589,  1135,  2760,  2307,
        19568,  3157,  4915,     3,    15,   159,    35,  5842,   253,  1798,
          131,   447,  1614,     3, 14312,   122,     3,  2935,    23,   122,
         1078,    63, 27857,   269,  4081,  1822,     3, 20868,   435,  1078,
           63,     3,    40,    23, 15403,   997, 19568,   836,  7493,  3476,
           23,     3,    40,    23, 15403,   880,  1078,    63,  2832,  4101,
            3,    23,    26,   333, 10515,  2093, 11457,   580,     3,    15,
          159,    35,  5842,     3, 22310,  1063,   589,     3,     9, 20580,
         1078,    63,     3, 14312,   122,   765,    49,  3476,    23,   131,
          447,  1614, 13480,  2735,   126,     3,    32,   122,   537,   131,
          447,  1614,

In [5]:
# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)




T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
from tqdm import tqdm


epochs = 1   

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1} - Training', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}')
    
    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(validation_loader, desc=f'Epoch {epoch+1} - Validation', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['decoder_attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)
            loss = outputs.loss

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(validation_loader)
    print(f'Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}')

print("Training complete.")

# Save the fine-tuned model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print("Model saved.")


                                                                                                                                    

Epoch 1 - Training Loss: 1.8424


                                                                                                                                    

Epoch 1 - Validation Loss: 1.9955
Training complete.
Model saved.
