In [1]:
import torch
from datasets import load_dataset
from tqdm import tqdm
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler
from transformers import (AdamW,T5ForConditionalGeneration,T5Tokenizer,get_linear_schedule_with_warmup)

In [2]:
# Load the XSum dataset; later cells index it by split name (e.g. dataset['test']).
dataset = load_dataset('xsum')

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

Found cached dataset xsum (C:/Users/NIT/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

device(type='cuda')

In [3]:
# The tokenizer is the stock "t5-base" one; the weights are read from the
# "t5-base-finetuned-xsum" checkpoint (the same name the training cell saves to).
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# nn.Module.to() returns the module itself, so loading and moving can be chained.
model = T5ForConditionalGeneration.from_pretrained("t5-base-finetuned-xsum").to(device)
print('Moved model to', device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Moved model to cuda


In [4]:
def getDataLoader(input, batch_size=4, max_source_length=512, max_target_length=256):
    """Tokenize a summarization split and wrap it in a sequential DataLoader.

    Args:
        input: Mapping-like object with a 'document' and a 'summary' column,
            each a sequence of strings (e.g. one split of the loaded dataset).
        batch_size: Number of examples per batch. Defaults to 4, the value
            that used to be hard-coded.
        max_source_length: Token cap for the "summarize: "-prefixed documents.
            It is passed explicitly because the tokenizer warns that its
            implicit 512-token limit for t5-base must not be relied on when
            encoding with `truncation=True`.
        max_target_length: Token cap for the reference summaries.

    Returns:
        A DataLoader over a TensorDataset of
        (source_ids, source_mask, target_ids, target_mask), visited in the
        original example order (SequentialSampler, no shuffling).
    """
    # T5 is prompted with a task prefix in front of every document.
    prefixed_documents = ["summarize: " + sent for sent in input['document']]
    tokenized_inputs = tokenizer(
        prefixed_documents,
        padding=True,
        truncation=True,
        max_length=max_source_length,
        return_tensors="pt",
    )
    source_ids = tokenized_inputs['input_ids']
    source_mask = tokenized_inputs['attention_mask']

    tokenized_outputs = tokenizer(
        input['summary'],
        padding=True,
        truncation=True,
        max_length=max_target_length,
        return_tensors="pt",
    )
    target_ids = tokenized_outputs['input_ids']
    target_mask = tokenized_outputs['attention_mask']

    # Create a TensorDataset
    data = TensorDataset(source_ids, source_mask, target_ids, target_mask)

    # Create a data loader that keeps the examples in their original order,
    # so predictions line up index-for-index with input['summary'].
    sampler = SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

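# NOTE: the optimizer/training cells below read train_dataloader and test_dataloader.
# Uncomment the loaders you need before running those cells on a fresh kernel.
#train_dataloader = getDataLoader(dataset['train'])
#validation_dataloader = getDataLoader(dataset['validation'])
#test_dataloader = getDataLoader(dataset['test'])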

In [5]:
# Parameters whose name contains one of these substrings go into the second group.
no_decay = ["bias", "LayerNorm.weight"]

# Split the model parameters into the two groups in a single pass.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(tag in param_name for tag in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

# NOTE(review): both groups currently use weight_decay 0.0, so the split has
# no effect on optimisation as written.
optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": 0.0},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)


# Linear decay of the learning rate over every optimisation step, no warm-up.
# Requires train_dataloader to have been created beforehand.
num_epochs = 1
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
import gc
# Collect unreachable Python objects first: tensors that are only kept alive
# by garbage still own their CUDA blocks, and empty_cache() can only release
# blocks that are no longer in use.
gc.collect()
torch.cuda.empty_cache()

In [None]:
best_rouge_1 = 0
from rouge import Rouge

# Initialize ROUGE scorer
rouge_scorer = Rouge()

# Iterate over the configured epoch count: the scheduler's total step count and
# the progress messages are both derived from num_epochs, so the loop must be too.
for epoch in range(num_epochs):
    model.train()

    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        source_ids, source_mask, lm_labels, target_mask = batch
        # Mark padding positions in the labels with -100 so they are excluded
        # from the loss.
        lm_labels[lm_labels[:, :] == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()

        outputs = model(
            input_ids=source_ids,
            attention_mask=source_mask,
            labels=lm_labels,
            decoder_attention_mask=target_mask,
        )

        # With labels supplied, the first element of the model output is the loss.
        loss = outputs[0]
        loss.backward()

        total_loss += loss.item()
        optimizer.step()
        scheduler.step()

    # Compute the average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs} - loss: {avg_loss:.4f}')

    # Evaluate the model on the test set.
    # NOTE(review): the checkpoint kept below is chosen by test-set ROUGE-1;
    # consider selecting on the validation split instead.
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            batch = tuple(t.to(device) for t in batch)
            source_ids, source_mask, lm_labels, target_mask = batch
            outs = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                max_length=256,
            )

            predictions.extend(
                [tokenizer.decode(output, skip_special_tokens=True) for output in outs]
            )

    # Substitute a placeholder for hypotheses with no word content. Checking
    # only for the empty string is not enough: a later cell in this notebook
    # also has to patch "." predictions before scoring, so those are covered
    # here as well (anything made up solely of periods/spaces).
    predictions = [
        prediction if prediction.strip(". ") else "empty"
        for prediction in predictions
    ]
    rouge_scores = rouge_scorer.get_scores(predictions, dataset['test']['summary'])
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-1: {avg_rouge_1:.4f}')
    print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-2: {avg_rouge_2:.4f}')
    print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-l: {avg_rouge_l:.4f}')

    # Keep only the checkpoint with the best average ROUGE-1 F-score so far.
    if avg_rouge_1 > best_rouge_1:
        best_rouge_1 = avg_rouge_1
        model.save_pretrained("t5-base-finetuned-xsum")

In [39]:
# Re-score the predictions left over from the evaluation loop.
# Hypotheses that are empty or just "." are swapped for a placeholder before
# scoring. The previous version used `==` here, which only compared the value
# and never actually replaced anything.
for i in range(len(predictions)):
    if predictions[i] == '' or predictions[i] == '.':
        predictions[i] = "empty"

rouge_scores = rouge_scorer.get_scores(predictions, dataset['test']['summary'])
rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-1: {avg_rouge_1:.4f}')
print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-2: {avg_rouge_2:.4f}')
print(f'Epoch {epoch+1}/{num_epochs} - test Rougue-l: {avg_rouge_l:.4f}')

# Save the model when this score beats the best ROUGE-1 recorded so far.
if avg_rouge_1 > best_rouge_1:
    best_rouge_1 = avg_rouge_1
    model.save_pretrained("t5-base-finetuned-xsum")

Epoch 1/1 - test Rougue-1: 0.3414
Epoch 1/1 - test Rougue-2: 0.1260
Epoch 1/1 - test Rougue-l: 0.2832


In [None]:
# The fine-tuned model is available on the Hugging Face Hub as PavanNeerudu/t5-base-finetuned-xsum.

In [5]:
def gen_sentences(model, testDataLoader, max_length=256):
    """Generate one decoded summary per example in `testDataLoader`.

    NOTE(review): a later cell in this notebook defines `gen_sentences` again;
    keep a single definition so the two cannot drift apart.

    Args:
        model: Model exposing `eval()` and `generate(input_ids=...,
            attention_mask=..., max_length=...)`.
        testDataLoader: Iterable of 4-tuples of tensors
            (source_ids, source_mask, labels, target_mask); only the first two
            are used for generation.
        max_length: Generation length limit forwarded to `model.generate`
            (default 256, the value that used to be hard-coded).

    Returns:
        List of decoded strings in loader order. Any hypothesis that has no
        word content (empty, or made up only of periods/spaces) is replaced by
        the placeholder "empty"; the callers in this notebook otherwise have to
        patch such predictions by hand before ROUGE scoring.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(testDataLoader):
            batch = tuple(t.to(device) for t in batch)
            # Labels and target mask are part of the batch but unused here.
            source_ids, source_mask, lm_labels, target_mask = batch
            outs = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                max_length=max_length,
            )

            predictions.extend(
                [tokenizer.decode(output, skip_special_tokens=True) for output in outs]
            )
    return [
        prediction if prediction.strip(". ") else "empty"
        for prediction in predictions
    ]



In [None]:
def gen_sentences(model, testDataLoader, max_length=256):
    """Generate one decoded summary per example in `testDataLoader`.

    NOTE(review): an earlier cell in this notebook already defines
    `gen_sentences`; this later definition shadows it when the notebook is run
    top to bottom. Keep a single definition.

    Args:
        model: Model exposing `eval()` and `generate(input_ids=...,
            attention_mask=..., max_length=...)`.
        testDataLoader: Iterable of 4-tuples of tensors
            (source_ids, source_mask, labels, target_mask); only the first two
            are used for generation.
        max_length: Generation length limit forwarded to `model.generate`
            (default 256, the value that used to be hard-coded).

    Returns:
        List of decoded strings in loader order. Any hypothesis that has no
        word content (empty, or made up only of periods/spaces) is replaced by
        the placeholder "empty"; the callers in this notebook otherwise have to
        patch such predictions by hand before ROUGE scoring.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(testDataLoader):
            batch = tuple(t.to(device) for t in batch)
            # Labels and target mask are part of the batch but unused here.
            source_ids, source_mask, lm_labels, target_mask = batch
            outs = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                max_length=max_length,
            )

            predictions.extend(
                [tokenizer.decode(output, skip_special_tokens=True) for output in outs]
            )
    return [
        prediction if prediction.strip(". ") else "empty"
        for prediction in predictions
    ]



100%|████████████████████████████████████████████████████████████████████████████| 2834/2834 [2:07:15<00:00,  2.69s/it]


Perturbation: noNouns
Average ROUGE-1 Score: 0.21317073492569602
Average ROUGE-2 Score: 0.04910935556845253
Average ROUGE-L Score: 0.17636551457881022



100%|██████████████████████████████████████████████████████████████████████████████| 2834/2834 [59:29<00:00,  1.26s/it]


Perturbation: noVerbs
Average ROUGE-1 Score: 0.31446747838549044
Average ROUGE-2 Score: 0.104909923774699
Average ROUGE-L Score: 0.257287038255341



100%|██████████████████████████████████████████████████████████████████████████████| 2834/2834 [52:26<00:00,  1.11s/it]


Perturbation: noFirst
Average ROUGE-1 Score: 0.336372482385272
Average ROUGE-2 Score: 0.12269199370218474
Average ROUGE-L Score: 0.277757994858049



100%|██████████████████████████████████████████████████████████████████████████████| 2834/2834 [54:30<00:00,  1.15s/it]


Perturbation: noLast
Average ROUGE-1 Score: 0.34178809954677397
Average ROUGE-2 Score: 0.12620087503519975
Average ROUGE-L Score: 0.28308476877078603



 10%|████████                                                                       | 289/2834 [06:06<37:44,  1.12it/s]

In [None]:
from datasets import load_from_disk
pertNames = [ "swapText", "addText", "changeChar", "bias"]

from rouge import Rouge

# One scorer is created up front and reused for every perturbation.
rouge_scorer = Rouge()

for pert in pertNames:
    # Load the perturbed test split saved on disk and generate summaries for it.
    testDs = load_from_disk('../../Datasets/'+'xsum'+'test'+pert)
    testDataLoader = getDataLoader(testDs)
    predictions = gen_sentences(model, testDataLoader)
    # Guard against hypotheses with no word content (empty or only periods).
    # This replaces the former unconditional `predictions[5581] = "empty"`,
    # which overwrote that index for every perturbation whether or not the
    # prediction there was actually degenerate.
    predictions = [p if p.strip(". ") else "empty" for p in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, testDs['summary'])

    # Calculate average ROUGE F-scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

100%|██████████████████████████████████████████████████████████████████████████████| 2834/2834 [50:31<00:00,  1.07s/it]


Perturbation: swapText
Average ROUGE-1 Score: 0.33725192616600214
Average ROUGE-2 Score: 0.1228066742001371
Average ROUGE-L Score: 0.2782844099922448



 21%|███████████████▏                                                          | 581/2834 [1:57:43<11:55:58, 19.07s/it]

In [6]:
from datasets import load_from_disk
pertNames = [ "addText","changeChar", "bias"]

from rouge import Rouge

# One scorer is created up front and reused for every perturbation.
rouge_scorer = Rouge()

for pert in pertNames:
    # Load the perturbed test split saved on disk and generate summaries for it.
    testDs = load_from_disk('../../Datasets/'+'xsum'+'test'+pert)
    testDataLoader = getDataLoader(testDs)
    predictions = gen_sentences(model, testDataLoader)
    # Guard against hypotheses with no word content (empty or only periods).
    # This replaces the former unconditional `predictions[5581] = "empty"`,
    # which overwrote that index for every perturbation whether or not the
    # prediction there was actually degenerate.
    predictions = [p if p.strip(". ") else "empty" for p in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, testDs['summary'])

    # Calculate average ROUGE F-scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

100%|████████████████████████████████████████████████████████████████████████████| 2834/2834 [1:02:06<00:00,  1.31s/it]


Perturbation: addText
Average ROUGE-1 Score: 0.3255069008556106
Average ROUGE-2 Score: 0.11393425889900592
Average ROUGE-L Score: 0.26883626566024005



 26%|███████████████████▏                                                       | 723/2834 [1:05:35<3:11:31,  5.44s/it]


KeyboardInterrupt: 

In [14]:
from datasets import load_from_disk
pertNames = [ "changeChar"]

from rouge import Rouge

# One scorer is created up front and reused for every perturbation.
rouge_scorer = Rouge()

for pert in pertNames:
    # Load the perturbed test split saved on disk and generate summaries for it.
    # NOTE(review): this cell reads from 'Datasets/' while the sibling cells
    # use '../../Datasets/' — confirm which location is intended.
    testDs = load_from_disk('Datasets/'+'xsum'+'test'+pert)
    testDataLoader = getDataLoader(testDs)
    predictions = gen_sentences(model, testDataLoader)
    # Guard against hypotheses with no word content (empty or only periods).
    # This replaces the former unconditional `predictions[5581] = "empty"`,
    # which overwrote that index whether or not the prediction there was
    # actually degenerate.
    predictions = [p if p.strip(". ") else "empty" for p in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, testDs['summary'])

    # Calculate average ROUGE F-scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

100%|████████████████████████████████████████████████████████████████████████████| 2834/2834 [4:12:58<00:00,  5.36s/it]


Perturbation: changeChar
Average ROUGE-1 Score: 0.20109146975758316
Average ROUGE-2 Score: 0.048934469958334316
Average ROUGE-L Score: 0.16936630050729184



In [10]:
from datasets import load_from_disk
pertNames = [ "changeChar","bias"]

from rouge import Rouge

# One scorer is created up front and reused for every perturbation.
rouge_scorer = Rouge()

for pert in pertNames:
    # Load the perturbed test split saved on disk and generate summaries for it.
    testDs = load_from_disk('../../Datasets/'+'xsum'+'test'+pert)
    testDataLoader = getDataLoader(testDs)
    predictions = gen_sentences(model, testDataLoader)
    # Guard against hypotheses with no word content (empty or only periods).
    # This replaces the former unconditional `predictions[5581] = "empty"`,
    # which overwrote that index for every perturbation whether or not the
    # prediction there was actually degenerate.
    predictions = [p if p.strip(". ") else "empty" for p in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, testDs['summary'])

    # Calculate average ROUGE F-scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

100%|██████████████████████████████████████████████████████████████████████████████| 2834/2834 [48:15<00:00,  1.02s/it]


Perturbation: bias
Average ROUGE-1 Score: 0.3351160691637756
Average ROUGE-2 Score: 0.12086573435273722
Average ROUGE-L Score: 0.27715272511448974

