In [30]:
from datasets import load_dataset
import pandas as pd
import torch
from transformers.tokenization_utils_base import BatchEncoding
from typing import List, Tuple
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from datasets import load_metric

In [2]:
dataset = load_dataset("cnn_dailymail",'3.0.0')
dataset

Reusing dataset cnn_dailymail (C:\Users\Siri\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [14]:
df_train = dataset['train'].to_pandas().sample(frac=1).reset_index(drop=True)[:80][['article', 'highlights']]
df_test = dataset['test'].to_pandas().sample(frac=1).reset_index(drop=True)[:40][['article', 'highlights']]

In [26]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-xsum-12-6")

model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")

In [17]:
batch_size = 8
epochs = 4
lr_init = 5e-5
max_len = 256
warmup_steps = 3
max_input_length = 1024
max_target_length = 64

In [22]:
def batch_data(data: pd.DataFrame, bsize: int) -> List[Tuple[BatchEncoding, List[int], List[str]]]:
    lst = []
    l = len(data)//bsize
    for i in range(l):
        batch_text = data['article'][bsize*i:bsize*(i+1)].tolist()
        batch_label = data['highlights'][bsize*i:bsize*(i+1)].tolist()
        model_inputs = tokenizer.batch_encode_plus(batch_text, padding= 'max_length', max_length=max_input_length, truncation=True, add_special_tokens=True, return_tensors='pt')
        labels = tokenizer.batch_encode_plus(batch_label, padding= 'max_length', max_length=max_target_length, truncation=True, add_special_tokens=True, return_tensors='pt')
        Y = labels["input_ids"]
        s1 = batch_text
        s2 = batch_label
        lst.append((model_inputs,Y,s1,s2))
    return lst

In [23]:
train_batches = batch_data(df_train, bsize=batch_size)
test_batches = batch_data(df_test, bsize=batch_size)

In [29]:
# The torch `device` on which to execute the model computation
if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
else:
    device = torch.device('cpu') # CPU
model.to(device)

# The gradient descent optimizer used for fine tuning
optimizer = AdamW(model.parameters(), lr=lr_init)

# The gradient descent learning rate
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, 
    num_training_steps=len(train_batches))



In [96]:
summary_ids = model.generate(train_batches[0][0]['input_ids'],num_beams=2, min_length=0,max_length=max_target_length)
sumy_results=tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [97]:
sumy_results

[" The world's most famous Thanksgiving Day parade is getting ready to kick off this year.",
 ' As part of a new Channel 4 documentary on tattoo addiction, we look at some of the people who have been getting their hands on the art.',
 ' As the debate over whether waterboarding is illegal or acceptable as an interrogation practice heats up, questions are being asked about the legal and ethical basis for the practice.',
 ' The US Air Force is preparing to land a spacecraft that has been in orbit for more than two years.',
 'England and Portugal both suffered defeat in their Euro 2012 qualifiers on Saturday.',
 " Cristiano Ronaldo was crowned the world's best footballer for the second time on Monday.",
 ' A new survey suggests that men are spending more than women on Mother’s Day this year.',
 'Police in Saudi Arabia have detained a groom and a number of other people after a fatal shooting at a wedding.']

In [104]:
sumy_truth=train_batches[0][3]

In [105]:
#calculate rouge
# define a function
def rouge(predictions,references):
    import evaluate
    rouge = evaluate.load('rouge')
    predictions = predictions
    references = references
    results = rouge.compute(predictions=predictions,references=references)
    return(results)

In [106]:
rouge(sumy_results,sumy_truth)

{'rouge1': 0.25754204800984815,
 'rouge2': 0.09884327897207809,
 'rougeL': 0.19055411037583941,
 'rougeLsum': 0.2286331337897381}

In [108]:
# train process

from datasets import load_metric
from tqdm.auto import tqdm

model.train()
for epoch in range(epochs):
    progress_bar = tqdm(range(len(train_batches))) # add tqdm bar
    loss_lst = []
    
    for item in train_batches:
        # get each batch from batches
        batch = {'input_ids':torch.as_tensor(item[0]['input_ids'], device=device),
                'attention_mask':torch.as_tensor(item[0]['attention_mask'], device=device),
                'labels': torch.as_tensor(item[1], device=device)}
                
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr.step()
        optimizer.zero_grad()

        progress_bar.update(1) # update progree bar
            
        loss_lst.append(loss)
        
    # compute mean metrics computed over data in batches
    mean_loss = float(sum(loss_lst)/len(loss_lst))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [109]:
trained_summary_ids = model.generate(train_batches[0][0]['input_ids'],num_beams=2, min_length=0,max_length=max_target_length)
trained_sumy_results=tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
rouge(trained_sumy_results,sumy_truth)

{'rouge1': 0.25754204800984815,
 'rouge2': 0.09884327897207809,
 'rougeL': 0.19055411037583941,
 'rougeLsum': 0.2286331337897381}