In [1]:
from time import sleep

from datasets import load_dataset, load_from_disk
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


Load BART model and tokenizer

In [2]:
# The model weights and the tokenizer are both loaded from the same
# pretrained checkpoint, so the identifier is spelled out only once.
BART_CHECKPOINT = 'facebook/bart-large'

model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)

In [3]:
# Directory paths used by the rest of the notebook.
#
# dataset_dir is handed to the dataset loader as its cache directory.
# NOTE: it is an absolute, machine-specific location; adjust it when running
# the notebook on another machine.
# Only forward slashes are used: the previous "2ND SEM\AUTOMATA" segment
# contained the invalid string escape "\A" and mixed path separators.
dataset_dir = "D:/TUP SCHOOLWORKS/3rd Year/ACTIVITIES/2ND SEM/AUTOMATA/Youtube-Link-Content-Summarizer/data"

# Location for the tokenized copy of the dataset (relative to the notebook).
tokenized_dataset_dir = "../../data/tokenized_dataset"

Load the CNN/DailyMail dataset

In [4]:
def load_cnn_dailymail_dataset(dataset_dir):
    """Load configuration '3.0.0' of the 'cnn_dailymail' dataset.

    Args:
        dataset_dir: Directory forwarded to ``load_dataset`` as ``cache_dir``.

    Returns:
        The object produced by ``load_dataset`` for this dataset/configuration.
    """
    return load_dataset('cnn_dailymail', '3.0.0', cache_dir=dataset_dir)


dataset = load_cnn_dailymail_dataset(dataset_dir)

In [14]:
# Sanity check: print the loaded dataset; the captured output lists each split
# (train / validation / test) with its columns and row count.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


Preprocess the dataset: tokenize the articles (inputs) and highlights (targets)

In [5]:
# Preprocess the Data
def preprocess_function(examples, max_input_length=1024, max_target_length=150):
    """Tokenize a batch of articles and their reference highlights.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides the 'article' (model input) and
            'highlights' (target summary) entries.
        max_input_length: Maximum number of tokens kept per article; longer
            articles are truncated. Defaults to 1024.
        max_target_length: Maximum number of tokens kept per highlight; longer
            highlights are truncated. Defaults to 150.

    Returns:
        The tokenizer output for the articles, with an extra "labels" entry
        holding the ``input_ids`` of the tokenized highlights.
    """
    model_inputs = tokenizer(
        examples['article'],
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are tokenized through `text_target=` rather than the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['highlights'],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Tokenize all dataset splits (train, validation, test) using the BART tokenizer

In [6]:
# Run preprocess_function over the loaded dataset in batched mode and keep the
# result as encoded_dataset. This is slow: the captured progress output shows
# roughly 40 minutes for the train split alone.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 287113/287113 [39:47<00:00, 120.26 examples/s]
Map: 100%|██████████| 13368/13368 [01:31<00:00, 146.73 examples/s]
Map: 100%|██████████| 11490/11490 [01:13<00:00, 157.09 examples/s]


Save the encoded_dataset


In [7]:
# encoded_dataset.save_to_disk(encoded_train_dataset_dir)
# encoded_dataset.save_to_disk(tokenized_dataset_dir)

Saving the dataset (6/6 shards): 100%|██████████| 287113/287113 [00:50<00:00, 5706.32 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 13368/13368 [00:04<00:00, 3048.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11490/11490 [00:01<00:00, 9107.39 examples/s]


Retrieve the encoded_dataset

In [8]:
# tokenized_dataset = load_from_disk(tokenized_dataset_dir)
# print(encoded_dataset)
# print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})


Set up training arguments

In [10]:
# Training configuration for fine-tuning.
# `num_train_epochs` is passed exactly once; repeating a keyword argument in a
# call is a SyntaxError and prevented this cell from running.
training_args = TrainingArguments(
    output_dir='../../data/training_results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy='steps',
    logging_dir='../../data/logs',
)

# Build the Trainer from the model, the training configuration and the
# tokenized train/validation splits.
# - The evaluation schedule is configured on TrainingArguments; Trainer itself
#   takes no `evaluation_strategy` keyword, so it is not passed here.
# - The tokenized examples were produced without padding, so a seq2seq collator
#   is supplied to pad inputs and labels when batches are assembled.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)



Train the BART model

In [None]:
# Start fine-tuning with the Trainer configured above (train split of
# encoded_dataset, settings from training_args).
trainer.train()

Evaluate the fine-tuned BART model

In [None]:
# Evaluate the fine-tuned model; called without arguments, so this presumably
# uses the eval_dataset given to the Trainer (the validation split).
# As the cell's last expression, the returned value is displayed.
trainer.evaluate()

Save the model

In [None]:
# Persist the fine-tuned model. `save_pretrained` is the model-level save
# method; the model object does not provide `save_model`.
model.save_pretrained("path_to_save_model")