## Installing and Importing Required Packages

In [3]:
!pip install transformers datasets evaluate rouge_score -q

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import torch

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


## LOADING DATASET

In [4]:
# Load the "English" configuration of the ILSUM/ILSUM-1.0 dataset.
dataset = load_dataset("ILSUM/ILSUM-1.0", "English")
# Bare expression so the notebook displays the loaded splits and their columns.
dataset

README.md:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/46.5M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

val.csv:   0%|          | 0.00/3.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12565 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4487 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/898 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 12565
    })
    test: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 4487
    })
    validation: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 898
    })
})

## FILTERING LONG EXAMPLES FROM THE TRAIN, VALIDATION AND TEST SPLITS

In [5]:
# Token limits passed as `max_length` when tokenizing (with truncation) and
# when generating summaries.
MAX_INPUT_LENGTH = 512   # limit for article token ids
MAX_TARGET_LENGTH = 128  # limit for summary token ids

def filter_long_examples(example, max_article_words=500, max_summary_words=100):
    """Return True when an example is short enough to be kept.

    Lengths are measured in whitespace-separated words, not tokenizer tokens.

    Args:
        example: Mapping with string values under "Article" and "Summary".
        max_article_words: Exclusive upper bound on the article word count.
        max_summary_words: Exclusive upper bound on the summary word count.

    Returns:
        bool: True if both word counts are strictly below their bounds.
    """
    # Check the article first; the summary is only inspected when the article passes.
    if len(example["Article"].split()) >= max_article_words:
        return False
    return len(example["Summary"].split()) < max_summary_words

# Apply the length filter to each split of the loaded dataset, addressing the
# splits by name: dataset["train"], dataset["test"], dataset["validation"].
filtered_train, filtered_test, filtered_validation = (
    dataset[split_name].filter(filter_long_examples)
    for split_name in ("train", "test", "validation")
)

# Plain dict of the filtered splits, keyed by split name.
dataset_final = dict(
    train=filtered_train,
    validation=filtered_validation,
    test=filtered_test,
)

Filter:   0%|          | 0/12565 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4487 [00:00<?, ? examples/s]

Filter:   0%|          | 0/898 [00:00<?, ? examples/s]

## LOADING THE BART TOKENIZER

In [6]:
# Identifier of the pretrained checkpoint; reused later when loading the model weights.
model_checkpoint = "facebook/bart-base"
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## PREPROCESSING THE DATA

In [7]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch with "Article" and "Summary" columns (lists of strings
            when called through ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the articles, with an added "labels" entry
        holding the token ids of the summaries. Both sides are truncated to
        MAX_INPUT_LENGTH / MAX_TARGET_LENGTH; no padding is applied here.
    """
    model_inputs = tokenizer(
        examples["Article"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    # `text_target` marks these strings as decoder targets, so tokenizers that
    # encode targets differently from inputs still produce correct labels.
    labels = tokenizer(
        text_target=examples["Summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the batched tokenization over every filtered split, keeping the split names as keys.
tokenized_datasets = {
    split_name: split.map(preprocess_function, batched=True)
    for split_name, split in dataset_final.items()
}

Map:   0%|          | 0/7710 [00:00<?, ? examples/s]

Map:   0%|          | 0/551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2707 [00:00<?, ? examples/s]

## LOADING PRE-TRAINED WEIGHTS

In [8]:
# Load the pretrained sequence-to-sequence weights for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

## SETTING EVALUATION METRICS AND DEFINING BATCH SIZE

In [9]:
# ROUGE metric loaded through the `evaluate` library; used by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        dict: Each ROUGE score multiplied by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a real token id, and cannot be
    # decoded. It can appear in predictions as well as labels, so replace it
    # with the pad token on both sides before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
# Per-device batch size, shared by the training and evaluation settings below.
batch_size = 2

## SETTING TRAINING ARGS

In [11]:
# Hyperparameters and run settings for fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-news-summarizer",
    run_name="bart-news-summarization-run",
    eval_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,  # evaluation decodes generated summaries for ROUGE
    # The string "none" turns off all logging integrations. Python `None` does
    # not: it falls back to the library default instead of disabling them.
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Batch collator for sequence-to-sequence examples, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## DEFINE TRAINER

In [13]:
# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` keyword.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [14]:
import os
# NOTE(review): the warning printed when the training arguments were created
# says this environment variable is deprecated in favour of `report_to`.
# It is also set here only after the training arguments and the trainer
# already exist — confirm whether this cell is still needed.
os.environ['WANDB_DISABLED'] = "true"

## TRAINING THE MODEL

In [15]:
# Fine-tune the model; the table below lists loss and ROUGE for each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.3783,0.316255,36.3309,26.5583,33.8139,33.7809
2,0.2973,0.305778,36.7421,27.0495,34.1512,34.1875
3,0.2572,0.307433,37.0937,27.8078,34.6509,34.6416




TrainOutput(global_step=5784, training_loss=0.3341655243152075, metrics={'train_runtime': 1957.7872, 'train_samples_per_second': 11.814, 'train_steps_per_second': 2.954, 'total_flos': 6678980519178240.0, 'train_loss': 0.3341655243152075, 'epoch': 3.0})

## EVALUATING THE MODEL

In [16]:
# Score the fine-tuned model on the filtered, tokenized test split.
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.29115334153175354,
 'eval_rouge1': 37.2619,
 'eval_rouge2': 27.5054,
 'eval_rougeL': 34.4019,
 'eval_rougeLsum': 34.3945,
 'eval_runtime': 306.2583,
 'eval_samples_per_second': 8.839,
 'eval_steps_per_second': 2.211,
 'epoch': 3.0}

## TESTING ON A SAMPLE FROM THE TEST SET

In [20]:
# Summarize the first example of the filtered test split and show it next to
# the reference summary.
sample = dataset_final["test"][0]
inputs = tokenizer(
    sample["Article"],
    return_tensors="pt",
    max_length=MAX_INPUT_LENGTH,
    truncation=True,
).to(model.device)

# Switch to inference mode explicitly so dropout is off no matter which cell
# was executed last.
model.eval()
summary_ids = model.generate(**inputs, max_length=MAX_TARGET_LENGTH)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Original Article:", sample["Article"])
print("\nReference Summary:", sample["Summary"])
print("\nGenerated Summary:", generated_summary)

Original Article: Indian-origin boy finds millions of years old fossil in UK gardenA six-year-old Indian-origin boy says he is “really excited” after he found a fossil from millions of years ago while digging in his garden in the West Midlands region of England. Siddak Singh Jhamat, known as Sid, was using a fossil-hunting set he received as a Christmas present when he came across a rock that looked like a horn."I was just digging for worms and things like pottery and bricks and I just came across this rock which looked a bit like a horn, and thought it could be a tooth or a claw or a horn, but it was actually a piece of coral which is called horn coral," the schoolboy said."I was really excited about what it really was," he said.According to a BBC report, his father Vish Singh was able to identify the horn coral through a fossil group he is a member of on Facebook and estimates the fossil is between 251 to 488 million years old."We were surprised he found something so odd-shaped in th