# Implementing Abstractive Indonesian Text Summarization Using BART Model

In [1]:
import numpy as np
import pandas as pd
import nltk 
import torch 
import transformers
import datasets
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before the model is loaded and fine-tuned later in this notebook.
torch.cuda.empty_cache()

In [3]:
# Report whether a CUDA device is visible to PyTorch; the training arguments
# later in this notebook enable fp16, which is intended for GPU training.
torch.cuda.is_available()

True

In [4]:
# Token-length budgets passed to the tokenizer as ``max_length``: articles use
# ``max_input`` and reference summaries use ``max_target``.
max_input, max_target = 512, 128

# Nominal batch size.
# NOTE(review): the training arguments visible in this notebook set the
# per-device batch sizes to 1 directly instead of reading this value.
batch_size = 3

### I. Data Preparation

The dataset used for this project is the Indonesian News Dataset. The dataset contains articles and summaries from seven news platforms in Indonesia: Tempo, CNN Indonesia, CNBC Indonesia, Okezone, Suara, Kumparan, and JawaPos.

In [5]:
from datasets import Dataset

In [6]:
# Load the news corpus, keep only the article text and its reference summary,
# and drop every row in which either of the two fields is missing.
df = pd.read_csv("../data/data.csv").loc[:, ["content", "summary"]].dropna()

In [7]:
# Work with the first 500 rows only.
df = df.iloc[:500]

In [8]:
# Convert the DataFrame into a Hugging Face Dataset.
# dropna() can leave gaps in the DataFrame index; without preserve_index=False
# that index would be stored as an extra "__index_level_0__" column, so it is
# dropped here and only "content" and "summary" are kept.
dataset = Dataset.from_pandas(df, preserve_index = False)

### II. Data Preprocessing

The preprocessing technique used for this project is the BART tokenizer. The BART tokenizer is a subword tokenizer used with the BART (Bidirectional and Auto-Regressive Transformers) model. It uses byte-level Byte-Pair Encoding (BPE), the same scheme as GPT-2 and RoBERTa, rather than SentencePiece. The tokenizer works as follows:
- Step 1: Preprocessing
    - The input text is kept case-sensitive (it is not lowercased) and is converted to a byte-level representation, which handles Unicode characters, punctuation, and spacing.
    - It can process unseen words using subword tokenization.
- Step 2: Tokenization (Subword Splitting)
    - The tokenizer breaks words into subwords using Byte-Pair Encoding (BPE).
    - Common words remain whole ("hello" → ["hello"]), while rare words split into subwords ("unhappiness" → ["un", "happiness"]).
- Step 3: Convert Tokens to IDs
    - Each token (or subword) is mapped to a unique integer ID from the vocabulary.
    - Example:
        - "Hello World"
        - tensor([[    0,  31414,   232,     2]])
- Step 4: Special Tokens
    - BART uses special tokens for sequence modeling:
        - ```<s>``` (Start of sentence)
        - ```</s>``` (End of sentence)
        - ```<mask>``` (Masked token for denoising pretraining)
        - ```<pad>``` (Padding token for batching)
- Step 5: Decoding (Reverse Tokenization)
    - The model generates output as token IDs, which the tokenizer converts back to human-readable text.
    - Example:
        - tensor([[    0,  31414,   232,     2]])
        - "Hello World"

In [9]:
from transformers import AutoTokenizer

In [10]:
# Load the tokenizer that belongs to the "facebook/bart-large-cnn" checkpoint;
# the same checkpoint name is used when the model is loaded further down.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [11]:
def preprocess_data(data_to_process):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        data_to_process: A batch with a "content" column (article texts) and
            a "summary" column (reference summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the articles, truncated/padded to
        ``max_input`` tokens, with an added "labels" entry holding the
        summary token ids truncated/padded to ``max_target`` tokens.
    """
    model_inputs = tokenizer(
        list(data_to_process["content"]),
        max_length = max_input,
        padding = "max_length",
        truncation = True,
    )

    # ``text_target`` tokenizes the summaries as targets. It replaces the
    # deprecated ``tokenizer.as_target_tokenizer()`` context manager. The
    # summaries are encoded in a separate call so that they get their own
    # ``max_length``.
    targets = tokenizer(
        text_target = data_to_process["summary"],
        max_length = max_target,
        padding = "max_length",
        truncation = True,
    )

    # NOTE(review): labels are padded with the pad token id rather than -100,
    # so padded positions are not masked out of the loss -- confirm whether
    # this is intended.
    model_inputs["labels"] = targets["input_ids"]

    return model_inputs

In [12]:
# Apply preprocess_data to the whole dataset; batched=True hands the function
# a batch of rows per call instead of one row at a time.
tokenize_data = dataset.map(preprocess_data, batched = True)

Map: 100%|██████████| 500/500 [00:00<00:00, 1454.12 examples/s]


### III. Data Splitting

The dataset is split into training and testing sets with an 80-20 ratio.

In [13]:
# Hold out 20% of the tokenized examples for evaluation. The fixed seed makes
# the shuffle, and therefore the train/test membership, reproducible between
# runs.
dataset = tokenize_data.train_test_split(test_size = 0.2, seed = 42)

In [14]:
# Pull the two partitions out of the split result for use by the trainer.
train_dataset, test_dataset = dataset["train"], dataset["test"]

### IV. Modeling and Evaluation

The model used for this project is the BART model. BART is a transformer model introduced by Facebook AI that combines a bidirectional encoder with an autoregressive decoder. This encoder-decoder architecture is well suited to sequence-to-sequence tasks such as summarization. The bidirectional encoder allows the model to capture contextual information by reading and representing the input text from both directions, while the autoregressive decoder allows the model to generate coherent and contextually rich abstractive summaries one token at a time.

![](https://production-media.paperswithcode.com/methods/Screen_Shot_2020-06-01_at_9.49.47_PM.png)

The metric used for the model evaluation is the Recall-Oriented Understudy for Gisting Evaluation (ROUGE) metric.

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [16]:
# Load the ROUGE metric from the `evaluate` library; compute_metrics uses it
# to score generated summaries against the references.
metric = evaluate.load("rouge")

In [17]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores, as percentages, for generated summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays, as
            supplied by the trainer during evaluation.

    Returns:
        A dict mapping each ROUGE metric name to its score multiplied by 100.
    """
    predictions, labels = eval_pred

    # Predictions may arrive as a tuple with extra outputs; the token ids are
    # the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the "ignore" index used for padded positions and is not a valid
    # vocabulary id, so replace it with the pad token id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    result = metric.compute(predictions = decoded_preds, references = decoded_labels, use_stemmer = True)

    # Report the scores on a 0-100 scale.
    return {key: value * 100 for key, value in result.items()}

In [18]:
# Load the pretrained "facebook/bart-large-cnn" sequence-to-sequence model
# that is fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [19]:
# Fine-tuning configuration. The first positional argument is the output
# directory for checkpoints.
training_args = Seq2SeqTrainingArguments(
    "../models/bart",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` argument.
    eval_strategy = "steps",
    # Evaluate and checkpoint on the same 100-step schedule.
    save_steps = 100,
    eval_steps = 100,
    logging_steps = 10,
    warmup_steps = 500,
    learning_rate = 2e-5,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 1,
    weight_decay = 0.01,
    # Keep at most two checkpoints on disk.
    save_total_limit = 2,
    num_train_epochs = 3,
    # Run generate() during evaluation so compute_metrics receives generated
    # token ids.
    predict_with_generate = True,
    eval_accumulation_steps = 1,
    # Mixed-precision training.
    fp16 = True,
)



In [20]:
# Collator that assembles individual tokenized examples into batches for the
# seq2seq model during training and evaluation.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

In [21]:
# Wire the model, configuration, data splits, collator and ROUGE callback
# into a trainer.
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    data_collator = data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class = tokenizer,
    compute_metrics = compute_metrics,
)

  trainer = Seq2SeqTrainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
# Run fine-tuning; evaluation metrics are reported every `eval_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,1.7269,1.714873,45.951865,29.282883,38.623196,38.608438
200,1.3763,1.574822,47.155681,30.45547,39.674232,39.68822
300,1.3162,1.467202,43.887727,27.782656,37.208616,37.224173
400,1.8021,1.438544,45.717147,28.858607,39.03618,39.032322
500,1.2779,1.44716,45.973286,30.236243,39.591039,39.567135
600,1.0958,1.428631,43.2141,29.059542,37.423635,37.500033
700,1.1687,1.389247,45.303456,28.846976,37.797551,37.819783
800,1.1529,1.361648,45.427064,29.888587,39.069529,39.109448
900,0.7211,1.365604,46.01967,30.228271,39.764425,39.717266
1000,1.0686,1.356099,45.249108,29.16375,38.816451,38.841091




TrainOutput(global_step=1200, training_loss=1.344896384080251, metrics={'train_runtime': 3092.8869, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.388, 'total_flos': 1300262761267200.0, 'train_loss': 1.344896384080251, 'epoch': 3.0})

In [23]:
# Persist the fine-tuned model and its tokenizer side by side in the same
# directory.
trainer.save_model("../models/bart")
tokenizer.save_pretrained("../models/bart")

('../models/bart\\tokenizer_config.json',
 '../models/bart\\special_tokens_map.json',
 '../models/bart\\vocab.json',
 '../models/bart\\merges.txt',
 '../models/bart\\added_tokens.json',
 '../models/bart\\tokenizer.json')