In [None]:
# Copy the dataset archive from drive/MyDrive into the current working directory.
# NOTE(review): presumably drive/ is a mounted Google Drive (Colab) — confirm.
!cp drive/MyDrive/liputan6_data.tar.gz ./

In [None]:
# Unpack the gzip-compressed archive; later cells read JSON files under liputan6_data/canonical/.
!tar -xzf liputan6_data.tar.gz

In [None]:
!pip install -q transformers accelerate datasets==2.17.1 evaluate==0.4.1 seqeval

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, EncoderDecoderModel, TrainingArguments, Trainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
import evaluate, seqeval

In [None]:
import glob
import json
import re

def numeric_file_key(path):
  """Return the integer spelled by the digits of the file name in `path`.

  Only the last path component is inspected, so digits that occur in directory
  names (for example the "6" in "liputan6_data") cannot leak into the sort key.
  File names without any digit get -1 and therefore sort first.
  """
  file_name = re.split(r'[\\/]', path)[-1]
  digits = re.sub(r'\D', '', file_name)
  return int(digits) if digits else -1


def load_json_files(pattern):
  """Load every JSON file matching the glob `pattern`, in numeric file-name order.

  Returns a tuple `(paths, records)`: the sorted list of matching paths and the
  list of parsed JSON documents in the same order.
  """
  paths = sorted(glob.glob(pattern), key=numeric_file_key)
  records = []
  for path in paths:
    with open(path, "r", encoding="utf-8") as f:
      records.append(json.load(f))
  return paths, records


train_file, train_data = load_json_files("liputan6_data/canonical/train/*.json")
eval_file, eval_data = load_json_files("liputan6_data/canonical/test/*.json")

print(f"train data: {len(train_data)}")
print(f"eval data: {len(eval_data)}")

In [None]:
# Peek at the field names of the first training record.
train_data[0].keys()

In [None]:
# Keep at most the first 100,000 training and 10,000 evaluation records.
TRAIN_LIMIT = 100000
EVAL_LIMIT = 10000
train_data, eval_data = train_data[:TRAIN_LIMIT], eval_data[:EVAL_LIMIT]

In [None]:
import numpy as np
import pandas as pd

def custom_join(words):
  """Join tokens with single spaces and re-attach punctuation that was split off.

  Args:
    words: iterable of string tokens.

  Returns:
    The joined text where "Liputan6 . com" is restored to "Liputan6.com", the
    space before ',', '.' and ')' is removed, and the space after '(' is
    removed. Unlike plain substring replacement this also covers punctuation at
    the very start or end of the text and runs of adjacent marks such as
    " ) , ".
  """
  result = ' '.join(words)
  result = result.replace("Liputan6 . com", "Liputan6.com")
  # Remove the space in front of ',', '.' or ')' when the mark is followed by a
  # space or ends the text; the lookahead leaves the following space available
  # for the next match.
  result = re.sub(r' ([,.)])(?= |$)', r'\1', result)
  # Remove the space after '(' when the bracket starts the text or follows a space.
  result = re.sub(r'(^| )\( ', r'\1(', result)
  return result


def make_dataset_df(data):
  """Turn raw records into a DataFrame of plain-text articles and summaries.

  Each record's 'clean_article' and 'clean_summary' fields are sequences of
  token sequences; the tokens are flattened into one list per field and
  rendered to text with `custom_join`.

  Returns:
    A DataFrame with the columns 'clean_article' and 'clean_summary', one row
    per record in `data`.
  """
  def flatten_and_join(sentences):
    tokens = [token for sentence in sentences for token in sentence]
    return custom_join(tokens)

  columns = ('clean_article', 'clean_summary')
  return pd.DataFrame({
      column: [flatten_and_join(record[column]) for record in data]
      for column in columns
  })

In [None]:
# Build the text DataFrames for both splits.
train_df, eval_df = (make_dataset_df(split) for split in (train_data, eval_data))

In [None]:
# Display the training DataFrame.
train_df

In [None]:
# Display the evaluation DataFrame.
eval_df

In [None]:
# Show the joined text of the first training article.
train_df['clean_article'][0]

In [None]:
# Show the joined text of the first training summary.
train_df['clean_summary'][0]

In [None]:
# Wrap both DataFrames as Hugging Face `Dataset` objects.
train_dataset, eval_dataset = (Dataset.from_pandas(frame) for frame in (train_df, eval_df))

In [None]:
# Display the training Dataset object.
train_dataset

In [None]:
# Display the evaluation Dataset object.
eval_dataset

In [None]:
# Load the pretrained tokenizers: the BERT tokenizer encodes the articles (encoder side),
# the GPT-2 tokenizer encodes the summaries (decoder side).
encoder_tokenizer = BertTokenizer.from_pretrained('cahya/bert-base-indonesian-1.5G')
decoder_tokenizer = GPT2Tokenizer.from_pretrained('cahya/gpt2-small-indonesian-522M')

In [None]:
# GPT2 does not have a pad token, so reuse its end-of-sequence token for padding
# (this aliases an existing token; it does not add a new vocabulary entry).
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

In [None]:
def tokenize_data(example):
    """Tokenize articles for the encoder and summaries as decoder labels.

    Works on a single example (string fields) or on a batch (list fields), as
    passed by `Dataset.map(..., batched=True)`.

    Returns a dict with:
      - 'input_ids' / 'attention_mask': article tokens from `encoder_tokenizer`,
        padded/truncated to 512.
      - 'labels': summary tokens from `decoder_tokenizer` followed by the EOS
        id, padded to 128 with -100.

    Padding positions use -100 (the index ignored by the cross-entropy loss)
    instead of the pad token id. The decoder's pad token is its EOS token, so
    padding with it would make the loss, and any perplexity derived from it,
    count every padding position. An explicit EOS is appended so the model
    still learns where a summary ends.
    """
    max_target_length = 128

    input_encoding = encoder_tokenizer(example['clean_article'], padding='max_length', truncation=True, max_length=512)
    # Truncate to one token less than the limit so the EOS marker always fits.
    target_encoding = decoder_tokenizer(example['clean_summary'], truncation=True, max_length=max_target_length - 1)

    def build_labels(token_ids):
        token_ids = list(token_ids) + [decoder_tokenizer.eos_token_id]
        return token_ids + [-100] * (max_target_length - len(token_ids))

    target_ids = target_encoding['input_ids']
    if isinstance(example['clean_summary'], str):
        # Single example: the tokenizer returned one flat list of ids.
        labels = build_labels(target_ids)
    else:
        labels = [build_labels(ids) for ids in target_ids]

    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': labels
    }

In [None]:
# Tokenize both splits in batches using 4 worker processes.
map_options = {"batched": True, "num_proc": 4}
tokenized_train = train_dataset.map(tokenize_data, **map_options)
tokenized_eval = eval_dataset.map(tokenize_data, **map_options)

In [None]:
# Pad token of the encoder tokenizer.
encoder_tokenizer.pad_token

In [None]:
# CLS token of the encoder tokenizer.
encoder_tokenizer.cls_token

In [None]:
# Pad token of the decoder tokenizer (set to its EOS token in an earlier cell).
decoder_tokenizer.pad_token

In [None]:
# BOS token of the decoder tokenizer; its id is used as the decoder start token below.
decoder_tokenizer.bos_token

In [None]:
# make Bert2gpt (Encoder-Decoder model)
# Build the encoder-decoder model from the pretrained BERT (encoder) and GPT-2 (decoder) checkpoints.
# NOTE(review): the cross-attention weights linking the two are presumably newly initialised — confirm from the load warnings.
model = EncoderDecoderModel.from_encoder_decoder_pretrained('cahya/bert-base-indonesian-1.5G', 'cahya/gpt2-small-indonesian-522M')

In [None]:
# Define special tokens
# Special tokens and vocabulary size of the combined model all refer to the
# decoder side: they are used when shifting labels into decoder inputs, when
# padding/stopping during generation, and for the size of the output logits.
# They must therefore come from the GPT-2 tokenizer/config, not from BERT.
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.eos_token_id = decoder_tokenizer.eos_token_id
model.config.pad_token_id = decoder_tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [None]:
# Vocabulary size recorded in the encoder's config.
model.config.encoder.vocab_size

In [None]:
# Set configurations for the encoder and decoder
# Set length limits on the encoder/decoder sub-configs and beam-search style
# options on the top-level config.
# NOTE(review): max_length/min_length are written to the sub-configs while
# length_penalty/early_stopping go on the top-level config; confirm that the
# generation code of the installed transformers version reads them from these
# places, and that a beam count > 1 is set wherever generate() is called.
model.config.encoder.max_length = 512
model.config.decoder.max_length = 128
model.config.decoder.min_length = 12
model.config.length_penalty = 2.0
model.config.early_stopping = True

In [None]:
# Necessary to resize token embeddings for the newly added [PAD] token
# Make the decoder's token-embedding matrix match the decoder tokenizer's length.
# NOTE(review): earlier cells only alias pad_token to the existing eos_token, so no
# new token appears to have been added — this call may be a no-op; confirm.
model.decoder.resize_token_embeddings(len(decoder_tokenizer))

In [None]:
# Batch collator for sequence-to-sequence training, given the encoder tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(encoder_tokenizer, model=model)

In [None]:
# Training hyper-parameters, grouped by concern and expanded into TrainingArguments.
training_kwargs = {
    # where checkpoints and logs are written
    'output_dir': './results',
    'logging_dir': './logs',
    # optimisation schedule
    'num_train_epochs': 10,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'warmup_steps': 5,
    'gradient_accumulation_steps': 1,
    # batch sizes
    'per_device_train_batch_size': 24,
    'per_device_eval_batch_size': 8,
    # logging every 100 steps; evaluation and checkpointing once per epoch
    'logging_strategy': 'steps',
    'logging_steps': 100,
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    # reload the best checkpoint when training finishes
    'load_best_model_at_end': True,
}
training_args = TrainingArguments(**training_kwargs)

In [None]:
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, data, collator and callbacks into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=encoder_tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the settings defined above.
trainer.train()

In [None]:
# Evaluate on the tokenized evaluation split and print the resulting metrics.
metrics = trainer.evaluate(tokenized_eval)
print(metrics)

In [None]:
import math

# Perplexity is exp(eval loss); math.exp raises OverflowError for very large
# losses, in which case infinity is reported instead.
eval_loss = metrics["eval_loss"]
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    perplexity = float("inf")

metrics.update(perplexity=perplexity)

# Log and persist the metrics under the "eval" split name.
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)

print(perplexity)

In [None]:
# Save the trained model to ./Bert2gpt_trained_100k/.
# NOTE(review): the decoder tokenizer is not passed to the Trainer, so it is presumably not saved here — confirm.
trainer.save_model('./Bert2gpt_trained_100k/')