In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, BertTokenizer, BertModel, EncoderDecoderModel, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate, seqeval

comet_ml is installed but `COMET_API_KEY` is not set.
2024-07-12 08:42:53.875160: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-12 08:42:53.903834: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import glob
import json
import re

eval_file = glob.glob("liputan6_data/canonical/test/*.json")
eval_file.sort(key=lambda f: int(re.sub('\D', '', f)))

test_file = glob.glob("liputan6_data/canonical/dev/*.json")
test_file.sort(key=lambda f: int(re.sub('\D', '', f)))

eval_data = []
test_data = []

for i in eval_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    eval_data.append(d)

for i in test_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    test_data.append(d)

print(f"eval data: {len(eval_data)}")
print(f"test data: {len(test_data)}")

eval data: 10972
test data: 10972


In [3]:
eval_data[0].keys()

dict_keys(['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'])

In [4]:
eval_data = eval_data[:10000]
test_data = test_data[:10000]

In [5]:
print(f"eval data: {len(eval_data)}")
print(f"test data: {len(test_data)}")

eval data: 10000
test data: 10000


In [6]:
import numpy as np
import pandas as pd

def custom_join(words):
  result = ' '.join(words)
  result = result.replace("Liputan6 . com", "Liputan6.com")
  result = result.replace(" , ", ", ")
  result = result.replace(" . ", ". ")
  result = result.replace(" ( ", " (")
  result = result.replace(" ) ", ") ")
  return result


def make_dataset_df(data):
  clean_article = []
  clean_summary = []

  for item in data:
    clean_article_sentence = []
    for arr in item['clean_article']:
      clean_article_sentence.extend(arr)
    joined_str1 = custom_join(clean_article_sentence)
    clean_article.append(joined_str1)

    clean_summary_sentence = []
    for arr in item['clean_summary']:
      clean_summary_sentence.extend(arr)
    joined_str2 = custom_join(clean_summary_sentence)
    clean_summary.append(joined_str2)

  df = pd.DataFrame({'clean_article': clean_article, 'clean_summary': clean_summary})
  return df

In [7]:
eval_df = make_dataset_df(eval_data)
test_df = make_dataset_df(test_data)

In [8]:
eval_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Jakarta : Kepolisian Daerah Riau...",Kapolda Riau baru Brigjen Pol. Johny Yodjana b...
1,"Liputan6.com, Jakarta : Bank Indonesia dinilai...",Kendati Bank Sentral AS menurunkan suku bungan...
2,"Liputan6.com, Jakarta : Berbagai kendala mengh...",Pemerintah bermaksud akan lebih mengandalkan s...
3,"Liputan6.com, Jakarta : Penghapusan beberapa p...","Revisi Kepmennaker Nomor 78 Tahun 2001, dinila..."
4,"Liputan6.com, Jakarta : Operasi Sadar Jaya yan...",Polisi menangkap 32 pengunjung Diskotik Mileni...
...,...,...
9995,"Liputan6.com, Bekasi : Dalam waktu semalam, du...",Dua sopir taksi ditemukan tewas di dua lokasi ...
9996,"Liputan6.com, Aceh : Polisi menembak dua orang...",Dua orang yang diduga anggota Gerakan Aceh Mer...
9997,"Liputan6.com, Bekasi : Setelah bentrokan antar...",Situasi di tempat pembuangan akhir sampah (TPA...
9998,"Liputan6.com, Losari : Kecelakaan bus terulang...",Kecelakaan Bus Sinar Jaya tujuan Wonosobo mene...


In [9]:
test_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Jakarta : Pemerintah masih membe...",Pemerintah memberikan tenggat 14 hari kepada p...
1,"Liputan6.com, Jakarta : Kecaman demi kecaman k...",MPR dan DPR mengutuk tindakan kekerasan tentar...
2,"Liputan6.com, Jakarta : Janda mendiang Amir Bi...","Dewi Wardah, janda korban peristiwa Tanjungpri..."
3,"Liputan6.com, Jakarta : Polisi telah menyerahk...",Kapolda Metro Jaya mengaku telah menyerahkan B...
4,"Liputan6.com, Jakarta : Kepolisian Resor Kota ...","Dalam operasinya, polisi Pekanbaru berhasil me..."
...,...,...
9995,"Liputan6.com, Jakarta : Laskar Jihad Ahlussunn...",Sekitar 600 personel Laskar Jihad Ahlussunnah ...
9996,"Liputan6.com, Pati : Hama tikus menyerang ratu...",Serangan tikus mengakibatkan ratusan hektare s...
9997,"Liputan6.com, Tangerang : Ketua Pengadilan Neg...",Ketua PN Tangerang menguatkan vonis hukuman ma...
9998,"Liputan6.com, Jakarta : Sekretaris Jenderal PD...",PDI-P tidak melihat alasan menolak Memorandum ...


In [10]:
test_df['clean_article'][0]

'Liputan6.com, Jakarta : Pemerintah masih memberikan waktu dua minggu lagi kepada seluruh konglomerat yang telah menandatangani perjanjian pengembalian bantuan likuiditas Bank Indonesia dengan jaminan aset (MSAA ), untuk secepatnya menyerahkan jaminan pribadi serta aset. Jika lewat dari tenggat tersebut, pemerintah akan menerapkan tindakan hukum. Hal tersebut dikemukakan Menteri Koordinator Bidang Perekonomian Rizal Ramli di Jakarta, baru-baru ini. Rizal mengakui bahwa permintaan untuk meminta jaminan pribadi atau personal guarantee pada awalnya ditentang sejumlah konglomerat. Sebab para debitor menganggap tindakan tersebut memungkinkan pemerintah untuk menyita seluruh aset mereka baik yang berada di dalam maupun luar negeri. Sejauh ini, penilaian jaminan MSAA baru dilakukan atas aset milik Grup Salim. Tetapi, nilai aset yang dijaminkan Kelompok Salim atas utang BLBI Bank Central Asia diperkirakan tak lebih dari Rp 20 triliun. Padahal, kewajiban mereka mencapai Rp 52 triliun. Sementara

In [11]:
test_df['clean_summary'][0]

'Pemerintah memberikan tenggat 14 hari kepada para konglomerat penandatangan MSAA untuk menyerahkan aset. Jika mangkir, mereka bakal dihukum .'

In [12]:
eval_dataset = Dataset.from_pandas(eval_df)
test_dataset = Dataset.from_pandas(test_df)

In [13]:
eval_dataset

Dataset({
    features: ['clean_article', 'clean_summary'],
    num_rows: 10000
})

In [14]:
test_dataset

Dataset({
    features: ['clean_article', 'clean_summary'],
    num_rows: 10000
})

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [16]:
tokenizer = AutoTokenizer.from_pretrained("./Bert2Bert_trained_IndoBERT_100k/")

In [17]:
model = EncoderDecoderModel.from_pretrained("./Bert2Bert_trained_IndoBERT_100k/")



In [18]:
# Adjust the generation configuration for beam search
model.config.num_beams = 4
model.config.early_stopping = True
model.config.length_penalty = 2.0

In [19]:
# Generate summaries using the adjusted configuration
def generate_summary(batch):
    inputs = tokenizer(batch["clean_article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    with torch.no_grad():
      outputs = model.generate(input_ids, attention_mask=attention_mask,
                               max_length=128,
                               num_beams = model.config.num_beams,
                               early_stopping = model.config.early_stopping,
                               length_penalty = model.config.length_penalty)

    batch["predicted_summary"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

In [20]:
model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [21]:
tokenized_eval = eval_dataset.map(generate_summary, batched=True, batch_size=32)
tokenized_test = test_dataset.map(generate_summary, batched=True, batch_size=32)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [22]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load('rouge')

In [23]:
# Compute ROUGE scores
val_rouge = rouge.compute(predictions=tokenized_eval["predicted_summary"], references=tokenized_eval["clean_summary"])
test_rouge = rouge.compute(predictions=tokenized_test["predicted_summary"], references=tokenized_test["clean_summary"])

# Print ROUGE scores
print("Validation ROUGE scores:", val_rouge)
print("Test ROUGE scores:", test_rouge)

Validation ROUGE scores: {'rouge1': 0.4060839704606496, 'rouge2': 0.2264337003254389, 'rougeL': 0.3440749677736598, 'rougeLsum': 0.3441308434219651}
Test ROUGE scores: {'rouge1': 0.36637272062852533, 'rouge2': 0.1871306705901744, 'rougeL': 0.3100321570248371, 'rougeLsum': 0.3100354767398167}


In [24]:
val_rouge

{'rouge1': 0.4060839704606496,
 'rouge2': 0.2264337003254389,
 'rougeL': 0.3440749677736598,
 'rougeLsum': 0.3441308434219651}

In [25]:
test_rouge

{'rouge1': 0.36637272062852533,
 'rouge2': 0.1871306705901744,
 'rougeL': 0.3100321570248371,
 'rougeLsum': 0.3100354767398167}