In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp drive/MyDrive/liputan6_data.tar.gz ./

In [3]:
!tar -xzf liputan6_data.tar.gz

In [4]:
!pip install -q transformers accelerate datasets==2.17.1 evaluate==0.4.1 seqeval rouge_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/314.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m307.2/314.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/536.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer, EncoderDecoderModel
import evaluate, seqeval

In [6]:
import glob
import json
import re

eval_file = glob.glob("liputan6_data/canonical/test/*.json")
eval_file.sort(key=lambda f: int(re.sub('\D', '', f)))

test_file = glob.glob("liputan6_data/canonical/dev/*.json")
test_file.sort(key=lambda f: int(re.sub('\D', '', f)))

eval_data = []
test_data = []

for i in eval_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    eval_data.append(d)

for i in test_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    test_data.append(d)

print(f"eval data: {len(eval_data)}")
print(f"test data: {len(test_data)}")

eval data: 10972
test data: 10972


In [7]:
eval_data[0].keys()

dict_keys(['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'])

In [8]:
test_data = test_data[:10000]
eval_data = eval_data[:10000]

In [9]:
print(f"eval data: {len(eval_data)}")
print(f"test data: {len(test_data)}")

eval data: 10000
test data: 10000


In [10]:
import numpy as np
import pandas as pd

def custom_join(words):
  result = ' '.join(words)
  result = result.replace("Liputan6 . com", "Liputan6.com")
  result = result.replace(" , ", ", ")
  result = result.replace(" . ", ". ")
  result = result.replace(" ( ", " (")
  result = result.replace(" ) ", ") ")
  return result


def make_dataset_df(data):
  clean_article = []
  clean_summary = []

  for item in data:
    clean_article_sentence = []
    for arr in item['clean_article']:
      clean_article_sentence.extend(arr)
    joined_str1 = custom_join(clean_article_sentence)
    clean_article.append(joined_str1)

    clean_summary_sentence = []
    for arr in item['clean_summary']:
      clean_summary_sentence.extend(arr)
    joined_str2 = custom_join(clean_summary_sentence)
    clean_summary.append(joined_str2)

  df = pd.DataFrame({'clean_article': clean_article, 'clean_summary': clean_summary})
  return df

In [11]:
eval_df = make_dataset_df(eval_data)
test_df = make_dataset_df(test_data)

In [12]:
eval_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Jakarta : Kepolisian Daerah Riau...",Kapolda Riau baru Brigjen Pol. Johny Yodjana b...
1,"Liputan6.com, Jakarta : Bank Indonesia dinilai...",Kendati Bank Sentral AS menurunkan suku bungan...
2,"Liputan6.com, Jakarta : Berbagai kendala mengh...",Pemerintah bermaksud akan lebih mengandalkan s...
3,"Liputan6.com, Jakarta : Penghapusan beberapa p...","Revisi Kepmennaker Nomor 78 Tahun 2001, dinila..."
4,"Liputan6.com, Jakarta : Operasi Sadar Jaya yan...",Polisi menangkap 32 pengunjung Diskotik Mileni...
...,...,...
9995,"Liputan6.com, Bekasi : Dalam waktu semalam, du...",Dua sopir taksi ditemukan tewas di dua lokasi ...
9996,"Liputan6.com, Aceh : Polisi menembak dua orang...",Dua orang yang diduga anggota Gerakan Aceh Mer...
9997,"Liputan6.com, Bekasi : Setelah bentrokan antar...",Situasi di tempat pembuangan akhir sampah (TPA...
9998,"Liputan6.com, Losari : Kecelakaan bus terulang...",Kecelakaan Bus Sinar Jaya tujuan Wonosobo mene...


In [13]:
test_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Jakarta : Pemerintah masih membe...",Pemerintah memberikan tenggat 14 hari kepada p...
1,"Liputan6.com, Jakarta : Kecaman demi kecaman k...",MPR dan DPR mengutuk tindakan kekerasan tentar...
2,"Liputan6.com, Jakarta : Janda mendiang Amir Bi...","Dewi Wardah, janda korban peristiwa Tanjungpri..."
3,"Liputan6.com, Jakarta : Polisi telah menyerahk...",Kapolda Metro Jaya mengaku telah menyerahkan B...
4,"Liputan6.com, Jakarta : Kepolisian Resor Kota ...","Dalam operasinya, polisi Pekanbaru berhasil me..."
...,...,...
9995,"Liputan6.com, Jakarta : Laskar Jihad Ahlussunn...",Sekitar 600 personel Laskar Jihad Ahlussunnah ...
9996,"Liputan6.com, Pati : Hama tikus menyerang ratu...",Serangan tikus mengakibatkan ratusan hektare s...
9997,"Liputan6.com, Tangerang : Ketua Pengadilan Neg...",Ketua PN Tangerang menguatkan vonis hukuman ma...
9998,"Liputan6.com, Jakarta : Sekretaris Jenderal PD...",PDI-P tidak melihat alasan menolak Memorandum ...


In [14]:
test_df['clean_article'][0]

'Liputan6.com, Jakarta : Pemerintah masih memberikan waktu dua minggu lagi kepada seluruh konglomerat yang telah menandatangani perjanjian pengembalian bantuan likuiditas Bank Indonesia dengan jaminan aset (MSAA ), untuk secepatnya menyerahkan jaminan pribadi serta aset. Jika lewat dari tenggat tersebut, pemerintah akan menerapkan tindakan hukum. Hal tersebut dikemukakan Menteri Koordinator Bidang Perekonomian Rizal Ramli di Jakarta, baru-baru ini. Rizal mengakui bahwa permintaan untuk meminta jaminan pribadi atau personal guarantee pada awalnya ditentang sejumlah konglomerat. Sebab para debitor menganggap tindakan tersebut memungkinkan pemerintah untuk menyita seluruh aset mereka baik yang berada di dalam maupun luar negeri. Sejauh ini, penilaian jaminan MSAA baru dilakukan atas aset milik Grup Salim. Tetapi, nilai aset yang dijaminkan Kelompok Salim atas utang BLBI Bank Central Asia diperkirakan tak lebih dari Rp 20 triliun. Padahal, kewajiban mereka mencapai Rp 52 triliun. Sementara

In [15]:
test_df['clean_summary'][0]

'Pemerintah memberikan tenggat 14 hari kepada para konglomerat penandatangan MSAA untuk menyerahkan aset. Jika mangkir, mereka bakal dihukum .'

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [17]:
encoder_tokenizer = BertTokenizer.from_pretrained('cahya/bert-base-indonesian-1.5G')
decoder_tokenizer = GPT2Tokenizer.from_pretrained('cahya/gpt2-small-indonesian-522M')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/894k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/452k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

In [18]:
model = EncoderDecoderModel.from_pretrained("Willy030125/Bert2gpt_Liputan6_100k_8epoch")

config.json:   0%|          | 0.00/4.91k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [19]:
# Adjust the generation configuration for beam search
model.config.num_beams = 4
model.config.early_stopping = True
model.config.length_penalty = 2.0

In [20]:
model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

# 1

In [28]:
input_texts = eval_df['clean_article'][0]

In [29]:
# Inference mode (text generation)
import time

start_time = time.time()

inputs = encoder_tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

with torch.no_grad():
  outputs = model.generate(input_ids, attention_mask=attention_mask,
                           max_length=128,
                           num_beams = model.config.num_beams,
                           early_stopping = model.config.early_stopping,
                           length_penalty = model.config.length_penalty)

generated_texts = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)

end_time = time.time()
inference_time = end_time - start_time

print(f"Inference Time: {inference_time:.2f} seconds")
print("Generated Texts:", generated_texts)

Inference Time: 2.88 seconds
Generated Texts: ['Polda Riau bertekad memberantas pembalakan kayu yang kerap terjadi di Riau. Pelaku tindak kriminal akan ditindak tegas.']


In [30]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load('rouge')

In [31]:
actual_answer = [eval_df['clean_summary'][0]]
actual_answer

['Kapolda Riau baru Brigjen Pol. Johny Yodjana bertekad memberantas pelaku penyelundupan kayu di Riau. Ia berjanji akan menindak tegas pelaku tanpa pandang bulu .']

In [32]:
generated_texts

['Polda Riau bertekad memberantas pembalakan kayu yang kerap terjadi di Riau. Pelaku tindak kriminal akan ditindak tegas.']

In [33]:
# Compute ROUGE scores
rouge_scores = rouge.compute(predictions=generated_texts, references=actual_answer)

# Print ROUGE scores
print("Inference ROUGE scores:", rouge_scores)

Inference ROUGE scores: {'rouge1': 0.45, 'rouge2': 0.10526315789473685, 'rougeL': 0.39999999999999997, 'rougeLsum': 0.39999999999999997}


In [34]:
rouge_scores

{'rouge1': 0.45,
 'rouge2': 0.10526315789473685,
 'rougeL': 0.39999999999999997,
 'rougeLsum': 0.39999999999999997}

# 2

In [29]:
input_texts = test_df['clean_article'][0]

In [30]:
# Inference mode (text generation)
import time

start_time = time.time()

inputs = encoder_tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

with torch.no_grad():
  outputs = model.generate(input_ids, attention_mask=attention_mask,
                           max_length=128,
                           num_beams = model.config.num_beams,
                           early_stopping = model.config.early_stopping,
                           length_penalty = model.config.length_penalty)

generated_texts = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)

end_time = time.time()
inference_time = end_time - start_time

print(f"Inference Time: {inference_time:.2f} seconds")
print("Generated Texts:", generated_texts)

Inference Time: 3.10 seconds
Generated Texts: ['Pemerintah masih memberikan waktu dua minggu lagi kepada konglomerat yang telah menandatangani perjanjian pengembalian BLBI. Jika lewat dari tenggat waktu tersebut, pemerintah akan menerapkan tindakan hukum.']


In [31]:
actual_answer = [test_df['clean_summary'][0]]
actual_answer

['Pemerintah memberikan tenggat 14 hari kepada para konglomerat penandatangan MSAA untuk menyerahkan aset. Jika mangkir, mereka bakal dihukum .']

In [32]:
generated_texts

['Pemerintah masih memberikan waktu dua minggu lagi kepada konglomerat yang telah menandatangani perjanjian pengembalian BLBI. Jika lewat dari tenggat waktu tersebut, pemerintah akan menerapkan tindakan hukum.']

In [33]:
# Compute ROUGE scores
rouge_scores = rouge.compute(predictions=generated_texts, references=actual_answer)

# Print ROUGE scores
print("Inference ROUGE scores:", rouge_scores)

Inference ROUGE scores: {'rouge1': 0.27272727272727276, 'rouge2': 0.0, 'rougeL': 0.2272727272727273, 'rougeLsum': 0.2272727272727273}


In [34]:
rouge_scores

{'rouge1': 0.27272727272727276,
 'rouge2': 0.0,
 'rougeL': 0.2272727272727273,
 'rougeLsum': 0.2272727272727273}

# 3

In [35]:
input_texts = eval_df['clean_article'][1]

In [36]:
# Inference mode (text generation)
import time

start_time = time.time()

inputs = encoder_tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

with torch.no_grad():
  outputs = model.generate(input_ids, attention_mask=attention_mask,
                           max_length=128,
                           num_beams = model.config.num_beams,
                           early_stopping = model.config.early_stopping,
                           length_penalty = model.config.length_penalty)

generated_texts = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)

end_time = time.time()
inference_time = end_time - start_time

print(f"Inference Time: {inference_time:.2f} seconds")
print("Generated Texts:", generated_texts)

Inference Time: 2.84 seconds
Generated Texts: ['Bank Indonesia dinilai masih akan menghadapi situasi sulit meski Bank Sentral Amerika Serikat terus menurunkan tingkat suku bunga.']


In [37]:
actual_answer = [eval_df['clean_summary'][1]]
actual_answer

['Kendati Bank Sentral AS menurunkan suku bunganya, namun BI dinilai masih akan menemui masa sulit. Suku bunga Bank Sentral AS akan diturunkan menjadi empat persen .']

In [38]:
generated_texts

['Bank Indonesia dinilai masih akan menghadapi situasi sulit meski Bank Sentral Amerika Serikat terus menurunkan tingkat suku bunga.']

In [39]:
# Compute ROUGE scores
rouge_scores = rouge.compute(predictions=generated_texts, references=actual_answer)

# Print ROUGE scores
print("Inference ROUGE scores:", rouge_scores)

Inference ROUGE scores: {'rouge1': 0.46511627906976744, 'rouge2': 0.19512195121951217, 'rougeL': 0.32558139534883723, 'rougeLsum': 0.32558139534883723}


In [40]:
rouge_scores

{'rouge1': 0.46511627906976744,
 'rouge2': 0.19512195121951217,
 'rougeL': 0.32558139534883723,
 'rougeLsum': 0.32558139534883723}

# 4

In [41]:
input_texts = test_df['clean_article'][1]

In [42]:
# Inference mode (text generation)
import time

start_time = time.time()

inputs = encoder_tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

with torch.no_grad():
  outputs = model.generate(input_ids, attention_mask=attention_mask,
                           max_length=128,
                           num_beams = model.config.num_beams,
                           early_stopping = model.config.early_stopping,
                           length_penalty = model.config.length_penalty)

generated_texts = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)

end_time = time.time()
inference_time = end_time - start_time

print(f"Inference Time: {inference_time:.2f} seconds")
print("Generated Texts:", generated_texts)

Inference Time: 3.02 seconds
Generated Texts: ['Ketua DPR Akbar Tandjung dan Ketua MPR Amien Rais mengecam tindakan Israel terhadap Palestina. Ketua DPR Amien meminta Presiden tak membuka hubungan dagang dengan Israel.']


In [43]:
actual_answer = [test_df['clean_summary'][1]]
actual_answer

['MPR dan DPR mengutuk tindakan kekerasan tentara Israel terhadap warga Palestina. Hal itu akan dicetuskan dalam sidang Antarparlemen di Jakarta .']

In [44]:
generated_texts

['Ketua DPR Akbar Tandjung dan Ketua MPR Amien Rais mengecam tindakan Israel terhadap Palestina. Ketua DPR Amien meminta Presiden tak membuka hubungan dagang dengan Israel.']

In [45]:
# Compute ROUGE scores
rouge_scores = rouge.compute(predictions=generated_texts, references=actual_answer)

# Print ROUGE scores
print("Inference ROUGE scores:", rouge_scores)

Inference ROUGE scores: {'rouge1': 0.3111111111111111, 'rouge2': 0.04651162790697675, 'rougeL': 0.22222222222222224, 'rougeLsum': 0.22222222222222224}


In [46]:
rouge_scores

{'rouge1': 0.3111111111111111,
 'rouge2': 0.04651162790697675,
 'rougeL': 0.22222222222222224,
 'rougeLsum': 0.22222222222222224}

# 5

In [47]:
input_texts = eval_df['clean_article'][2]

In [48]:
# Inference mode (text generation)
import time

start_time = time.time()

inputs = encoder_tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

with torch.no_grad():
  outputs = model.generate(input_ids, attention_mask=attention_mask,
                           max_length=128,
                           num_beams = model.config.num_beams,
                           early_stopping = model.config.early_stopping,
                           length_penalty = model.config.length_penalty)

generated_texts = decoder_tokenizer.batch_decode(outputs, skip_special_tokens=True)

end_time = time.time()
inference_time = end_time - start_time

print(f"Inference Time: {inference_time:.2f} seconds")
print("Generated Texts:", generated_texts)

Inference Time: 3.69 seconds
Generated Texts: ['Pemerintah berniat beralih ke sektor perikanan budidaya yang lebih menguntungkan. Saat ini, potensi perikanan tangkap di Indonesia hanya sekitar enam juta ton per tahun.']


In [49]:
actual_answer = [eval_df['clean_summary'][2]]
actual_answer

['Pemerintah bermaksud akan lebih mengandalkan sektor perikanan budidaya untuk meningkatkan pendapatan negara. Pasalnya, sektor perikanan tangkap yang selama ini dijadikan andalan sudah tidak optimal lagi .']

In [50]:
generated_texts

['Pemerintah berniat beralih ke sektor perikanan budidaya yang lebih menguntungkan. Saat ini, potensi perikanan tangkap di Indonesia hanya sekitar enam juta ton per tahun.']

In [51]:
# Compute ROUGE scores
rouge_scores = rouge.compute(predictions=generated_texts, references=actual_answer)

# Print ROUGE scores
print("Inference ROUGE scores:", rouge_scores)

Inference ROUGE scores: {'rouge1': 0.3673469387755102, 'rouge2': 0.1276595744680851, 'rougeL': 0.24489795918367346, 'rougeLsum': 0.24489795918367346}


In [52]:
rouge_scores

{'rouge1': 0.3673469387755102,
 'rouge2': 0.1276595744680851,
 'rougeL': 0.24489795918367346,
 'rougeLsum': 0.24489795918367346}