In [None]:
!nvidia-smi

Fri Jul  5 11:12:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
| 30%   45C    P8               7W / 370W |     35MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import BertTokenizer, BertModel, EncoderDecoderModel, TrainingArguments, Trainer, DataCollatorForSeq2Seq

2024-07-05 11:12:58.376015: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-05 11:12:58.405519: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import glob
import json
import re

train_file = glob.glob("liputan6_data/canonical/train/*.json")
train_file.sort(key=lambda f: int(re.sub('\D', '', f)))

eval_file = glob.glob("liputan6_data/canonical/test/*.json")
eval_file.sort(key=lambda f: int(re.sub('\D', '', f)))

train_data = []
eval_data = []

for i in train_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    train_data.append(d)

for i in eval_file:
  with open(i, "r", encoding="utf-8") as f:
    d = json.load(f)
    eval_data.append(d)

print(f"train data: {len(train_data)}")
print(f"eval data: {len(eval_data)}")

train data: 193883
eval data: 10972


In [None]:
train_data[0].keys()

dict_keys(['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'])

In [None]:
train_data = train_data[:50000]
eval_data = eval_data[:5000]

In [None]:
import numpy as np
import pandas as pd

def custom_join(words):
  result = ' '.join(words)
  result = result.replace("Liputan6 . com", "Liputan6.com")
  result = result.replace(" , ", ", ")
  result = result.replace(" . ", ". ")
  result = result.replace(" ( ", " (")
  result = result.replace(" ) ", ") ")
  return result


def make_dataset_df(data):
  clean_article = []
  clean_summary = []

  for item in data:
    clean_article_sentence = []
    for arr in item['clean_article']:
      clean_article_sentence.extend(arr)
    joined_str1 = custom_join(clean_article_sentence)
    clean_article.append(joined_str1)

    clean_summary_sentence = []
    for arr in item['clean_summary']:
      clean_summary_sentence.extend(arr)
    joined_str2 = custom_join(clean_summary_sentence)
    clean_summary.append(joined_str2)

  df = pd.DataFrame({'clean_article': clean_article, 'clean_summary': clean_summary})
  return df

In [None]:
train_df = make_dataset_df(train_data)
eval_df = make_dataset_df(eval_data)

In [None]:
train_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Ambon : Partai Bulan Bintang wil...",Konflik Ambon telah berlangsung selama tiga ta...
1,"Liputan6.com, Denpasar : Berbeda dengan sebagi...",Masyarakat Bali merayakan Tahun Baru dengan tr...
2,"Liputan6.com, Jakarta : Partai Keadilan bertek...",Partai Keadilan menargetkan tambahan sejuta pe...
3,"Liputan6.com, Jakarta : Sekitar Rumah Makan Ay...",Pascaledakan granat di depan Rumah Makan Ayam ...
4,"Liputan6.com, Jambi : Ratusan hektare sawah di...","Bencana Banjir di Jambi, juga mengakibatkan ra..."
...,...,...
49995,"Liputan6.com, Semaranfa : Pelantikan Wali Kota...",Pelantikan Wali Kota Semarang di Balai Kota Se...
49996,"Liputan6.com, Serang : Sekelompok orang yang m...",Pengunjuk rasa yang tergabung dalam Gerakan Ma...
49997,"Liputan6.com, Jakarta : Lebih dari seribu eks ...",Ribuan bekas karyawan PT DI berunjuk rasa di d...
49998,"Liputan6.com, Purwakarta : Kelangkaan minyak t...","Warga Purwakarta, Jawa Barat, sudah satu bulan..."


In [None]:
eval_df

Unnamed: 0,clean_article,clean_summary
0,"Liputan6.com, Jakarta : Kepolisian Daerah Riau...",Kapolda Riau baru Brigjen Pol. Johny Yodjana b...
1,"Liputan6.com, Jakarta : Bank Indonesia dinilai...",Kendati Bank Sentral AS menurunkan suku bungan...
2,"Liputan6.com, Jakarta : Berbagai kendala mengh...",Pemerintah bermaksud akan lebih mengandalkan s...
3,"Liputan6.com, Jakarta : Penghapusan beberapa p...","Revisi Kepmennaker Nomor 78 Tahun 2001, dinila..."
4,"Liputan6.com, Jakarta : Operasi Sadar Jaya yan...",Polisi menangkap 32 pengunjung Diskotik Mileni...
...,...,...
4995,"Liputan6.com, Jakarta : Pemerintah tak akan me...",Pemerintah melarang Kapal Tampa yang berisi ra...
4996,"Liputan6.com, Jambi : Ini potret perpecahan di...",Ketua DPC KNPI Jambi Rudi Ardiansyah yang memi...
4997,"Liputan6.com, Padang : Pelantikan lima penguru...",Pelantikan lima pengurus Dewan Pimpinan Daerah...
4998,"Liputan6.com, Semarang : Solar mengalami kelan...",Bahan bakar minyak jenis solar di jalur Pantai...


In [None]:
train_df['clean_article'][0]

'Liputan6.com, Ambon : Partai Bulan Bintang wilayah Maluku bertekad membantu pemerintah menyelesaikan konflik di provinsi tersebut. Syaratnya, penanganan penyelesaian konflik Maluku harus dimulai dari awal kerusuhan, yakni 19 Januari 1999. Demikian hasil Musyawarah Wilayah I PBB Maluku yang dimulai Sabtu pekan silam dan berakhir Senin (31/12) di Ambon. Menurut seorang fungsionaris PBB Ridwan Hasan, persoalan di Maluku bisa selesai asalkan pemerintah dan aparat keamanan serius menangani setiap persoalan di Maluku secara komprehensif dan bijaksana. Itulah sebabnya, PBB wilayah Maluku akan menjadikan penyelesaian konflik sebagai agenda utama partai. PBB Maluku juga akan mendukung penegakan hukum secara terpadu dan tanpa pandang bulu. Siapa saja yang melanggar hukum harus ditindak. Ridwan berharap, Ketua PBB Maluku yang baru, Ali Fauzi, dapat menindak lanjuti agenda politik partai yang telah diamanatkan dan mau mendukung penegakan hukum di Maluku. (ULF/Sahlan Heluth) .'

In [None]:
train_df['clean_summary'][0]

'Konflik Ambon telah berlangsung selama tiga tahun. Partai Bulan Bintang wilayah Maluku siap membantu pemerintah menyelesaikan kasus di provinsi tersebut .'

In [None]:
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

In [None]:
train_dataset

Dataset({
    features: ['clean_article', 'clean_summary'],
    num_rows: 50000
})

In [None]:
eval_dataset

Dataset({
    features: ['clean_article', 'clean_summary'],
    num_rows: 5000
})

In [None]:
tokenizer = BertTokenizer.from_pretrained('cahya/bert-base-indonesian-1.5G')

In [None]:
def tokenize_data(example):
    input_encoding = tokenizer(example['clean_article'], padding='max_length', truncation=True, max_length=512)
    target_encoding = tokenizer(example['clean_summary'], padding='max_length', truncation=True, max_length=128)
    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': target_encoding['input_ids']
    }

In [None]:
tokenized_train = train_dataset.map(tokenize_data, batched=True, num_proc=4)
tokenized_eval = eval_dataset.map(tokenize_data, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
tokenizer.pad_token

'[PAD]'

In [None]:
tokenizer.cls_token

'[CLS]'

In [None]:
# make Bert2Bert (Encoder-Decoder model)
model = EncoderDecoderModel.from_encoder_decoder_pretrained('cahya/bert-base-indonesian-1.5G', 'cahya/bert-base-indonesian-1.5G')

Some weights of BertLMHeadModel were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.

In [None]:
# Define special tokens
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

In [None]:
model.config.encoder.vocab_size

32000

In [None]:
# Set configurations for the encoder and decoder
model.config.encoder.max_length = 512
model.config.decoder.max_length = 128
model.config.decoder.min_length = 12
model.config.length_penalty = 2.0
model.config.early_stopping = True

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-05,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='epoch',
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

[2024-07-05 11:13:59,215] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/home/willy030125/Downloads/BertSummarization' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/willy030125/huggingface/2f29c273ec4f433c8d57a698f2708043

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: willy030125. Use `wandb login --relogin` to force relogin




Epoch,Training Loss,Validation Loss
0,1.0477,0.851269
1,0.8353,0.730656
2,0.6533,0.606682
4,0.4597,0.550588
5,0.4146,0.549792
6,0.379,0.546868
8,0.3322,0.554853
9,0.3184,0.557019


Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summar

TrainOutput(global_step=3900, training_loss=0.5953431080549191, metrics={'train_runtime': 15236.0888, 'train_samples_per_second': 32.817, 'train_steps_per_second': 0.256, 'total_flos': 3.061502154909942e+17, 'train_loss': 0.5953431080549191, 'epoch': 9.980806142034549})

In [None]:
results = trainer.evaluate(tokenized_eval)



In [None]:
results

{'eval_loss': 0.5570188760757446,
 'eval_runtime': 52.0718,
 'eval_samples_per_second': 96.021,
 'eval_steps_per_second': 12.003,
 'epoch': 9.980806142034549}

In [None]:
import math

metrics = trainer.evaluate(tokenized_eval)

try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")

metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

print(perplexity)

In [None]:
trainer.save_model('./Bert2Bert_trained/')

Non-default generation parameters: {'early_stopping': True, 'length_penalty': 2.0}
