In [19]:
import pandas as pd
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
import pickle
import os
from transformers import AutoTokenizer
import numpy as np


In [31]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
class KPKDataBert(Dataset):

  def __init__(self, train_df, val_df):
    self.status_dict = {'no': 0, 'yes': 1}

    self.train_df = train_df
    self.val_df = val_df

    self.base_path = 'DATA/'
    self.tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2', do_lower_case=True) # Using a pre-trained IndoBERT tokenizer to encode sentences
    self.train_data = None
    self.val_data = None
    self.init_data()

  def init_data(self):
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)

  def load_data(self, df):
    MAX_LEN = 512
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    instansi_list = df['instansi'].to_list()
    reference_list = df['reference'].to_list()
    status_list = df['status'].to_list()

    for (instansi, reference, status) in zip(instansi_list, reference_list, status_list):
      instansi_id = self.tokenizer.encode(instansi, add_special_tokens = False)
      reference_id = self.tokenizer.encode(reference, add_special_tokens = False)
      pair_token_ids = [self.tokenizer.cls_token_id] + instansi_id + [self.tokenizer.sep_token_id] + reference_id + [self.tokenizer.sep_token_id]
      instansi_len = len(instansi_id)
      reference_len = len(reference_id)

      segment_ids = torch.tensor([0] * (instansi_len + 2) + [1] * (reference_len + 1))  # sentence 0 and sentence 1
      attention_mask_ids = torch.tensor([1] * (instansi_len + reference_len + 3))  # mask padded values

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.status_dict[status])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return train_loader, val_loader


train_df = pd.read_csv('DATA/v1/train.csv')
train_df['status'] = train_df['status'].apply(lambda x : str(x).strip())

val_df = pd.read_csv('DATA/v1/val.csv')

KPK_dataset = KPKDataBert(train_df, val_df)

1695
199


In [7]:
train_loader, val_loader = KPK_dataset.get_data_loaders(batch_size=16)

In [8]:
# from transformers import AdamW
# model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2")
# model.to(device)

from transformers import BertForSequenceClassification, AdamW
model = BertForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [10]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)



In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 124,442,882 trainable parameters


In [32]:
def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

In [41]:
#Train
best_model_path = "Best_Model/indoBERT.pt"

from torchmetrics import F1Score
import time

EPOCHS = 10

def train(model, train_loader, val_loader, optimizer):  
  total_step = len(train_loader)
  
  best_F1_scores = 0
  for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0


    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
      # optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()

      acc = multi_acc(prediction, labels)
      loss.backward()
      optimizer.step()
      
      total_train_loss += loss.item()
      total_train_acc  += acc.item()

    train_acc  = total_train_acc/len(train_loader)
    train_loss = total_train_loss/len(train_loader)


    model.eval()
    total_val_acc  = 0
    total_val_loss = 0
    
    with torch.no_grad():
      for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(val_loader):
        optimizer.zero_grad()
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)
        
        loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()

        total_val_loss += loss.item()
        total_val_acc  += acc.item()

    val_acc  = total_val_acc/len(val_loader)
    val_loss = total_val_loss/len(val_loader)


    F1 = F1Score(num_classes=2).to(device)
    f1 = F1(prediction, labels)

    if f1 > best_F1_scores:
      torch.save(model, best_model_path)

    best_F1_scores = f1

    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f} | F1 score : {f1}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [42]:
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.7444 train_acc: 0.5392 | val_loss: 0.7378 val_acc: 0.4667 | F1 score : 0.4285714328289032
00:00:10.05
Epoch 2: train_loss: 0.8458 train_acc: 0.5357 | val_loss: 0.7032 val_acc: 0.6000 | F1 score : 1.0
00:00:09.62
Epoch 3: train_loss: 0.7209 train_acc: 0.5028 | val_loss: 0.6927 val_acc: 0.6667 | F1 score : 0.5714285969734192
00:00:08.89
Epoch 4: train_loss: 0.8241 train_acc: 0.4980 | val_loss: 0.7913 val_acc: 0.5333 | F1 score : 0.5714285969734192
00:00:08.89
Epoch 5: train_loss: 0.7311 train_acc: 0.5594 | val_loss: 0.6966 val_acc: 0.6667 | F1 score : 0.8571428656578064
00:00:09.60
Epoch 6: train_loss: 0.7694 train_acc: 0.5346 | val_loss: 0.7063 val_acc: 0.6000 | F1 score : 0.714285671710968
00:00:08.98
Epoch 7: train_loss: 0.7100 train_acc: 0.5263 | val_loss: 0.6790 val_acc: 0.5333 | F1 score : 0.4285714328289032
00:00:09.01
Epoch 8: train_loss: 0.6874 train_acc: 0.5616 | val_loss: 0.6882 val_acc: 0.4667 | F1 score : 0.714285671710968
00:00:09.61
Epoch 9: train_lo

In [43]:
######################### LOAD MODEL
model_new = torch.load('Best_Model\indoBERT.pt')