In [1]:
pip install torch torchvision



In [2]:
import torch

In [3]:
pip install transformers



In [4]:
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 509, done.[K
remote: Counting objects: 100% (193/193), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 509 (delta 119), reused 139 (delta 110), pack-reused 316 (from 1)[K
Receiving objects: 100% (509/509), 9.46 MiB | 14.18 MiB/s, done.
Resolving deltas: 100% (239/239), done.


In [5]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [6]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices) with the same value.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; the call is silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()``, whose items provide
            ``numel()`` and ``requires_grad``.
        trainable: When true, count only parameters whose ``requires_grad``
            flag is set; otherwise count all of them.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        # Keep only the parameters that take part in gradient updates.
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing an iterable ``param_groups`` of mappings
            that contain an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are no
        parameter groups.
    """
    no_group = object()
    first_group = next(iter(optimizer.param_groups), no_group)
    return None if first_group is no_group else first_group['lr']

def metrics_to_string(metric_dict):
    """Format a metric mapping as a single space-separated string.

    Each entry is rendered as ``name:value`` with the value shown to two
    decimal places, in the mapping's iteration order.

    Args:
        metric_dict: Mapping from metric name to numeric value.

    Returns:
        The formatted string; empty when the mapping is empty.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [7]:
# Fix the seed of all random number generators so runs are repeatable.
SEED = 26092020
set_seed(SEED)

In [8]:
# Checkpoint identifier shared by the tokenizer, the config and the model weights.
pretrained_name = 'indobenchmark/indobert-base-p1'

# Tokenizer and configuration come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
# Use the dataset's label count as the number of output classes.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the sequence classifier from the checkpoint with the adjusted config.
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Show the model's repr (its module tree) as the cell output.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Total parameter count of the model (trainable=False, so every parameter is counted).
count_param(model)

124443651

In [11]:
# Directory that holds the three TSV splits. The default is the '/content'
# location this notebook was run with; change this one value to load the data
# from somewhere else.
DATA_DIR = '/content'

train_dataset_path = f'{DATA_DIR}/train.tsv'
valid_dataset_path = f'{DATA_DIR}/val.tsv'
test_dataset_path = f'{DATA_DIR}/test.tsv'

In [12]:
# One dataset per split, all built with the same tokenizer and lowercasing.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Loader settings shared by every split; only the training split is shuffled.
loader_kwargs = dict(max_seq_len=512, batch_size=32, num_workers=16)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)



In [13]:
# Label lookups defined on the dataset class: label name -> index and index -> label name.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


In [14]:
# Single-sentence prediction.
text = 'bacot'

# Encode the text and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Make the prediction independent of kernel state left by other cells:
# eval mode for the model, and no autograd graph for this forward pass.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: bacot | Label : negative (35.408%)


In [15]:
# Single-sentence prediction.
text = 'Budi pergi ke pondok indah mall membeli cakwe'

# Encode the text and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Make the prediction independent of kernel state left by other cells:
# eval mode for the model, and no autograd graph for this forward pass.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : neutral (35.103%)


In [16]:
# Move the model to the GPU first, then build the optimizer from the
# parameters of the model that will actually be trained.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [17]:
# Train: for each epoch, run one pass over the training loader with weight
# updates, then one pass over the validation loader without them.
n_epochs = 5
for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()
    # Other cells may have switched autograd off; backprop needs it on.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of the batch is not passed on)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Collect predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # Autograd is disabled only inside this block, so it is not left switched
    # off for whatever runs after this cell.
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            valid_loss = loss.item()
            total_loss = total_loss + valid_loss

            # Metrics are recomputed over everything seen so far to keep the
            # progress bar description up to date.
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.9597 LR:0.00000300: 100%|██████████| 92/92 [00:24<00:00,  3.82it/s]


(Epoch 1) TRAIN LOSS:0.9597 ACC:0.53 F1:0.48 REC:0.48 PRE:0.55 LR:0.00000300


VALID LOSS:0.7767 ACC:0.68 F1:0.65 REC:0.65 PRE:0.66: 100%|██████████| 20/20 [00:02<00:00,  7.64it/s]


(Epoch 1) VALID LOSS:0.7767 ACC:0.68 F1:0.65 REC:0.65 PRE:0.66


(Epoch 2) TRAIN LOSS:0.7241 LR:0.00000300: 100%|██████████| 92/92 [00:23<00:00,  4.00it/s]


(Epoch 2) TRAIN LOSS:0.7241 ACC:0.69 F1:0.68 REC:0.68 PRE:0.69 LR:0.00000300


VALID LOSS:0.6616 ACC:0.72 F1:0.70 REC:0.71 PRE:0.70: 100%|██████████| 20/20 [00:02<00:00,  6.67it/s]


(Epoch 2) VALID LOSS:0.6616 ACC:0.72 F1:0.70 REC:0.71 PRE:0.70


(Epoch 3) TRAIN LOSS:0.5917 LR:0.00000300: 100%|██████████| 92/92 [00:23<00:00,  3.86it/s]


(Epoch 3) TRAIN LOSS:0.5917 ACC:0.76 F1:0.75 REC:0.75 PRE:0.75 LR:0.00000300


VALID LOSS:0.6191 ACC:0.73 F1:0.72 REC:0.72 PRE:0.72: 100%|██████████| 20/20 [00:02<00:00,  6.93it/s]


(Epoch 3) VALID LOSS:0.6191 ACC:0.73 F1:0.72 REC:0.72 PRE:0.72


(Epoch 4) TRAIN LOSS:0.5110 LR:0.00000300: 100%|██████████| 92/92 [00:23<00:00,  3.86it/s]


(Epoch 4) TRAIN LOSS:0.5110 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79 LR:0.00000300


VALID LOSS:0.6200 ACC:0.76 F1:0.74 REC:0.74 PRE:0.74: 100%|██████████| 20/20 [00:02<00:00,  7.04it/s]


(Epoch 4) VALID LOSS:0.6200 ACC:0.76 F1:0.74 REC:0.74 PRE:0.74


(Epoch 5) TRAIN LOSS:0.4353 LR:0.00000300: 100%|██████████| 92/92 [00:25<00:00,  3.63it/s]


(Epoch 5) TRAIN LOSS:0.4353 ACC:0.83 F1:0.83 REC:0.83 PRE:0.83 LR:0.00000300


VALID LOSS:0.6413 ACC:0.75 F1:0.73 REC:0.72 PRE:0.74: 100%|██████████| 20/20 [00:02<00:00,  6.68it/s]

(Epoch 5) VALID LOSS:0.6413 ACC:0.75 F1:0.73 REC:0.72 PRE:0.74





In [18]:
# Evaluate on test: collect the predicted label of every test example.
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# Inference only; autograd is disabled just for this loop.
with torch.no_grad():
    for batch_data in pbar:
        # Only the predictions are kept; the loss and the labels returned by
        # the forward function are discarded.
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction: reset_index() turns the row number into an 'index' column,
# so the file has the columns 'index' and 'label' (written in CSV format).
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 20/20 [00:03<00:00,  5.75it/s]

     index     label
0        0   neutral
1        1  positive
2        2   neutral
3        3   neutral
4        4  positive
..     ...       ...
621    621   neutral
622    622  positive
623    623  negative
624    624   neutral
625    625   neutral

[626 rows x 2 columns]





In [22]:
# Single-sentence prediction with the fine-tuned model.
text = 'rafael mainnya jelek sekali'

# Encode the text and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Make the prediction independent of kernel state left by other cells:
# eval mode for the model, and no autograd graph for this forward pass.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: rafael mainnya jelek sekali | Label : negative (97.710%)
