# **Package Preparation**

In [None]:
pip install torch torchvision



In [None]:
pip install transformers



In [None]:
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 500, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 500 (delta 115), reused 139 (delta 110), pack-reused 316[K
Receiving objects: 100% (500/500), 9.45 MiB | 6.14 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [None]:
# Import all packages
import torch
import random
import numpy as np
import pandas as pd
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
#common functions

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same seed is applied, in order, to Python's `random` module, NumPy's
    global generator, PyTorch's default generator and PyTorch's CUDA generator.

    Args:
        seed: Integer seed value.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

def count_param(module, trainable=False):
    """Count the scalar parameters held by a module.

    Args:
        module: Object exposing `parameters()` (e.g. a `torch.nn.Module`).
        trainable: When True, only parameters with `requires_grad` set are counted;
            otherwise every parameter is counted.

    Returns:
        The total number of elements across the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing `param_groups`, an iterable of dicts with an 'lr' key.

    Returns:
        The 'lr' value of the first parameter group, or None when there are no groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Render a metrics dictionary as a single space-separated line.

    Args:
        metric_dict: Mapping from metric name to numeric value.

    Returns:
        A string of `name:value` pairs (values with two decimals) in the dictionary's
        iteration order, joined by single spaces; empty string for an empty mapping.
    """
    return ' '.join('{}:{:.2f}'.format(name, score) for name, score in metric_dict.items())

In [None]:
# Fix the seed of every RNG up front so that runs are repeatable
SEED = 19072021
set_seed(SEED)

# **Load Pre-trained Model**

- Menggunakan pre-trained model IndoBERT-base-p1 yang memiliki 124.5 juta parameter
- Model IndoBERT dibangun berdasarkan arsitektur general-purpose BERT (Bidirectional Encoder Representations from Transformers)
- BERT didesain untuk membantu komputer memahami arti bahasa yang ambigu dalam teks, yaitu dengan menggunakan teks di sekitarnya untuk membangun konteks.

In [None]:
# Checkpoint identifier shared by the tokenizer, the config and the model weights
pretrained_name = 'indobenchmark/indobert-base-p1'

# Tokenizer and config; the config's label count is taken from the dataset class
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the sequence-classification model from the checkpoint using the adjusted config
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the model: as the last expression of the cell, its repr (the layer structure) is shown
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Total number of parameters in the model (count_param defaults to counting every
# parameter, not only those with requires_grad set)
count_param(model)

124443651

# **Dataset Preparation**

In [None]:
# Directory of the SmSA document-sentiment dataset inside the cloned IndoNLU repository
DATASET_DIR = '/content/indonlu/dataset/smsa_doc-sentiment-prosa'

# One TSV file per split. The validation split must use its own file: pointing it at the
# training file would make validation metrics measure fit on the training data.
train_dataset_path = f'{DATASET_DIR}/train_preprocess.tsv'
valid_dataset_path = f'{DATASET_DIR}/valid_preprocess.tsv'
test_dataset_path = f'{DATASET_DIR}/test_preprocess_masked_label.tsv'

In [None]:
class DocumentSentimentDataset(Dataset):
  """Sentiment dataset backed by a headerless, tab-separated file of (text, label) rows.

  Each item is a tuple `(subword_ids, label_index, raw_text)` where the first two
  entries are NumPy arrays and the last is the untouched text of the row.
  """

  # Class-level constants: label string -> index, index -> label string, number of labels
  LABEL2INDEX = {'positive':0, 'neutral':1, 'negative':2}
  INDEX2LABEL = {0:'positive', 1:'neutral', 2:'negative'}
  NUM_LABELS = 3

  def __init__(self, dataset_path, tokenizer, *args, **kwargs):
    """Store the tokenizer and load the TSV file at `dataset_path`.

    Extra positional and keyword arguments are accepted but not used.
    """
    # Subword tokenizer; only its `encode(text)` method is called by this class
    self.tokenizer = tokenizer
    self.data = self.load_dataset(dataset_path)

  def load_dataset(self, path):
    """Read the TSV file into a DataFrame with 'text' and integer 'sentiment' columns.

    A label string that is missing from LABEL2INDEX raises KeyError.
    """
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = ['text','sentiment']
    # Replace every label string with its class index
    frame['sentiment'] = frame['sentiment'].apply(self.LABEL2INDEX.__getitem__)
    return frame

  def __len__(self):
    """Number of rows in the loaded table."""
    return self.data.shape[0]

  def __getitem__(self, index):
    """Return `(subword_ids, label_index, raw_text)` for the row labelled `index`."""
    row = self.data.loc[index,:]
    raw_text = row['text']
    label_index = row['sentiment']
    token_ids = self.tokenizer.encode(raw_text)
    return np.array(token_ids), np.array(label_index), raw_text

class DocumentSentimentDataLoader(DataLoader):
  """DataLoader that pads variable-length subword sequences into dense NumPy batches.

  Args:
    max_seq_len: Upper bound on the padded sequence length; longer samples are truncated.
    *args, **kwargs: Forwarded unchanged to `torch.utils.data.DataLoader`.
  """

  def __init__(self, max_seq_len=512, *args, **kwargs):
    super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
    self.max_seq_len = max_seq_len # Cap on the number of subwords per sample
    self.collate_fn = self._collate_fn # Use the padding collate function defined below

  def _collate_fn(self, batch):
    """Pad a list of `(subwords, sentiment, raw_text)` samples into batch arrays.

    Returns:
      A tuple `(subword_batch, mask_batch, sentiment_batch)`:
        - subword_batch: int64 array (batch_size, seq_len), zero-padded subword ids
        - mask_batch: float32 array (batch_size, seq_len), 1 on real tokens, 0 on padding
        - sentiment_batch: int64 array (batch_size, 1) of label indices
      where seq_len is the longest sample in the batch, capped at `self.max_seq_len`.
    """
    batch_size = len(batch)
    longest = max(len(sample[0]) for sample in batch) # Longest subword sequence in this batch
    max_seq_len = min(self.max_seq_len, longest) # Never exceed the configured cap

    # Zero-initialised buffers; the zeros act as padding for short samples
    subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
    mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
    sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)

    # Fill one row per sample
    for i, (subwords, sentiment, _raw_seq) in enumerate(batch):
      subwords = subwords[:max_seq_len]
      subword_batch[i,:len(subwords)] = subwords
      mask_batch[i,:len(subwords)] = 1
      sentiment_batch[i,0] = sentiment

    # Return only after the loop has processed EVERY sample; returning from inside the
    # loop would leave all rows but the first as zeros.
    return subword_batch, mask_batch, sentiment_batch

In [None]:
# Build one dataset per split from its TSV file, all sharing the same tokenizer.
# NOTE(review): `lowercase=True` is swallowed by the **kwargs of the
# DocumentSentimentDataset constructor defined in this notebook and is never used there;
# confirm whether lowercasing was actually intended.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

In [None]:
# Batching options common to all three splits; only the training split is shuffled
loader_options = dict(max_seq_len=512, batch_size=32, num_workers=16)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_options)



In [None]:
# Inspect the first training sample: a tuple of (subword id array, label index array, raw text)
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [None]:
# Short aliases for the dataset's label maps:
# w2i maps a label string to its index, i2w maps an index back to its label string
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
for label_map in (w2i, i2w):
    print(label_map)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


# **Model Testing with the e.g. Sentences**

In [None]:
# Sanity-check the model on a hand-written sentence
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
# Shape the ids as a batch of one sequence on the same device as the model
subwords = torch.LongTensor(subwords).view(1,-1).to(model.device)

# Inference only: disable autograd so no computation graph is built or kept in memory
with torch.no_grad():
    logits = model(subwords)[0]

# Class probabilities and the index of the highest-scoring class
probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (39.380%)


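Label yang diprediksi memang positif, tetapi keyakinan model masih rendah (sekitar 39%, nyaris setara tebakan acak untuk tiga kelas) karena lapisan klasifikasinya baru diinisialisasi dan belum dilatih. Padahal, kita dapat mengidentifikasi dengan mudah bahwa teks tersebut seharusnya masuk dalam kategori sentimen positif. Oleh karena itu, mari kita lakukan proses fine-tuning dan evaluasi.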

In [None]:
# Move the model to the GPU first, then build the optimizer over the moved parameters.
# `optim` comes from the import cell at the top of the notebook, so it is not re-imported here.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

NameError: ignored