# Testing Electra Model

Since we will use ELECTRA to embed document contexts, we first need to test the ELECTRA model. \
As Namuwiki is written in Korean, we will use KoELECTRA (<https://github.com/monologg/KoELECTRA>). \
This code was tested on Colab, so it might not work in other environments.



In [None]:
# Install the third-party packages used by this notebook
# (gluonnlp and transformers are imported in the "Load Libraries" cell below).
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers



In [None]:
# Download the train / test rating files.
# NOTE: the files are saved with the "?dl=1" query string in their names, which is
# why the data-reading cell opens "ratings_train.txt?dl=1" / "ratings_test.txt?dl=1".
!wget https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
!wget https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1

--2021-08-19 05:13:30--  https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6027:18::a27d:4812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/374ftkec978br3d/ratings_train.txt [following]
--2021-08-19 05:13:30--  https://www.dropbox.com/s/dl/374ftkec978br3d/ratings_train.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucacf19fa080ad8ef9424c92ff0b.dl.dropboxusercontent.com/cd/0/get/BUhjQyI6P3kkjOVWpb2j1L5iQdRUz4_CY9wkFhrXIcreR4qZ6oFj8cBnC59Lk4AUuCHni_r9cSg4nIlSKaYwTDdcP0NL1WQHhIJNzG4GTDeahZ7FcYfJescyP9BC29DZdeeBq8RuFbyl1UnVgFMSLeNS/file?dl=1# [following]
--2021-08-19 05:13:30--  https://ucacf19fa080ad8ef9424c92ff0b.dl.dropboxusercontent.com/cd/0/get/BUhjQyI6P3kkjOVWpb2j1L5iQdRUz4_CY9wkFhrXIcreR4qZ6oFj8cBnC59Lk4AUuCHni_

# Load Libraries

Using the `transformers` library, we can load the model easily.

In [None]:
import gluonnlp as nlp
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraModel, ElectraTokenizer, DistilBertModel

Load Pretrained KoElectra Model

In [None]:
# Use one checkpoint name for both the tokenizer and the encoder so the
# vocabulary always belongs to the weights that are actually loaded
# (previously the tokenizer came from the "base" checkpoint and the model from "small").
ELECTRA_CHECKPOINT = "monologg/koelectra-small-v3-discriminator"

tokenizer = ElectraTokenizer.from_pretrained(ELECTRA_CHECKPOINT)
electra = ElectraModel.from_pretrained(ELECTRA_CHECKPOINT)
# In this cell DistilKoBERT is only used for the parameter-count comparison below.
bert = DistilBertModel.from_pretrained('monologg/distilkobert')

# Number of trainable parameters of each model
print(sum(p.numel() for p in electra.parameters() if p.requires_grad))
print(sum(p.numel() for p in bert.parameters() if p.requires_grad))

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_proje

14056192
27803904


In [None]:
# Read data
# Both files are read the same way: keep fields 1 and 2 of every row and
# discard the first sample of the file.
tsv_options = dict(field_indices=[1,2], num_discard_samples=1)

dataset_train = nlp.data.TSVDataset("ratings_train.txt?dl=1", **tsv_options)
dataset_test = nlp.data.TSVDataset("ratings_test.txt?dl=1", **tsv_options)

# Show the first test sample as a quick sanity check
dataset_test[0]

['굳 ㅋ', '1']

# Make Dataset

To train the model, we have to process the original data with the tokenizer. \
For this purpose, we use the Dataset class in PyTorch.

In [None]:
class Rating(Dataset):
  """Dataset pairing tokenized comments with their labels.

  All comments are tokenized once, when the dataset is built, so that
  indexing only has to look up the stored result.
  """

  def __init__(self, data, tokenizer, max_len):
    """
    Parameters
    --------------

    data : sequence of sequences
      Each item holds the comment text at index 0 and the label at index 1
    tokenizer : callable
      Called as tokenizer(text, padding='max_length', max_length=max_len,
      return_tensors='pt', truncation=True)
    max_len : int
      Length passed to the tokenizer as max_length
    """
    encode_options = dict(padding='max_length', max_length=max_len,
                          return_tensors='pt', truncation=True)

    encoded = []
    for sample in data:
      encoded.append(tokenizer(sample[0], **encode_options))

    self.comments = encoded
    self.labels = [sample[1] for sample in data]

  def __getitem__(self, idx):
    """Return (tokenized comment, label) where the label is a 1-element integer tensor."""
    comment = self.comments[idx]
    label = torch.tensor([int(self.labels[idx])])
    return comment, label

  def __len__(self):
    """Number of samples in the dataset."""
    return len(self.labels)

In [None]:
# Train settings
epoch = 5              # number of passes over the training data
max_len = 64           # max_length handed to the tokenizer
learning_rate = 0.001  # learning rate handed to the optimizer
batch_size = 128       # batch size of the training DataLoader
log_interval = 500     # print training progress every `log_interval` batches
max_grad_norm = 0.5    # gradient-norm clipping threshold

# Fall back to the CPU when no GPU is available instead of failing on `.to(device)`
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Replace the raw TSV datasets with their tokenized Rating wrappers.
# NOTE: this rebinds dataset_train / dataset_test, so re-running this cell
# would pass the already-wrapped datasets back into Rating.
dataset_train = Rating(dataset_train, tokenizer, max_len)
dataset_test = Rating(dataset_test, tokenizer, max_len)

In [None]:
# Shuffle the training data so every epoch sees the samples in a different order;
# without it the batches are identical (and in file order) in every epoch.
# num_workers is lowered from 5 to 2 because the captured output of this cell
# shows PyTorch's "too many worker processes" warning on the tested machine.
trainLoader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=2)
# The test loader keeps its small batch size and fixed order.
testLoader = DataLoader(dataset_test, batch_size=4, num_workers=2)

  cpuset_checked))


In [None]:
# Peek at one test batch to check what the tokenizer produced
data_iter = iter(testLoader)

# Use the built-in next(): the iterator's own .next() method is not
# available in recent PyTorch releases.
inputs, labels = next(data_iter)

print(inputs['input_ids'])
print(inputs['attention_mask'])
# Keep only the token ids at positions the attention mask marks as non-padding
print(torch.masked_select(inputs['input_ids'], inputs['attention_mask'].bool()))

  cpuset_checked))


tensor([[[    2,  2104,   287,     3,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0]],

        [[    2,    43,  4090,  4253, 13927,  4105,  4091, 19802,  4130, 32150,
           4253,  4169,  4013, 13352,  4053,  4101,  4015,     3,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,

In [None]:
# To check that the tokenizer works correctly, convert the token ids of the
# first test sample back into tokens

tokenizer.convert_ids_to_tokens(dataset_test[0][0]['input_ids'][0])

['[CLS]',
 '굳',
 'ㅋ',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [None]:
# Test model: run the encoder on the first test sample

output = electra(input_ids=dataset_test[0][0]['input_ids'], attention_mask=dataset_test[0][0]['attention_mask'])

# Output of Electra is sequence_output, (hidden_states), (attentions)
# To get the embedding of the [CLS] token, take position 0 of the sequence
# output, i.e. output[0][:, 0]
output[0][:, 0]

tensor([[ 2.5897e-01,  2.1238e-01, -1.9421e-01, -1.7494e-02, -5.3885e-02,
         -6.5732e-01,  2.8845e-01, -7.6507e-01, -2.2635e-01, -5.7455e-01,
          2.6866e-01,  2.9759e-02, -2.9668e-01, -1.4651e-01, -1.2856e-01,
         -2.9700e-01, -4.0587e-01, -8.9296e-01,  4.5421e-01, -1.4678e-01,
          5.6161e+00,  2.3460e-01,  5.6136e-01, -2.9567e-01, -2.8435e-01,
          3.0903e-01, -6.6090e-02,  5.0799e-01,  3.1083e-01,  3.4927e-01,
         -1.6204e-01,  7.1948e-02,  2.1798e-01,  3.0923e-01,  1.2292e-01,
         -2.8467e-01,  2.2177e-01,  4.7371e-01,  6.1875e-02, -1.3758e-01,
          1.9191e-01,  6.1975e-02,  2.6209e-01,  3.6230e-01, -2.8800e-01,
          2.2081e-01, -2.9396e-01,  1.9164e-02,  4.0434e-01, -2.6254e-01,
         -7.9893e-02, -1.1736e-01, -2.5994e-01, -5.4133e-01, -2.2002e-02,
         -1.6750e-02,  5.9630e-01, -3.4271e-01,  8.9398e-04,  5.9024e-01,
         -3.5799e-01, -3.5881e-01,  4.3400e-01, -1.0047e-01,  3.6890e-01,
          1.6900e-01,  4.3253e-03,  1.

In [None]:
# Feed a sequence made only of [PAD] tokens, with every position masked out.
# The pad id is taken from the tokenizer instead of being hard-coded: the batch
# printed earlier in this notebook pads with id 0, so a tensor of 1s is not padding.
only_pad = torch.full((1, 8), tokenizer.pad_token_id, dtype=torch.long)
attention = torch.zeros_like(only_pad)

output = electra(input_ids=only_pad, attention_mask=attention)
# Embedding at position 0 of the sequence output
out = output[0][:, 0]
out

tensor([[ 7.1112e-01, -4.9163e-02, -1.8260e-01, -1.5657e-01, -1.6857e-01,
         -1.1118e-01,  2.7091e-01, -6.2547e-01,  1.4038e-01, -9.5232e-01,
          4.8027e-01,  3.1133e-01, -5.3059e-01, -2.7934e-01, -1.2518e-01,
         -4.4102e-01, -8.2895e-02, -1.5149e+00,  8.4931e-01, -3.4192e-01,
          5.1139e+00,  5.7319e-01,  3.2024e-01, -1.7249e-01, -2.6766e-01,
          4.9328e-01, -3.1484e-02,  6.4242e-01, -1.9585e-01,  2.7669e-01,
         -1.5954e-01, -2.4577e-01,  3.6091e-01,  1.9536e-01,  5.3041e-01,
          1.8003e-01, -3.1870e-02,  1.3203e-01, -1.7484e-01,  2.5658e-01,
          9.0250e-02, -4.1634e-02,  3.9124e-01,  7.7824e-01, -2.3612e-01,
          2.2406e-02, -5.0491e-01, -2.7234e-01, -3.5927e-02, -3.9452e-01,
         -8.9301e-02, -7.4678e-02, -2.5207e-01, -6.3218e-01,  2.9558e-02,
          2.4450e-01,  4.1014e-01,  4.5775e-02,  2.3371e-01,  2.7414e-02,
         -1.1391e-01, -6.6699e-02,  6.8913e-01, -7.8566e-02,  8.9245e-02,
         -9.8181e-02, -3.0599e-02,  3.

# Define Model

The Electra model itself is not a prediction model; it is more like an embedding model. \
To predict review ratings, we need to build a classification model on top of Electra.

In [None]:
class Classifier(nn.Module):
  """Classification head on top of a frozen Electra encoder."""

  def __init__(self, electra, hidden_layers=(512,), num_classes=2):
    """
    Parameters
    --------------

    electra : Electra model
      Electra model used to embed the input text. Its parameters are frozen
      here, so only the classification head is trained.
    hidden_layers : sequence of int
      Sizes of the hidden layers of the classification head, in order
    num_classes : int
      Number of output classes
    """
    super(Classifier, self).__init__()
    self.electra = electra

    # Freeze the encoder
    for param in self.electra.parameters():
      param.requires_grad = False

    # Output size of the electra model. Read it from the model config when
    # there is one, so checkpoints of other sizes work too; 256 (the value
    # that used to be hard-coded) is kept as the fallback.
    input_size = getattr(getattr(electra, 'config', None), 'hidden_size', 256)

    layers = [nn.BatchNorm1d(input_size),
              nn.ReLU(inplace=True)]

    for layer_size in hidden_layers:
      layers += [nn.Linear(input_size, layer_size),
                 nn.BatchNorm1d(layer_size),
                 nn.ReLU(inplace=True),
                 nn.Dropout(p=0.5)]

      # The next layer consumes what this one produces
      input_size = layer_size

    layers.append(nn.Linear(input_size, num_classes))

    self.hidden = nn.Sequential(*layers)


  def forward(self, input_ids, attention_mask):
    """Embed the input with Electra and return the class scores.

    Parameters
    --------------

    input_ids : tensor
      Token ids, passed to the encoder as input_ids
    attention_mask : tensor
      Attention mask, passed to the encoder as attention_mask

    Returns
    --------------
    tensor
      Output of the classification head, one row of num_classes scores per input row
    """
    # [0] is the sequence output of the encoder; position 0 of every
    # sequence is the embedding of the first ([CLS]) token
    x = self.electra(input_ids=input_ids, attention_mask=attention_mask)[0][:, 0]

    return self.hidden(x)
    

In [None]:
# Build the classifier on top of the pretrained encoder and move it to the target device
model = Classifier(electra).to(device)
# Cross-entropy loss, applied to the class scores returned by the classifier
loss_fn = nn.CrossEntropyLoss()

# Number of trainable parameters. The encoder is frozen inside Classifier,
# so only the classification head is counted.
sum(p.numel() for p in model.parameters() if p.requires_grad)

134146

# Train Model


In [None]:
# AdamW over the model's parameters, using the learning rate from the train settings
opt = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
def Cal_accuracy(pred, y):
  """Return the fraction of samples whose predicted class equals the label.

  Parameters
  --------------

  pred : torch.Tensor
    Class scores; the predicted class is the argmax over dim 1
  y : torch.Tensor
    Target class indices, one per row of pred

  Returns
  --------------
  torch.Tensor
    0-dim float tensor on the CPU holding correct / total
    (nan when there are no samples)
  """
  pred_label = torch.argmax(pred, dim=1)
  # The mean of the 0/1 hit indicator is exactly correct / total. The previous
  # version divided by len(y) + 1, which under-reported the accuracy.
  return (pred_label == y).float().mean().detach().cpu()

In [None]:
# Per-batch losses are saved (as plain Python floats) so they can be plotted later
train_loss = []
test_loss = []

for e in range(epoch):
  # ---- training ----
  model.train()

  for i, (inputs, labels) in enumerate(trainLoader, start=1):
    opt.zero_grad()

    # Every sample is tokenized with return_tensors='pt', so a batch has an
    # extra axis of size 1 after the batch axis. Drop only that axis: a bare
    # squeeze() would also drop the batch axis when a batch holds one sample.
    input_ids = inputs['input_ids'].squeeze(1).to(device)
    attention_mask = inputs['attention_mask'].squeeze(1).to(device)

    labels = labels.squeeze(1).to(device)

    pred = model(input_ids, attention_mask)
    loss = loss_fn(pred, labels)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    opt.step()

    train_loss.append(loss.item())

    if (i % log_interval) == 0:
      acc = Cal_accuracy(pred, labels)
      print(f"epoch {e} train_loss {loss.item()} acc {acc}")

  # ---- evaluation ----
  model.eval()
  loss_sum = 0.0
  acc_sum = 0.0
  num_samples = 0

  # No gradients are needed for evaluation
  with torch.no_grad():
    for inputs, labels in testLoader:
      input_ids = inputs['input_ids'].squeeze(1).to(device)
      attention_mask = inputs['attention_mask'].squeeze(1).to(device)

      labels = labels.squeeze(1).to(device)

      pred = model(input_ids, attention_mask)
      loss = loss_fn(pred, labels)

      # loss_fn already returns the mean over the batch, so weight every
      # batch by its own size. The previous version divided the summed batch
      # means by the *training* batch size, which shrank the reported loss.
      n = labels.size(0)
      loss_sum += loss.item() * n
      acc_sum += float(Cal_accuracy(pred, labels)) * n
      num_samples += n

      test_loss.append(loss.item())

  # max(..., 1) keeps an empty test loader from raising ZeroDivisionError
  losses = loss_sum / max(num_samples, 1)
  acc = acc_sum / max(num_samples, 1)

  print(f"epoch {e} test_loss {losses} acc {acc}")

  cpuset_checked))


epoch 0 train_loss 0.6626798510551453 acc 0.5736433863639832
epoch 0 train_loss 0.6159391403198242 acc 0.6589147448539734
epoch 0 test_loss 0.004577093757688999 acc 0.5470238924026489
epoch 1 train_loss 0.5982567071914673 acc 0.6666666865348816
epoch 1 train_loss 0.6167879104614258 acc 0.6201550364494324
epoch 1 test_loss 0.004449367057532072 acc 0.5583352446556091
epoch 2 train_loss 0.6242090463638306 acc 0.6666666865348816
epoch 2 train_loss 0.611432671546936 acc 0.6589147448539734
epoch 2 test_loss 0.004382772371172905 acc 0.5639028549194336
epoch 3 train_loss 0.5624515414237976 acc 0.7054263353347778
epoch 3 train_loss 0.6392976641654968 acc 0.6666666865348816
epoch 3 test_loss 0.004358114209026098 acc 0.5672302842140198
epoch 4 train_loss 0.6076874732971191 acc 0.6356589198112488
epoch 4 train_loss 0.6213057041168213 acc 0.643410861492157
epoch 4 test_loss 0.00433066301047802 acc 0.5668623447418213
