<a href="https://colab.research.google.com/github/TienNam97/phoBert_Sentiment_Analysis/blob/main/Sentiment_Analysis_with_Fold_fixed_bug.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Import những thư viện cần thiết**

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/BERT/SA')

Mounted at /content/drive/


In [None]:
import numpy as np
import pandas as pd
import torch
import random

def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed_everything(86)

In [None]:
%%capture
!pip install --upgrade pip setuptools wheel
!pip install transformers
!pip install underthesea
!pip install fastBPE
!pip install fairseq

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
    default="PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")

In [None]:
bpe.encode('Hôm_nay trời nóng quá nên tôi ở nhà lướt X (Twitter)!')

'Hôm_nay trời nóng quá nên tôi ở nhà lướt X (@@ T@@ wit@@ ter@@ )@@ !'

In [None]:
vocab.encode_line('<s> ' + 'Hôm_nay trời nóng quá nên tôi ở nhà lướt X (@@ T@@ wit@@ ter@@ )@@ !' + ' </s>')

tensor([    0,  3791,  1027,   898,   204,    77,    70,    25,    69,  6667,
         2320, 14157,   982, 23118,  4055, 37272,   381,     2,     2],
       dtype=torch.int32)

In [None]:
# %%capture
# !wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
# !tar -xzvf PhoBERT_base_transformers.tar.gz

#**Tiền xử lý dữ liệu**

In [None]:
#Xử lý các dòng bị bỏ trống ở mục 'content'
def preprocess(df):
  for i in df.index:
    if type(df['content'].at[i]) != str:  #Các dòng trống khi đọc ra thì nó trả về là NaN chứ không phải là chuỗi rỗng
      df = df.drop(i) #delete row
  df = df.reset_index(drop=True)  #update index
  return df

##Đưa dữ liệu 'content' về chữ in thường

In [None]:
df = pd.read_csv('data.csv')
df = preprocess(df)
df = df.replace({'NEG': 0, 'POS': 2, 'NEU' : 1})
df = df[['content','label']]
df.loc[:,'content'] = df['content'].str.lower()

##Word Segmentation

In [None]:
# Trích ra các subword từ câu đề phục vụ việc embedding
from underthesea import word_tokenize
text1 = []
for i in df.index:
  text1.append(word_tokenize(df.iloc[i]['content'], format = 'text'))

In [None]:
example = 'Việt Nam có đầy đủ chứng cứ lịch sử và pháp lý khẳng định quần đảo Hoàng Sa và Trường Sa là lãnh thổ của Việt Nam'
word_tokenize(example)

['Việt Nam',
 'có',
 'đầy đủ',
 'chứng cứ',
 'lịch sử',
 'và',
 'pháp lý',
 'khẳng định',
 'quần đảo',
 'Hoàng Sa',
 'và',
 'Trường Sa',
 'là',
 'lãnh thổ',
 'của',
 'Việt Nam']

In [None]:
word_tokenize(example, format = 'text')

'Việt_Nam có đầy_đủ chứng_cứ lịch_sử và pháp_lý khẳng_định quần_đảo Hoàng_Sa và Trường_Sa là lãnh_thổ của Việt_Nam'

##Xóa những dòng không có dấu

In [None]:
# Mục dích loại bỏ những dóng không dấu và tiếng Anh
def non_accent(sent):
  count = 0
  letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', '.', ',']
  for word in sent: # loop over all words
    if word in letters:
      count += 1
      if count == len(sent):
        return True
    else:
      break
  return False

In [None]:
ko_dau_id = []
lan = -1
for sent in text1:
  lan += 1
  if non_accent(sent):
    ko_dau_id.append(lan)
  else:
    continue

In [None]:
for i in ko_dau_id:
  df = df.drop(i)
df = df.reset_index(drop=True)

#Chuẩn bị dữ liệu cho training

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from underthesea import word_tokenize
import torch

MAX_LEN = 125

def prepare_loaders(df_train, fold):
  df_train1 = df_train[df_train.kfold != fold].reset_index(drop=True)
  df_valid1 = df_train[df_train.kfold == fold].reset_index(drop=True)

  train_text = []
  train_labels = []
  for i in df_train1.index:
    train_text.append(word_tokenize(df_train1.iloc[i]['content'], format = 'text'))
    train_labels.append(df_train1.iloc[i]['label'])

  val_text = []
  val_labels = []
  for i in df_valid1.index:
    val_text.append(word_tokenize(df_valid1.iloc[i]['content'], format = 'text'))
    val_labels.append(df_valid1.iloc[i]['label'])

  train_ids = []
  for sent in train_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    train_ids.append(encoded_sent)

  val_ids = []
  for sent in val_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist() #Ánh xạ subword vào vocab để trích xuất tensor tương ứng
    val_ids.append(encoded_sent)

  train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post") #Pad để đưa về cùng size input nếu nhỏ hơn max length thì
                                                                                                                #  sẽ pad cho đủ length, còn dài hơn thì sẽ cắt tại max length
  val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

  # Mask để mô hình chú ý vào phần câu, bỏ qua phần padding
  train_masks = []
  for sent_train in train_ids:
    mask = [int(token_id > 0) for token_id in sent_train]
    train_masks.append(mask)

  val_masks = []
  for sent_val in val_ids:
    mask = [int(token_id > 0) for token_id in sent_val]
    val_masks.append(mask)

  train_inputs = torch.tensor(train_ids)
  val_inputs = torch.tensor(val_ids)

  train_labels = torch.tensor(train_labels)
  val_labels = torch.tensor(val_labels)

  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)

  train_data = TensorDataset(train_inputs, train_masks, train_labels)
  train_sampler = SequentialSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

  val_data = TensorDataset(val_inputs, val_masks, val_labels)
  val_sampler = SequentialSampler(val_data)
  val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

  return train_dataloader, val_dataloader

#Load model để training

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW
def model():
  config = RobertaConfig.from_pretrained(
      "PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 3, output_hidden_states=False,
  ) # Num_labels ở đây chính là số class cần phân loại
  BERT_SA = RobertaForSequenceClassification.from_pretrained(
      "PhoBERT_base_transformers/model.bin",
      config=config
  )
  # config = RobertaConfig.from_pretrained(
  #   "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold_VSFC/config.json")
  # BERT_SA = RobertaForSequenceClassification.from_pretrained(
  #   "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold_VSFC/",
  #   config=config
  #   )
  BERT_SA.cuda()
  print('Done')

  return BERT_SA

In [None]:
BERT_SA = model()

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done


In [None]:
from sklearn.model_selection import StratifiedKFold
# Chia fold để train
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle = True)

for fold, (_, val_) in enumerate(skf.split(X=df_train, y=df_train.label)):
    df_train.loc[val_,"kfold"] = fold

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    F1_score = f1_score(pred_flat, labels_flat, average='weighted')

    return accuracy_score(pred_flat, labels_flat), F1_score

#Training

In [None]:
from tqdm.notebook import tqdm

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
epochs = 5  #Có thể giảm epochs xuống 3 để tiết kiệm thời gian do thường sẽ đạt thông số tốt nhất trong 3 epochs đầu

best_eval_f1_score = 0
for fold in range(skf.n_splits):
  print(f'-----------Fold: {fold+1} ------------------')
  train_dataloader, val_dataloader = prepare_loaders(df_train, fold)
  BERT_SA = model()
  param_optimizer = list(BERT_SA.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

  for epoch_i in range(0, epochs):
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')
      total_loss = 0
      BERT_SA.train()
      train_accuracy = 0
      nb_train_steps = 0
      train_f1 = 0

      for step, batch in tqdm(enumerate(train_dataloader)):
          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)

          BERT_SA.zero_grad()
          outputs = BERT_SA(b_input_ids,
              token_type_ids=None,
              attention_mask=b_input_mask,
              labels=b_labels)
          loss = outputs[0]
          total_loss += loss.item()

          logits = outputs[1].detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
          train_accuracy += tmp_train_accuracy
          train_f1 += tmp_train_f1
          nb_train_steps += 1

          loss.backward()
          torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
          optimizer.step()

      avg_train_loss = total_loss / len(train_dataloader)
      print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
      print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
      print(" Average training loss: {0:.4f}".format(avg_train_loss))

      print("Running Validation...")
      BERT_SA.eval()
      eval_loss, eval_accuracy = 0, 0
      nb_eval_steps, nb_eval_examples = 0, 0
      eval_f1 = 0
      for batch in tqdm(val_dataloader):

          batch = tuple(t.to(device) for t in batch)

          b_input_ids, b_input_mask, b_labels = batch

          with torch.no_grad():
              outputs = BERT_SA(b_input_ids,
              token_type_ids=None,
              attention_mask=b_input_mask)
              logits = outputs[0]
              logits = logits.detach().cpu().numpy()
              label_ids = b_labels.to('cpu').numpy()

              tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

              eval_accuracy += tmp_eval_accuracy
              eval_f1 += tmp_eval_f1
              nb_eval_steps += 1

      print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
      print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

      if fold == 0 and epoch_i == 0:
        BERT_SA.save_pretrained('bert_pretrain_fold/')
        print('Saved Pretrain!')
        best_eval_f1_score = eval_f1/nb_eval_steps
      else:
        if best_eval_f1_score < eval_f1/nb_eval_steps:
          BERT_SA.save_pretrained('bert_pretrain_fold/')
          print('Update Saved Pretrain!')
          best_eval_f1_score = eval_f1/nb_eval_steps

  print("Training complete!")

-----------Fold: 1 ------------------


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done
Training...




0it [00:00, ?it/s]

 Accuracy: 0.7570
 F1 score: 0.7945
 Average training loss: 0.6035
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7816
 F1 score: 0.8117
Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8041
 F1 score: 0.8236
 Average training loss: 0.4995
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7877
 F1 score: 0.8168
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8224
 F1 score: 0.8362
 Average training loss: 0.4577
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7885
 F1 score: 0.8054
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8390
 F1 score: 0.8497
 Average training loss: 0.4191
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7849
 F1 score: 0.7977
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8532
 F1 score: 0.8610
 Average training loss: 0.3896
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7870
 F1 score: 0.8067
Training complete!
-----------Fold: 2 ------------------


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done
Training...




0it [00:00, ?it/s]

 Accuracy: 0.7478
 F1 score: 0.7880
 Average training loss: 0.6179
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7886
 F1 score: 0.8044
Training...


0it [00:00, ?it/s]

 Accuracy: 0.7988
 F1 score: 0.8170
 Average training loss: 0.5080
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7855
 F1 score: 0.7954
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8163
 F1 score: 0.8293
 Average training loss: 0.4643
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7869
 F1 score: 0.7951
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8354
 F1 score: 0.8448
 Average training loss: 0.4255
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7899
 F1 score: 0.7976
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8512
 F1 score: 0.8589
 Average training loss: 0.3924
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7870
 F1 score: 0.7886
Training complete!
-----------Fold: 3 ------------------


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done
Training...




0it [00:00, ?it/s]

 Accuracy: 0.7495
 F1 score: 0.7894
 Average training loss: 0.6135
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7837
 F1 score: 0.8150
Training...


0it [00:00, ?it/s]

 Accuracy: 0.7947
 F1 score: 0.8153
 Average training loss: 0.5128
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7968
 F1 score: 0.8114
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8154
 F1 score: 0.8315
 Average training loss: 0.4711
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7931
 F1 score: 0.8034
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8283
 F1 score: 0.8400
 Average training loss: 0.4370
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7980
 F1 score: 0.8139
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8455
 F1 score: 0.8546
 Average training loss: 0.4004
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7897
 F1 score: 0.8031
Training complete!
-----------Fold: 4 ------------------


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done
Training...




0it [00:00, ?it/s]

 Accuracy: 0.7504
 F1 score: 0.7924
 Average training loss: 0.6192
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7855
 F1 score: 0.8170
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.7959
 F1 score: 0.8191
 Average training loss: 0.5134
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7890
 F1 score: 0.8109
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8147
 F1 score: 0.8305
 Average training loss: 0.4687
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7946
 F1 score: 0.8077
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8311
 F1 score: 0.8434
 Average training loss: 0.4351
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7924
 F1 score: 0.8011
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8459
 F1 score: 0.8551
 Average training loss: 0.3991
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7906
 F1 score: 0.7996
Training complete!
-----------Fold: 5 ------------------


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model.bin and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done
Training...




0it [00:00, ?it/s]

 Accuracy: 0.7481
 F1 score: 0.7906
 Average training loss: 0.6222
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7872
 F1 score: 0.8228
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.7998
 F1 score: 0.8199
 Average training loss: 0.5032
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7922
 F1 score: 0.8156
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8214
 F1 score: 0.8350
 Average training loss: 0.4554
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7944
 F1 score: 0.8140
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8354
 F1 score: 0.8464
 Average training loss: 0.4232
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7930
 F1 score: 0.8127
Training...


0it [00:00, ?it/s]

 Accuracy: 0.8525
 F1 score: 0.8615
 Average training loss: 0.3886
Running Validation...


  0%|          | 0/155 [00:00<?, ?it/s]

 Accuracy: 0.7872
 F1 score: 0.7994
Training complete!


#Test

In [None]:
# Load Pretrained model vừa lưu ở trên
config2 = RobertaConfig.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold/config.json")
BERT_SA2 = RobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold/",
    config=config2
)

In [None]:
BERT_SA2.cuda()
print('Done')

Done


In [None]:
test_text = []
test_labels = []
for i in df_test.index:
  test_text.append(word_tokenize(df_test.iloc[i]['content'], format = 'text'))
  test_labels.append(df_test.iloc[i]['label'])

In [None]:
from tqdm.notebook import tqdm
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
def test(test_dataloader):
    test_loss, test_accuracy = 0, 0
    nb_test_steps, nb_test_examples = 0, 0
    test_f1 = 0
    # predict_labels = []

    for batch in tqdm(test_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = BERT_SA2(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # predict_labels.append(np.argmax(logits))


            tmp_test_accuracy, tmp_test_f1 = flat_accuracy(logits, label_ids)

            test_accuracy += tmp_test_accuracy
            test_f1 += tmp_test_f1
            nb_test_steps += 1

    print(" Accuracy: {0:.4f}".format(test_accuracy/nb_test_steps))
    print(" F1 score: {0:.4f}".format(test_f1/nb_test_steps))
    # return predict_labels

In [None]:
def get_mask(data):
	masks = []
	for sen in data:
		mask = [int(token>0) for token in sen]
		masks.append(mask)
	return masks

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 125
test_ids = []
for sent in test_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    test_ids.append(encoded_sent)

test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

test_inputs = torch.tensor(test_ids)

test_labels = torch.tensor(test_labels)

test_masks = get_mask(test_ids)

test_masks = torch.tensor(test_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

  test_labels = torch.tensor(test_labels)


In [None]:
# predict_labels = test(test_dataloader)
test(test_dataloader)

  0%|          | 0/194 [00:00<?, ?it/s]

 Accuracy: 0.7856
 F1 score: 0.8211


In [None]:
# df_test['predict'] = predict_labels

In [None]:
# df_test[df_test['label'] != df_test['predict']]

#Predict

In [None]:
def predict(text):
  text = bpe.encode(' '.join(word_tokenize(text)))
  encode_ = vocab.encode_line('<s> ' + text + ' </s>',append_eos=True, add_if_not_exist=False).long().tolist()
  encode_text = pad_sequences([encode_], maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

  predict_masks = get_mask(encode_text)
  predict_masks = torch.tensor(predict_masks,dtype = torch.int64)
  predict_inputs = torch.tensor(encode_text)

  predict_inputs = predict_inputs.to(device)
  predict_masks = predict_masks.to(device)

  with torch.no_grad():
    outputs = BERT_SA2(predict_inputs, token_type_ids=None, attention_mask=predict_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    predict = np.argmax(logits)

    if predict == 0:
      cls = "Negative"
    elif predict == 2:
      cls = "Positive"
    elif predict == 1:
      cls = "Neutral"
    return cls


In [None]:
sentence = 'Chả được cái tích sự gì'

In [None]:
predict(sentence)

'Negative'

In [None]:
sentence1 = 'Cũng được'

In [None]:
predict(sentence1)

'Positive'

In [None]:
sentence2 = 'tạm chấp nhận'

In [None]:
predict(sentence2)

'Neutral'

In [None]:
sentence3 = 'không lừa đảo, rất uy tín'

In [None]:
predict(sentence3)

'Positive'

In [None]:
from google.colab import runtime
runtime.unassign()