In [None]:
from google.colab import drive
drive.mount("/content/drive")
import os
os.chdir("/content/drive/MyDrive/BERT")
!pip install transformers
from transformers import AutoModel, AutoTokenizer
import re
import numpy as np
import random
import torch
from torch import nn, optim
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from transformers import get_linear_schedule_with_warmup, get_constant_schedule, AdamW

# Read data

Split train and test set.

Encode data using the tokenizer from pretrained PhoBERT.

In [None]:
def read_data(path, encoding="utf-8"):
  """Read a text file and return its content as a list of lines.

  Args:
    path: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      that the result does not depend on the platform's default encoding
      (the dataset contains Vietnamese text with diacritics).

  Returns:
    A list of strings, one per line, without line terminators
    (as produced by ``str.splitlines``).
  """
  with open(path, encoding=encoding) as f:
    data = f.read().splitlines()
  return data

In [None]:
# Load the raw sentences and their labels from two separate text files.
# Presumably line i of label.txt is the label of line i of data.txt -- TODO confirm.
data, label = (
  read_data("80k_dataset/data.txt"),
  read_data("80k_dataset/label.txt"),
)

In [None]:
def remove_quotes_and_parenthesis(data):
  """Return a copy of ``data`` with quote characters stripped from each text.

  Only double quotes (") and single quotes (') are removed. Despite the
  function's name, parentheses are left untouched.

  Args:
    data: Iterable of strings.

  Returns:
    A new list with the cleaned strings, in the same order.
  """
  quote_pattern = re.compile(r"[\"\']")
  return [quote_pattern.sub("", text) for text in data]
# Strip quote characters from every sentence; `data` is rebound to the cleaned list.
data = remove_quotes_and_parenthesis(data)

In [None]:
# Positional split: the first 60000 rows are used for training, the rest for testing.
train_dataset, test_dataset = (
  {"data": data[rows], "label": label[rows]}
  for rows in (slice(None, 60000), slice(60000, None))
)

In [None]:
# Show the first five (text, label) pairs of each split as a sanity check.
for heading, subset in (("Train data:\n", train_dataset), ("Test data:\n", test_dataset)):
  print(heading)
  for i in range(5):
    print("text: ", subset["data"][i], "\nlabel", subset["label"][i])

Train data:

text:   Các nhà_khoa_học đã phát_hiện hoá_thạch của một con rùa nước_ngọt châu_Á tại Bắc_cực , và đặt tên là auroral . 
label Khoa học.txt
text:   Trong ngày thi_đấu , chỉ có cặp đấu được diễn ra , song đó lại là những màn so tài rất đáng để chờ_đợi . 
label Thể thao.txt
text:   Theo hãng tin Hàn_Quốc Yonhap , tính đến tháng , sau chưa đầy một tháng ra_mắt , lượng đặt_hàng trước của Galaxy_SS + tăng gấp lần so với Galaxy_SS edge . 
label Công nghệ.txt
text:   Mẫu_SUV cỡ nhỏ thiết_kế thể_thao , sử_dụng động_cơ , lít TFSI công_suất mã_lực , hướng đến người Việt trẻ . 
label Xe cộ.txt
text:   Quảng_cáo trên xe cửa_ô tô cá_nhân là hình_thức mới trong phân khúc quảng_cáo ngoài_trời , đã thịnh_hành trên thế_giới và được ví là mô_hình Uber for ads chia_sẻ quảng_cáo . 
label None.txt
Test data:

text:   Tôi là người không giỏi chịu_đựng , buồn một_chút thôi là suy_nghĩ tiêu_cực nhưng tôi vẫn tìm lại anh với suy_nghĩ mình cứ bên cạnh họ , trân thành họ thì chắc_chắn họ cũng sẽ hiểu

In [None]:
# Tokenizer that matches the pretrained "vinai/phobert-base" checkpoint used by the model.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

def encode_data(data, tokenizer, max_len=64):
  """Convert texts into fixed-length lists of token ids.

  Each text is encoded with ``tokenizer.encode``. Shorter sequences are
  right-padded with the tokenizer's padding id; longer ones are cut to
  ``max_len`` and their last position is overwritten with the end-of-sequence
  id so that every sequence still terminates properly.

  Args:
    data: Iterable of strings to encode.
    tokenizer: Object exposing ``encode(text)`` and ``eos_token_id``; its
      ``pad_token_id`` is used for padding when available.
    max_len: Length of every returned sequence (default 64).

  Returns:
    A list with one list of ``max_len`` token ids per input text.

  Raises:
    ValueError: If ``max_len`` is smaller than 1.
  """
  if max_len < 1:
    raise ValueError("max_len must be >= 1")

  # Prefer the tokenizer's own padding id; fall back to 1, the value that was
  # previously hard-coded here, when the tokenizer does not define one.
  pad_id = getattr(tokenizer, "pad_token_id", None)
  if pad_id is None:
    pad_id = 1

  result = []
  for text in data:
    token_ids = tokenizer.encode(text)
    if len(token_ids) < max_len:
      token_ids = token_ids + [pad_id] * (max_len - len(token_ids))
    elif len(token_ids) > max_len:
      token_ids = token_ids[:max_len]
      # Truncation dropped the closing special token; restore it.
      token_ids[-1] = tokenizer.eos_token_id
    result.append(token_ids)
  return result

In [None]:
# Replace the raw texts of both splits with their fixed-length token-id lists.
# Note: this overwrites the "data" entries in place, so the cell is meant to run once.
for part in (train_dataset, test_dataset):
  part["data"] = encode_data(part["data"], tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (302 > 256). Running this sequence through the model will result in indexing errors


In [None]:
# Fit the encoder on the complete label list, then map the string labels of
# both splits to integer class ids.
lbencode = LabelEncoder()
lbencode.fit(label)
for part in (train_dataset, test_dataset):
  part["label"] = lbencode.transform(part["label"])

# Transform to torch dataset and dataloader

In [None]:
# Wrap token ids and class ids of each split as int64 tensors in a TensorDataset.
train_dataset_torch, test_dataset_torch = (
  torch.utils.data.TensorDataset(
    torch.tensor(part["data"], dtype=torch.long),
    torch.tensor(part["label"], dtype=torch.long),
  )
  for part in (train_dataset, test_dataset)
)

In [None]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 8
# Training batches are reshuffled every epoch so the optimizer does not see the
# samples in the same fixed order each time; evaluation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset_torch, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset_torch, batch_size=BATCH_SIZE, shuffle=False)

# Build fine-tuning model to classify data.

In [None]:
class Model(nn.Module):
  """PhoBERT encoder followed by a small feed-forward classification head.

  The head takes the encoder output at sequence position 0, applies dropout,
  a 768->256 linear layer with ReLU, and a 256->n_classes linear layer.

  ``forward`` returns raw, unnormalized class scores (logits). This is what
  ``nn.CrossEntropyLoss`` expects, since that loss applies log-softmax itself;
  feeding it softmax outputs would normalize twice. Use ``predict_proba``
  when probabilities are needed. ``argmax`` over logits and over
  probabilities yields the same predicted class.

  Args:
    n_classes: Number of target classes.
  """
  def __init__(self, n_classes):
    super(Model, self).__init__()
    self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
    self.drop = nn.Dropout(p=0.3)
    self.linear1 = nn.Linear(768, 256)
    self.linear2 = nn.Linear(256, n_classes)
    # Parameter-free, so keeping it does not affect the state dict; it is
    # only used by predict_proba.
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, inputs):
    """Compute class logits for a batch of token-id sequences.

    Args:
      inputs: Integer tensor of token ids; indexing below assumes
        (batch, sequence_length).

    Returns:
      Float tensor of shape (batch, n_classes) holding unnormalized scores.
    """
    # Mask out padding positions so the encoder does not attend to them.
    # The padding id is read from the encoder config when it is available.
    pad_id = getattr(getattr(self.phobert, "config", None), "pad_token_id", None)
    if pad_id is None:
      encoded = self.phobert(inputs)
    else:
      attention_mask = (inputs != pad_id).long()
      encoded = self.phobert(inputs, attention_mask=attention_mask)

    # First element of the encoder output, taken at sequence position 0.
    cls_embedding = encoded[0][:, 0,:]
    output = self.drop(cls_embedding)
    output = self.linear1(output)
    output = torch.nn.functional.relu(output)
    return self.linear2(output)

  def predict_proba(self, inputs):
    """Return class probabilities (softmax over the logits of ``forward``)."""
    return self.softmax(self(inputs))

# Declare training, validation and evaluation functions

In [None]:
def evaluate(y_pred, y_batch):
  """Score a batch of predictions against the true class ids.

  Args:
    y_pred: Tensor of per-class scores; the predicted class is the argmax
      along axis 1.
    y_batch: Tensor of true class ids.

  Returns:
    Tuple ``(f1, acc)``: weighted F1 score first, accuracy second.
  """
  with torch.no_grad():
    targets = y_batch.detach().cpu().numpy()
    scores = y_pred.detach().cpu().numpy()
    predicted = np.argmax(scores, axis = 1)
    acc = accuracy_score(targets, predicted)
    f1 = f1_score(targets, predicted, average='weighted')
  return f1, acc

In [None]:
def validation(test_loader, model, device):
  """Evaluate ``model`` on every batch of ``test_loader``.

  Predictions of all batches are collected first and the metrics are computed
  once over the whole set. Averaging per-batch weighted F1 scores (as was done
  before) depends on the batch size and is not the F1 of the full set.

  Args:
    test_loader: Iterable yielding ``(x_batch, y_batch)`` tensor pairs.
    model: Module mapping ``x_batch`` to per-class scores. It is switched to
      eval mode and left in that mode.
    device: Device the inputs are moved to before the forward pass.

  Returns:
    Tuple ``(f1, acc)``: weighted F1 first, accuracy second. Both are NaN
    when the loader yields no batches.
  """
  all_preds = []
  all_labels = []
  model.eval()
  with torch.no_grad():
    for x_batch, y_batch in test_loader:
      x_batch = x_batch.to(device)
      y_pred = model(x_batch)
      # Keep results on the CPU so device memory is not held across batches.
      all_preds.append(y_pred.detach().cpu())
      all_labels.append(y_batch.detach().cpu())

  if not all_preds:
    return float("nan"), float("nan")

  f1, acc = evaluate(torch.cat(all_preds), torch.cat(all_labels))
  return f1, acc

In [None]:
def trainOnEpoch(train_loader, model, optimizer, loss, num_epochs, epoch, device, scheduler, history, log_time = 100):
  """Train ``model`` for one pass over ``train_loader``.

  For every batch: forward pass, loss, backward pass, optimizer step and
  scheduler step. Every ``log_time`` batches the running averages of loss,
  accuracy and F1 since the start of the epoch are printed and appended to
  ``history``.

  Args:
    train_loader: Iterable yielding ``(x_batch, y_batch)`` tensor pairs; must
      support ``len()`` for the progress message.
    model: Module being trained.
    optimizer: Optimizer updating the model parameters.
    loss: Callable ``loss(y_pred, y_batch)`` returning a scalar tensor.
    num_epochs: Total number of epochs (not used inside this function).
    epoch: Zero-based index of the current epoch, used for logging only.
    device: Device the batches are moved to.
    scheduler: Learning-rate scheduler, stepped once per batch.
    history: Dict with list values under the keys "loss_train", "acc_train"
      and "f1_train"; it is extended in place.
    log_time: Number of batches between two log entries.
  """
  loss_epoch = 0
  acc_epoch = 0
  f1_epoch = 0
  model.train()
  for i, (x_batch, y_batch) in enumerate(train_loader):
    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)
    optimizer.zero_grad()
    y_pred = model(x_batch)
    loss_value = loss(y_pred, y_batch)
    # evaluate returns (f1, acc) in that order; unpacking it the other way
    # round would log each metric under the other's name.
    f1, acc = evaluate(y_pred, y_batch)

    loss_value.backward()
    #nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
    optimizer.step()
    loss_epoch += loss_value.item()
    acc_epoch += acc
    f1_epoch += f1
    scheduler.step()
    if (i+1) % log_time == 0:
      seen = i + 1
      print("[TRAIN EPOCH {}] batch {} / {}, loss: {}, acc: {}, f1_avg:{}".format(epoch+1, seen, len(train_loader), loss_epoch / seen, acc_epoch / seen, f1_epoch / seen))
      history["loss_train"].append(loss_epoch / seen)
      history["acc_train"].append(acc_epoch / seen)
      history["f1_train"].append(f1_epoch / seen)

# Declare hyperparameters and train model.

In [None]:
# EPOCHS = 20
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CHECKPOINT_PATH = "checkpoint/last_model.pth.tar"

# history = {"loss_train": [], "acc_train":[], "f1_train":[]}

# model = Model(14)
# model.to(DEVICE)
# loss_function = nn.CrossEntropyLoss()
# optimizer = AdamW(model.parameters(), lr=2e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(train_loader)*(EPOCHS-1))
# scheduler_frozen = get_constant_schedule(optimizer)
# start_epoch = 0
# frozen = True
# for child in model.phobert.children():
#   for param in child.parameters():
#     param.requires_grad = False

# if os.path.exists("history/f1_valid.txt"):
#   with open("history/f1_valid.txt") as f:
#     max_score = f.read().splitlines()
#   start_epoch = len(max_score)
#   max_score = np.array(max_score).astype(float)
#   max_score = np.max(max_score)
# else:
#   max_score = 0

# if os.path.exists(CHECKPOINT_PATH):
#   checkpoint = torch.load(CHECKPOINT_PATH)
#   model.load_state_dict(checkpoint["model_state_dict"])
#   optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
#   scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
#   loss_function.load_state_dict(checkpoint["loss_state_dict"])

# for epoch in range(start_epoch, EPOCHS, 1):
#   if epoch > 0 and frozen:
#     for child in model.phobert.children():
#       for param in child.parameters():
#         param.requires_grad = True
#     del scheduler_frozen
#     torch.cuda.empty_cache()
#     frozen = False
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":")
#   if frozen:
#     trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch, device = DEVICE, scheduler = scheduler_frozen, history=history)
#   else:
#     trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch,device = DEVICE, scheduler = scheduler, history= history)
#   acc, f1 = validation(test_loader, model, DEVICE)
  
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":", " acc: {}, f1_score: {}".format(acc, f1))

#   with open("history/loss_train.txt", "a+") as f:
#     for item in history["loss_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_train.txt", "a+") as f:
#     for item in history["acc_train"]:
#       f.write(str(item) + "\n")
#   with open("history/f1_train.txt", "a+") as f:
#     for item in history["f1_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_valid.txt", "a+") as f:
#     f.write(str(acc) + "\n")
#   with open("history/f1_valid.txt", "a+") as f:
#     f.write(str(f1) + "\n")

#   #Save model chechk point
#   if f1 > max_score:
#     dict_ = {
#         "model_state_dict": model.state_dict(),
#         "optimizer_state_dict": optimizer.state_dict(),
#         "scheduler_state_dict": scheduler.state_dict(),
#         "loss_state_dict": loss_function.state_dict(),
#     }

#     torch.save(dict_, CHECKPOINT_PATH)
#     max_score = f1
#   history = {"loss_train": [], "acc_train":[], "f1_train":[]}

In [None]:
# EPOCHS = 30
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CHECKPOINT_PATH = "checkpoint/last_model.pth.tar"
# CHECKPOINT_BEST_PATH = "checkpoint/best_model.pth.tar"

# history = {"loss_train": [], "acc_train":[], "f1_train":[]}

# model = Model(14)
# model.to(DEVICE)
# loss_function = nn.CrossEntropyLoss()
# optimizer = AdamW(model.parameters(), lr=5e-7)
# scheduler = get_constant_schedule(optimizer)
# start_epoch = 0
# frozen = True
# for child in model.phobert.children():
#   for param in child.parameters():
#     param.requires_grad = False

# if os.path.exists("history/f1_valid.txt"):
#   with open("history/f1_valid.txt") as f:
#     max_score = f.read().splitlines()
#   start_epoch = len(max_score)
#   max_score = np.array(max_score).astype(float)
#   max_score = np.max(max_score)
# else:
#   max_score = 0

# if os.path.exists(CHECKPOINT_PATH):
#   checkpoint = torch.load(CHECKPOINT_PATH)
#   model.load_state_dict(checkpoint["model_state_dict"])
#   optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
#   if start_epoch != 20:
#     scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
#   loss_function.load_state_dict(checkpoint["loss_state_dict"])

# for epoch in range(start_epoch, EPOCHS, 1):
#   if epoch > 0 and frozen:
#     for child in model.phobert.children():
#       for param in child.parameters():
#         param.requires_grad = True
#     torch.cuda.empty_cache()
#     frozen = False
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":")
#   trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch,device = DEVICE, scheduler = scheduler, history= history)
#   acc, f1 = validation(test_loader, model, DEVICE)
  
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":", " acc: {}, f1_score: {}".format(acc, f1))

#   with open("history/loss_train.txt", "a+") as f:
#     for item in history["loss_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_train.txt", "a+") as f:
#     for item in history["acc_train"]:
#       f.write(str(item) + "\n")
#   with open("history/f1_train.txt", "a+") as f:
#     for item in history["f1_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_valid.txt", "a+") as f:
#     f.write(str(acc) + "\n")
#   with open("history/f1_valid.txt", "a+") as f:
#     f.write(str(f1) + "\n")

#   #Save model chechk point
#   dict_ = {
#     "model_state_dict": model.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
#     "scheduler_state_dict": scheduler.state_dict(),
#     "loss_state_dict": loss_function.state_dict(),
#   }

#   torch.save(dict_, CHECKPOINT_PATH)

#   if f1 > max_score:
#     torch.save(dict_, CHECKPOINT_BEST_PATH)
#     max_score = f1
#   history = {"loss_train": [], "acc_train":[], "f1_train":[]}

In [None]:
# EPOCHS = 40
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CHECKPOINT_PATH = "checkpoint/last_model.pth.tar"
# CHECKPOINT_BEST_PATH = "checkpoint/best_model.pth.tar"

# history = {"loss_train": [], "acc_train":[], "f1_train":[]}

# model = Model(14)
# model.to(DEVICE)
# loss_function = nn.CrossEntropyLoss()
# optimizer = AdamW(model.parameters(), lr=5e-7)
# scheduler = get_linear_schedule_with_warmup(optimizer, 40, num_training_steps=10 * len(train_loader))
# start_epoch = 0
# frozen = True
# for child in model.phobert.children():
#   for param in child.parameters():
#     param.requires_grad = False

# if os.path.exists("history/f1_valid.txt"):
#   with open("history/f1_valid.txt") as f:
#     max_score = f.read().splitlines()
#   start_epoch = len(max_score)
#   max_score = np.array(max_score).astype(float)
#   max_score = np.max(max_score)
# else:
#   max_score = 0

# if os.path.exists(CHECKPOINT_PATH):
#   checkpoint = torch.load(CHECKPOINT_PATH)
#   model.load_state_dict(checkpoint["model_state_dict"])
#   optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
#   if start_epoch != 30:
#     scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
#   loss_function.load_state_dict(checkpoint["loss_state_dict"])

# for epoch in range(start_epoch, EPOCHS, 1):
#   if epoch > 0 and frozen:
#     for child in model.phobert.children():
#       for param in child.parameters():
#         param.requires_grad = True
#     torch.cuda.empty_cache()
#     frozen = False
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":")
#   trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch,device = DEVICE, scheduler = scheduler, history= history)
#   acc, f1 = validation(test_loader, model, DEVICE)
  
#   print("EPOCH ", epoch+1, "/", EPOCHS, ":", " acc: {}, f1_score: {}".format(acc, f1))

#   with open("history/loss_train.txt", "a+") as f:
#     for item in history["loss_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_train.txt", "a+") as f:
#     for item in history["acc_train"]:
#       f.write(str(item) + "\n")
#   with open("history/f1_train.txt", "a+") as f:
#     for item in history["f1_train"]:
#       f.write(str(item) + "\n")
#   with open("history/acc_valid.txt", "a+") as f:
#     f.write(str(acc) + "\n")
#   with open("history/f1_valid.txt", "a+") as f:
#     f.write(str(f1) + "\n")

#   #Save model chechk point
#   dict_ = {
#     "model_state_dict": model.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
#     "scheduler_state_dict": scheduler.state_dict(),
#     "loss_state_dict": loss_function.state_dict(),
#   }

#   torch.save(dict_, CHECKPOINT_PATH)

#   if f1 > max_score:
#     torch.save(dict_, CHECKPOINT_BEST_PATH)
#     max_score = f1
#   history = {"loss_train": [], "acc_train":[], "f1_train":[]}

In [None]:
# ---- Configuration ----
EPOCHS = 60
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CHECKPOINT_PATH = "checkpoint/last_model.pth.tar"
CHECKPOINT_BEST_PATH = "checkpoint/best_model.pth.tar"

# The loop below appends to files in these folders; create them up front so a
# fresh run does not fail on the first write.
for out_dir in ("history", os.path.dirname(CHECKPOINT_PATH), os.path.dirname(CHECKPOINT_BEST_PATH)):
  os.makedirs(out_dir, exist_ok=True)

history = {"loss_train": [], "acc_train":[], "f1_train":[]}

# ---- Model, loss, optimizer, scheduler ----
model = Model(14)  # 14 output classes -- presumably the number of distinct labels; verify against lbencode
model.to(DEVICE)
loss_function = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-7)
scheduler = get_linear_schedule_with_warmup(optimizer, 40, num_training_steps=20 * len(train_loader))
start_epoch = 0
frozen = True
# Start with the encoder frozen; it is unfrozen in the loop for every epoch
# index greater than 0 (so only epoch index 0 trains the head alone).
for child in model.phobert.children():
  for param in child.parameters():
    param.requires_grad = False

# ---- Resume bookkeeping ----
# One line is appended to history/f1_valid.txt per finished epoch, so its line
# count is the number of completed epochs and its maximum is the best score.
# NOTE: lines written by earlier versions of this cell hold accuracy values,
# because the two validation metrics used to be unpacked in swapped order.
if os.path.exists("history/f1_valid.txt"):
  with open("history/f1_valid.txt") as f:
    max_score = f.read().splitlines()
  start_epoch = len(max_score)
  max_score = np.array(max_score).astype(float)
  max_score = np.max(max_score)
else:
  max_score = 0

if os.path.exists(CHECKPOINT_PATH):
  # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
  checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
  model.load_state_dict(checkpoint["model_state_dict"])
  optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
  # When resuming exactly at epoch 40 the scheduler created above is kept and
  # the saved scheduler state is not restored.
  if start_epoch != 40:
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
  loss_function.load_state_dict(checkpoint["loss_state_dict"])

# ---- Training loop ----
for epoch in range(start_epoch, EPOCHS, 1):
  if epoch > 0 and frozen:
    for child in model.phobert.children():
      for param in child.parameters():
        param.requires_grad = True
    torch.cuda.empty_cache()
    frozen = False
  print("EPOCH ", epoch+1, "/", EPOCHS, ":")
  trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch,device = DEVICE, scheduler = scheduler, history= history)
  # validation returns (f1, acc) in that order.
  f1, acc = validation(test_loader, model, DEVICE)

  print("EPOCH ", epoch+1, "/", EPOCHS, ":", " acc: {}, f1_score: {}".format(acc, f1))

  # Append this epoch's training curves and validation scores to the history files.
  for key, path in (
      ("loss_train", "history/loss_train.txt"),
      ("acc_train", "history/acc_train.txt"),
      ("f1_train", "history/f1_train.txt"),
  ):
    with open(path, "a+") as f:
      for item in history[key]:
        f.write(str(item) + "\n")
  with open("history/acc_valid.txt", "a+") as f:
    f.write(str(acc) + "\n")
  with open("history/f1_valid.txt", "a+") as f:
    f.write(str(f1) + "\n")

  # Save the latest state after every epoch ...
  dict_ = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "loss_state_dict": loss_function.state_dict(),
  }

  torch.save(dict_, CHECKPOINT_PATH)

  # ... and additionally keep a copy whenever the validation F1 improves.
  if f1 > max_score:
    torch.save(dict_, CHECKPOINT_BEST_PATH)
    max_score = f1
  # Start the next epoch with empty training curves; this epoch's are on disk.
  history = {"loss_train": [], "acc_train":[], "f1_train":[]}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=542923308.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EPOCH  58 / 60 :
[TRAIN EPOCH 58] batch 100 / 7500, loss: 1.985815590620041, acc: 0.762327380952381, f1_avg:0.7675
[TRAIN EPOCH 58] batch 200 / 7500, loss: 1.988469632267952, acc: 0.7605595238095236, f1_avg:0.765625
[TRAIN EPOCH 58] batch 300 / 7500, loss: 1.972511569261551, acc: 0.7744742063492063, f1_avg:0.7816666666666666
[TRAIN EPOCH 58] batch 400 / 7500, loss: 1.9719489294290542, acc: 0.7744717261904758, f1_avg:0.7825
[TRAIN EPOCH 58] batch 500 / 7500, loss: 1.976701859474182, acc: 0.7703690476190476, f1_avg:0.77775
[TRAIN EPOCH 58] batch 600 / 7500, loss: 1.9755754421154659, acc: 0.7705545634920632, f1_avg:0.7789583333333333
[TRAIN EPOCH 58] batch 700 / 7500, loss: 1.975051783663886, acc: 0.7698163265306117, f1_avg:0.7794642857142857
[TRAIN EPOCH 58] batch 800 / 7500, loss: 1.9712630881369113, acc: 0.7740796130952377, f1_avg:0.78328125
[TRAIN EPOCH 58] batch 900 / 7500, loss: 1.973068020078871, acc: 0.772199074074074, f1_avg:0.7815277777777778
[TRAIN EPOCH 58] batch 1000 / 7500, 