In [1]:
from google.colab import drive
drive.mount("/content/drive")
import os
os.chdir("/content/drive/MyDrive/BERT")
!pip install transformers
from transformers import AutoModel, AutoTokenizer
import re
import pandas as pd
import numpy as np
import random
import torch
from torch import nn, optim
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from transformers import get_linear_schedule_with_warmup, get_constant_schedule, AdamW

torch.cuda.empty_cache()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read data

Split train and test set.

Encode data usinng tokenizer from pretrained phobert.

In [2]:
# def read_data(path):
#   with open(path) as f:
#     data = f.read().splitlines()
#   return data

In [3]:
# data = read_data("80k_dataset/data.txt")
# label = read_data("80k_dataset/label.txt")

In [4]:
dataset = pd.read_csv("dataset_original.csv")
data = dataset["data"].values
label = dataset["label"].values

In [5]:
len(data)

135622

In [6]:
# def remove_quotes_and_parenthesis(data):
#   result = []
#   for text in data:
#     result.append(re.sub(r"[\"\']", "", text))
#   return result
# data = remove_quotes_and_parenthesis(data)

In [7]:
train_dataset = {"data": data[:80000],"label": label[:80000]}
test_dataset = {"data": data[80000:], "label": label[80000:]}

In [8]:
print("Train data:\n")
for i in range(5):
  print("text: ", train_dataset["data"][i], "\nlabel", train_dataset["label"][i])
print("Test data:\n")
for i in range(5):
  print("text: ", test_dataset["data"][i], "\nlabel", test_dataset["label"][i])

Train data:

text:   Ảnh : Vũ_Di Tài_xế Nguyễn_Thế_Vũ kể : Khi lên xe , ngay_lập_tức thanh_niên xăm trổ liền khống_chế tôi , thu hết điện_thoại và giấy_tờ xe . 
label Pháp luật.txt
text:   Dù không ở khách_sạn để tránh tốn_kém nhưng ông Park cũng đề_nghị VFF phải duy_trì thực_đơn ăn_uống như tại khách_sạn cao_cấp , nếu các tuyển_thủ muốn ăn gì thì phải để họ được ăn . 
label Thể thao.txt
text:   Ông Lê Nguyễn_Minh_Quang , Trưởng Ban đường_sắt đô_thị TP HCM cho biết : Chúng_tôi vẫn kiên_trì đeo_bám , kiến_nghị làm_việc cụ_thể . 
label Xã hội.txt
text:   * Trong bối_cảnh quan_hệ Thổ_Nhĩ_Kỳ với phương Tây đang xuống_dốc không phanh , thì sự_cố xảy ra tại cuộc tập_trận chung của Liên_minh quân_sự tại Na_Uy vừa_rồi như là một cú đẩy Ankara lại gần hơn_nữa với các đối_thủ của NATO là Nga và Iran NATO đang đẩy Thổ_Nhĩ_Kỳ ngã vào lòng Nga ? 
label Thế giới.txt
text:   Chúng_tôi xin giới_thiệu một_số điểm du_lịch thú_vị được nhiều người chọn nhất hiện_nay để các gia_đình có_thể dễ_dàng lựa_chọn

In [9]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

sequence_length = 128

def encode_data(data, tokenizer):
  result = []
  for text in data:
    temp = tokenizer.encode(text)

    if(len(temp) < sequence_length):
      temp += [1]*(sequence_length-len(temp))
    elif len(temp) > sequence_length:
      temp = temp[:sequence_length]
      temp[-1] = tokenizer.eos_token_id
    result.append(temp)
  return result

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
train_dataset["data"] = encode_data(train_dataset["data"], tokenizer)
test_dataset["data"] = encode_data(test_dataset["data"], tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (314 > 256). Running this sequence through the model will result in indexing errors


In [11]:
lbencode = LabelEncoder()
lbencode.fit(label)
train_dataset["label"] = lbencode.transform(train_dataset["label"])
test_dataset["label"] = lbencode.transform(test_dataset["label"])

# Transform to torch dataset and dataloader

In [12]:
train_dataset_torch = torch.utils.data.TensorDataset(torch.tensor(train_dataset["data"], dtype=torch.long), torch.tensor(train_dataset["label"], dtype=torch.long))
test_dataset_torch = torch.utils.data.TensorDataset(torch.tensor(test_dataset["data"], dtype = torch.long), torch.tensor(test_dataset["label"], dtype  = torch.long))

In [13]:
train_loader = torch.utils.data.DataLoader(train_dataset_torch, batch_size=8, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset_torch, batch_size=8, shuffle=False)

# Build fine-tuning model to classify data.

In [14]:
class Model(nn.Module):
  def __init__(self, n_classes):
    super(Model, self).__init__()
    self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
    self.lstm = nn.LSTM(input_size=768, hidden_size=256, batch_first=True)
    self.drop = nn.Dropout(p=0.3)
    self.linear1 = nn.Linear(256, 128)
    self.linear2 = nn.Linear(128, n_classes)
    self.softmax = nn.Softmax(dim = 1)
  def forward(self, inputs):
    cls_embedding = self.phobert(inputs)[0]
    lstm_embedding = self.lstm(cls_embedding)[1][1][0]
    lstm_embedding = torch.nn.functional.relu(lstm_embedding)
    #lstm_embedding = torch.mean(lstm_embedding, dim = 1)
    output = self.drop(lstm_embedding)
    output = self.linear1(lstm_embedding)
    output = torch.nn.functional.relu(output)
    output = self.linear2(output)
    return self.softmax(output)

# Declare training, validation and evaluation function

In [15]:
def evaluate(y_pred, y_batch):
  with torch.no_grad():
    y_pred = y_pred.detach().cpu().numpy()
    label = np.argmax(y_pred, axis = 1)
    y_batch = y_batch.detach().cpu().numpy()
    f1 = f1_score(y_batch, label, average='weighted')
    acc = accuracy_score(y_batch, label)
  return f1, acc

In [16]:
def validation(test_loader, loss, model, device):
  f1s = []
  accs = []
  model.eval()
  with torch.no_grad():
    y_ = []
    y = []
    y_pred_ = []
    for x_batch, y_batch in test_loader:
      x_batch = x_batch.to(device)
      y.extend(y_batch.numpy())
      y_pred = model(x_batch)
      y_pred = y_pred.detach().cpu().numpy()
      y_pred_.extend(y_pred)
      y_pred = np.argmax(y_pred, axis = 1)
      y_.extend(y_pred)
    f1 = f1_score(y, y_, average = None)
    f1s = f1_score(y, y_, average = 'weighted')
    accs = accuracy_score(y, y_)
    y_pred_ = torch.tensor(y_pred_, dtype= torch.float).to(device)
    y = torch.tensor(y, dtype = torch.long).to(device)
    loss_value = loss(y_pred_, y)

  return loss_value.item(), f1, f1s, accs

In [17]:
def trainOnEpoch(train_loader, model, optimizer, loss, num_epochs, epoch, device, scheduler, history, log_time = 100):
  loss_epoch = 0
  acc_epoch = 0
  f1_epoch = 0
  for i, (x_batch, y_batch) in enumerate(train_loader):
    model.train()
    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)
    optimizer.zero_grad()
    y_pred = model(x_batch)
    loss_value = loss(y_pred, y_batch)
    f1, acc = evaluate(y_pred, y_batch)

    loss_value.backward()
    #nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
    optimizer.step()
    loss_epoch += loss_value.item()
    acc_epoch += acc
    f1_epoch += f1
    scheduler.step()
    if (i+1) % log_time == 0 or i + 1 == len(train_loader):
      print("[TRAIN EPOCH {}] batch {} / {}, loss: {}, acc: {}, f1_avg:{}".format(epoch+1, i+1, len(train_loader), loss_epoch / (i + 1), acc_epoch / (i+1), f1_epoch / (i + 1)))
      history["loss_train"].append(loss_epoch / (i+1))
      history["acc_train"].append(acc_epoch / (i+1))
      history["f1_train"].append(f1_epoch / (i + 1))

# Declare hyperparameters and train model.

In [18]:
EPOCHS = 50
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CHECKPOINT_PATH = "checkpoint_model_3/last_model.pth.tar"
CHECKPOINT_BEST_PATH = "checkpoint_model_3/best_model.path.tar"

history = {"loss_train": [], "acc_train":[], "f1_train":[]}

model = Model(11)
model.to(DEVICE)
loss_function = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(train_loader)*(EPOCHS-1))
scheduler_frozen = get_constant_schedule(optimizer)
start_epoch = 0
frozen = True
for child in model.phobert.children():
  for param in child.parameters():
    param.requires_grad = False

if os.path.exists("history_model_3/f1_valid.txt"):
  with open("history_model_3/f1_valid.txt") as f:
    max_score = f.read().splitlines()
  start_epoch = len(max_score)
  max_score = np.array(max_score).astype(float)
  last_score = max_score[-1]
  max_score = np.max(max_score)
else:
  max_score = 0
  last_score = 0

if os.path.exists(CHECKPOINT_PATH):
  checkpoint = torch.load(CHECKPOINT_PATH)
  model.load_state_dict(checkpoint["model_state_dict"])
  optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
  scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
  loss_function.load_state_dict(checkpoint["loss_state_dict"])

for epoch in range(start_epoch, EPOCHS, 1):
  if epoch > 0 and frozen:
    for child in model.phobert.children():
      for param in child.parameters():
        param.requires_grad = True
    del scheduler_frozen
    torch.cuda.empty_cache()
    frozen = False
  print("EPOCH ", epoch+1, "/", EPOCHS, ":")

  if epoch == 30:
    dict_ = optimizer.state_dict()
    dict_['param_groups'][0]['initial_lr'] = scheduler.state_dict()['_last_lr']
    dict_['param_groups'][0]['lr'] = scheduler.state_dict()['_last_lr']
    optimizer.load_state_dict(dict_)
    scheduler = get_constant_schedule(optimizer)

  if frozen:
    trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch, device = DEVICE, scheduler = scheduler_frozen, history=history)
  else:
    trainOnEpoch(train_loader=train_loader, model = model, optimizer = optimizer, loss= loss_function, num_epochs = EPOCHS, epoch = epoch,device = DEVICE, scheduler = scheduler, history= history)
  losses, f1_classes, f1, acc = validation(test_loader, loss_function, model, DEVICE)
  
  print("EPOCH ", epoch+1, "/", EPOCHS, ":", " acc: {}, f1_score: {}".format(acc, f1))

  with open('history_model_3/loss.txt', 'a+') as f:
    f.write(str(losses) + "\n")
  with open('history_model_3/f1_classes.txt', 'a+') as f:
    f.write(' '.join(f1_classes.astype(str)) + "\n")
  with open("history_model_3/loss_train.txt", "a+") as f:
    for item in history["loss_train"]:
      f.write(str(item) + "\n")
  with open("history_model_3/acc_train.txt", "a+") as f:
    for item in history["acc_train"]:
      f.write(str(item) + "\n")
  with open("history_model_3/f1_train.txt", "a+") as f:
    for item in history["f1_train"]:
      f.write(str(item) + "\n")
  with open("history_model_3/acc_valid.txt", "a+") as f:
    f.write(str(acc) + "\n")
  with open("history_model_3/f1_valid.txt", "a+") as f:
    f.write(str(f1) + "\n")
  dict_ = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "loss_state_dict": loss_function.state_dict(),
  }
  torch.save(dict_, CHECKPOINT_PATH)
  #Save model chechk point
  if f1 > max_score:
    torch.save(dict_, CHECKPOINT_BEST_PATH)
    max_score = f1
  elif f1 - last_score < -0.02:
    break
  last_score = f1
  history = {"loss_train": [], "acc_train":[], "f1_train":[]}

[TRAIN EPOCH 2] batch 900 / 10000, loss: 2.3266142263677385, acc: 0.21513888888888888, f1_avg:0.10630875220458466


KeyboardInterrupt: ignored