# Huấn luyện trên bộ dữ liệu ViHSD
* Bao gồm dữ liệu thu thập từ mạng xã hội
* Dữ liệu có tính toxic cao - phân biệt chủng tộc, vùng miền, công kích cá nhân, chửi đổng

# Import library

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
import pyvi
from utility.utility import load_data
import string
import emoji_vietnamese  # type: ignore

## Setup DATA

### Setup DATA for train and dev

In [2]:
dataset_train_type = 'vihsd'
train = load_data(set_name='train', dataset=dataset_train_type)
dev = load_data(set_name='dev', dataset=dataset_train_type)

### Setup DATA for test

In [3]:
dataset_test_type = 'vihsd'
test = load_data(set_name='test', dataset=dataset_test_type)

In [4]:
train['label'], dev['label'], test['label'] = train['label'].replace(2,1), dev['label'].replace(2,1), test['label'].replace(2,1)

## Preprocess data

* Remove url in comment
* remove punctuation
* Lowercase data
* Remove stopwords
* Remove emoji
* Tokenize data

In [5]:
from pyvi import ViTokenizer, ViPosTagger
def tokenize(text):
    """
    Thật tuyệt vời -> Thật tuyệt_vời
    """
    return ViTokenizer.tokenize(text)

# apply tokenize to text
train['text'] = train['text'].apply(tokenize)
dev['text'] = dev['text'].apply(tokenize)
test['text'] = test['text'].apply(tokenize)

In [6]:
def preprocess_data(
    data,
    url=True,
    punctuation=True,
    lowercase=True,
    stopword=False,
    special_stopwords=[],
    emoji=False
):
    # Load stopwords
    with open('./utility/Stopwords/vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as f:
        stopwords = f.read().splitlines()
    for word in special_stopwords:
        stopwords.remove(word)
    # Function to remove stopwords
    def remove_stopwords(text):
        words = text.split()
        words = [word for word in words if word not in stopwords]
        return ' '.join(words)
    if url:
        # Remove URLs
        data['text'] = data['text'].str.replace(
            r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
    if punctuation:
        # Remove punctuation
        data['text'] = data['text'].str.replace(
            '['+string.punctuation+']', '', regex=True)
    if lowercase:
        # Lowercase
        data['text'] = data['text'].str.lower()
    if stopword:
        # Remove stopword
        data['text'] = data['text'].apply(remove_stopwords)
    if emoji:
        # Remove emojis
        data['text'] = data['text'].apply(emoji_vietnamese.demojize)
    return data


In [7]:
special_stopwords = ["không","không_có","không_thể","chưa", "được"] 

In [8]:
train_preprocess = preprocess_data(train,
                                   url=True,
                                   punctuation=True,
                                   lowercase=True,
                                   stopword=True,
                                   special_stopwords=special_stopwords,
                                   emoji=True)
dev_preprocess = preprocess_data(dev,
                                 url=True,
                                 punctuation=True,
                                 lowercase=True,
                                 stopword=True,
                                 special_stopwords=special_stopwords,
                                 emoji=True)
test_preprocess = preprocess_data(test,
                                  url=True,
                                  punctuation=True,
                                  lowercase=True,
                                  stopword=True,
                                  special_stopwords=special_stopwords,
                                  emoji=True)

In [9]:
X_train = train['text'].astype(str)
y_train = train['label']
X_dev = dev['text'].astype(str)
y_dev = dev['label']
X_test = test['text'].astype(str)
y_test = test['label']

In [10]:
# Để lấy dữ liệu cho nhanh chứ ko up lên git
train_preprocess.to_csv('hsd_preprocessed_train_data.csv', index=False)
dev_preprocess.to_csv('hsd_preprocessed_dev_data.csv', index=False)
test_preprocess.to_csv('hsd_preprocessed_test_data.csv', index=False)

### Load data from csv
* Tokenizer and pad_sequences

In [64]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


vocab_size=50000
embedding_dim=128
max_length=128

train_data = pd.read_csv('hsd_preprocessed_train_data.csv')
dev_data = pd.read_csv('hsd_preprocessed_dev_data.csv')
test_data = pd.read_csv('hsd_preprocessed_test_data.csv')

# # Lấy 1 phần dữ liệu để chạy nhanh
# train_data = train_data[:4454]
# dev_data = dev_data[:527]

# X_train, y_train,_,_ = train_test_split(train_data['text'], train_data['label'], test_size=0.99, random_state=42)
# X_val, y_val,_,_ = train_test_split(dev_data['text'], dev_data['label'], test_size=0.99, random_state=42)
# X_train= X_train.astype(str)
# X_val= X_val.astype(str)

X_train, y_train = train_data['text'].astype(str), train_data['label']
X_dev, y_dev = dev_data['text'].astype(str), dev_data['label']
X_test, y_test = test_data['text'].astype(str), test_data['label']

# Tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post',truncating='post')

# Thực hiện thay đổi test để đưa vào tính toán val_acc
X_dev = tokenizer.texts_to_sequences(X_dev)
X_dev = pad_sequences(X_dev, maxlen=max_length, padding='post',truncating='post')

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=max_length, padding='post',truncating='post')

In [65]:
def accuracy_fn(y_true, y_pred):
  y_pred_rounded = torch.round(y_pred)  
  correct = torch.eq(y_true, y_pred_rounded).sum().item()
  acc = (correct/len(y_pred))*100
  return acc

from sklearn.metrics import accuracy_score, f1_score, classification_report
def f1_score_fn(y_true, y_pred):
  y_true = y_true.int().tolist()
  y_pred = torch.round(y_pred).int().tolist()
  f1_score_pos1 = f1_score(y_true=y_true, y_pred=y_pred, pos_label=1)
  f1_score_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
  # print(classification_report(y_true=y_true, y_pred=y_pred,zero_division=1))  
  return f1_score_pos1, f1_score_macro_average


## DataLoader

In [66]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, x_encoded: np.ndarray, y_encoded: pd.core.series.Series):
        # Setup
        self.x_encoded = x_encoded
        self.y_encoded = y_encoded.tolist()
    
    def __getitem__(self, idx):
        return (torch.FloatTensor(self.x_encoded[idx]), self.y_encoded[idx])
        # return (self.x_encoded[idx], self.y_encoded[idx])
        # return {'text': self.x[idx], 'label': self.y_encoded[idx]}
    
    def __len__(self):
        return self.x_encoded.shape[0]


In [67]:
train_data = CustomDataset(X_train, y_train)
dev_data = CustomDataset(X_dev, y_dev)
test_data = CustomDataset(X_test, y_test)

In [68]:
import torch
from sklearn.model_selection import StratifiedKFold

class StratifiedBatchSampler:
    """Stratified batch sampling
    Provides equal representation of target classes in each batch
    """
    def __init__(self, y, batch_size, shuffle=True):
        if torch.is_tensor(y):
            y = y.numpy()
        assert len(y.shape) == 1, 'label array must be 1D'
        n_batches = int(len(y) / batch_size)
        self.skf = StratifiedKFold(n_splits=n_batches, shuffle=shuffle)
        self.X = torch.randn(len(y),1).numpy()
        self.y = y
        self.shuffle = shuffle

    def __iter__(self):
        if self.shuffle:
            self.skf.random_state = torch.randint(0,int(1e8),size=()).item()
        for train_idx, test_idx in self.skf.split(self.X, self.y):
            yield test_idx

    def __len__(self):
        return len(self.y)

In [78]:
from torch.utils.data import DataLoader
BATCH_SIZE=64
train_dataloader = DataLoader(dataset=train_data,
                              batch_sampler=StratifiedBatchSampler(torch.tensor(train_data.y_encoded), batch_size=BATCH_SIZE))
dev_dataloader = DataLoader(dataset=dev_data,
                              batch_sampler=StratifiedBatchSampler(torch.tensor(dev_data.y_encoded), batch_size=BATCH_SIZE))                            
test_dataloader = DataLoader(dataset=test_data,
                                batch_sampler=StratifiedBatchSampler(torch.tensor(test_data.y_encoded), batch_size=BATCH_SIZE))

In [92]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocal_size, embedding_dim, hidden_size, output_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocal_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                            bidirectional=True, batch_first=True)
        self.dropout_lstm = nn.Dropout(p=0.2) 

        self.fc1 = nn.Linear(in_features=hidden_size*2, out_features=16)  # Kết nối trực tiếp LSTM với lớp fully connected 
        self.ln1 = nn.LayerNorm(16)  
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)



        self.fc2 = nn.Linear(in_features=16, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.dropout_lstm(x) 
        x = x[:, -1, :] # Lấy output của timestep cuối cùng

        x = self.fc1(x)
        x = self.ln1(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [93]:
LSTM_Model = LSTMModel(vocal_size=vocab_size, embedding_dim=embedding_dim,hidden_size=32, output_dim=1, num_layers=2)
LSTM_Model

LSTMModel(
  (embed): Embedding(50000, 128)
  (lstm): LSTM(128, 32, batch_first=True, bidirectional=True)
  (dropout_lstm): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=64, out_features=16, bias=True)
  (ln1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [94]:
def weighted_binary_cross_entropy(y_true, y_pred, pos_weight):
    """
    Weighted Binary Cross Entropy (WBCE) = - (w * y * log(p) + (1 - y) * log(1 - p))
    Trong đó:
    y: Nhãn thực tế (0 hoặc 1).
    p: Xác suất dự đoán cho lớp positive (nhãn 1).
    w: Trọng số cho lớp positive.
    """
    epsilon = 1e-7
    y_pred = torch.clamp(y_pred, epsilon, 1 - epsilon)  # giới hạn giá trị dự đoán trong khoảng (epsilon, 1 - epsilon)
    bce = - (pos_weight * y_true * torch.log(y_pred) +
              (1 - y_true) * torch.log(1 - y_pred))     # Binary Cross Entropy trong đo tăng tầm quan trọng khi dự đoán sai lớp 1
    return torch.mean(bce)

from collections import Counter
print(Counter(y_train))
neg_count = Counter(y_train)[0]
pos_count = Counter(y_train)[1]
pos_weight = torch.tensor((neg_count) / pos_count)

Counter({0: 18582, 1: 3960})


In [95]:
# loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(LSTM_Model.parameters(), lr=2e-5)

In [96]:
# Write a training and evaluationg loop for model_1
torch.manual_seed(42)

# import tqdm for progress bar
from tqdm.auto import tqdm

# Train for longer
epochs = 15

# # Put data on the target device
# X_padded_sequences, y_train = torch.tensor(X_padded_sequences).to(device), torch.tensor(y_train).to(device)
# padded_val_sequences, y_test=  torch.tensor(padded_val_sequences).to(device), torch.tensor(y_test).to(device)

# Huấn luyện mô hình
best_f1_score = 0
best_acc_score = 0

# Create training and test loop
for epoch in tqdm(range(epochs)):
  print(f"Epoch: {epoch}\n------")
  ### Training
  train_loss, train_acc=0,0
  f1_train_pos_1 = []
  f1_train_macro = []
  cnt = 0
  f1_score_list = []
  LSTM_Model.train()
  # Add a loop to loop through the training batches
  for batch, (X, y) in enumerate(train_dataloader):
    cnt+=1
    # 1. Forward
    X = X.long()
    y_pred = LSTM_Model(X)

    # 2. Calculate the loss
    # loss = loss_fn(y_pred.squeeze(), y.float().squeeze())
    loss = weighted_binary_cross_entropy(y_true=y.float().squeeze(), y_pred=y_pred.squeeze(), pos_weight=pos_weight)
    train_loss += loss.item()
    f1_pos_1, f1_macro = f1_score_fn(y_true= y.float(),
                            y_pred = y_pred.squeeze(dim=1))
    f1_train_pos_1.append(f1_pos_1.item())
    f1_train_macro.append(f1_macro.item())

    # 3.
    optimizer.zero_grad()

    # 4.
    loss.backward()

    # 5.
    optimizer.step()

    # 6. Calculate accuracy metric
    y_pred_class = torch.round(y_pred)
    train_acc += (y_pred_class==y).sum().item()/len(y_pred_class)

  # Divide total train loss by length of train dataloader
  # train_loss
  train_loss /= len(train_dataloader)
  train_acc /= len(train_dataloader)
  f1_train_pos_1 = sum(f1_train_pos_1)/len(f1_train_pos_1)
  f1_train_macro = sum(f1_train_macro)/len(f1_train_macro)

  ### Testing
  test_loss, test_acc = 0,0
  f1_test_pos_1 = []
  f1_test_macro = []
  LSTM_Model.eval()
  with torch.inference_mode():
    for batch, (X_test_, y_test_) in enumerate(dev_dataloader):
      # 1. Forward pass
      X_test_ = X_test_.long()
      test_pred = LSTM_Model(X_test_)

      # 2. Calculate the loss (accumulatively)
      test_loss += weighted_binary_cross_entropy(
                y_test_.float(), test_pred.squeeze(dim=1), pos_weight)
      # print(test_pred.shape)
      # 3. Calculate accuracy
      acc = accuracy_fn(y_true= y_test_.float(),
                        y_pred = test_pred.squeeze(dim=1))
      acc = (torch.round(test_pred)==y_test_).sum().item()/len(y_test_)
      test_acc += acc
      # 4. Calculate f1 score
      pos_1, macro = f1_score_fn(y_true= y_test_.float(),
                              y_pred = test_pred.squeeze(dim=1))
      f1_test_pos_1.append(pos_1.item())
      f1_test_macro.append(macro.item())

    # Calculate the test loss average per batch
    test_loss /= len(dev_dataloader)
    # test_loss /= len(test_dataloader)
    f1_test_pos_1 = sum(f1_test_pos_1)/len(f1_test_pos_1)
    f1_test_macro = sum(f1_test_macro)/len(f1_test_macro)
    # Calculate the test acc average per batch
    test_acc /= len(dev_dataloader)

  # print out what happen
  print(f"\nTrain loss: {train_loss:.4f}, Train acc: {train_acc:.4f}, F1 train pos 1 score: {f1_train_pos_1:.4f}, F1 train macro score: {f1_train_macro:.4f} | \nTest loss: {test_loss:.4f},   Test acc: {test_acc:.4f},   F1 test pos 1 score: {f1_test_pos_1:.4f}, F1 test macro scoreL {f1_test_macro:.4f}")
  # print(f"\nTrain loss; {train_loss:.4f}")

  # Lưu trữ trọng số mô hình tốt nhất
  if f1_test_pos_1 > best_f1_score-0.03 and test_acc > best_acc_score-0.03 and f1_test_pos_1 + test_acc > best_f1_score+best_acc_score:
    best_f1_score = f1_test_pos_1
    best_acc_score = test_acc
    torch.save(LSTM_Model.state_dict(), "best_model_bilstm_vihsd.pth")
    print("save model at this epoch")

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 0
------


KeyboardInterrupt: 

## Lưu model

In [None]:
LSTM_Model.state_dict()

OrderedDict([('embed.weight',
              tensor([[ 0.0041, -0.0737, -0.0539,  ..., -1.0554, -1.1585, -0.3546],
                      [ 0.6536, -0.2913,  1.2105,  ..., -1.5105, -0.1381,  1.5027],
                      [-0.4788,  0.5766,  0.4390,  ...,  0.9184,  0.5358, -0.0737],
                      ...,
                      [ 0.0737,  0.6009,  1.6344,  ..., -1.4264,  0.8952,  0.6773],
                      [ 0.1097,  0.3854, -1.3889,  ...,  1.2902,  1.0346, -0.0775],
                      [-0.3203, -0.0348, -1.3883,  ...,  0.7865, -0.3805, -0.0595]])),
             ('lstm.weight_ih_l0',
              tensor([[-0.0903, -0.1114,  0.0563,  ..., -0.0597, -0.0293, -0.0291],
                      [ 0.1450, -0.0268, -0.1452,  ..., -0.0335, -0.1432, -0.1107],
                      [-0.0587, -0.1304,  0.0320,  ..., -0.0369, -0.0744, -0.1328],
                      ...,
                      [ 0.0530,  0.0588, -0.1308,  ...,  0.0297,  0.0302, -0.0578],
                      [-0.1676, -0.004

## Load model

In [None]:
import torch
vocab_size=50000
embedding_dim=128
max_length=128

In [None]:
predict_model = LSTMModel(vocal_size=vocab_size, embedding_dim=embedding_dim,hidden_size=32, length=max_length, output_dim=1)
predict_model.load_state_dict(torch.load('best_model_lstm_vihsd.pth'))

<All keys matched successfully>

## Dự đoán

In [None]:
### preprocess test with 3 type: ViCTSD, ViHSD, ViMergre
def test_dataloader(dataset_test_type):
    test = load_data(set_name='test', dataset=dataset_test_type)
    test['label'] = test['label'].replace(2,1)
    test['text'] = test['text'].apply(tokenize)
    test_preprocess = preprocess_data(test,
                                    url=True,
                                    punctuation=True,
                                    lowercase=True,
                                    stopword=True,
                                    special_stopwords=special_stopwords,
                                    emoji=True)
    X_test = test_preprocess['text'].astype(str)
    y_test = test_preprocess['label']
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = pad_sequences(X_test, maxlen=max_length, padding='post',truncating='post')
    test_data = CustomDataset(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data,
                                    batch_sampler=StratifiedBatchSampler(torch.tensor(test_data.y_encoded), batch_size=BATCH_SIZE))
    return test_dataloader

vihsd_test = test_dataloader('vihsd')
victsd_test = test_dataloader('victsd')

In [None]:
test_loss, test_acc = 0,0
f1_test_pos_1 = []
f1_macro = []
predict_model.eval()
y_pred_list = []
y_real_list = []
loss_fn = nn.BCELoss()
for batch, (X_test_, y_test_) in enumerate(victsd_test):
    # 1. Forward pass
    X_test_ = X_test_.long()
    test_pred = predict_model(X_test_)

    y_pred_list.append(test_pred.tolist())
    y_real_list.append(y_test_.tolist())

    # 2. Calculate the loss (accumulatively)
    test_loss += loss_fn(test_pred.squeeze(dim=1), y_test_.float()).item()
    # print(test_pred.shape)
    # 3. Calculate accuracy
    acc = accuracy_fn(y_true= y_test_.float(),
                    y_pred = test_pred.squeeze(dim=1))
    acc = (torch.round(test_pred)==y_test_).sum().item()/len(y_test_)
    test_acc += acc
    # 4. Calculate f1 score
    pos_1, macro = f1_score_fn(y_true= y_test_.float(),
                            y_pred = test_pred.squeeze(dim=1))
    f1_test_pos_1.append(pos_1.item())
    f1_macro.append(macro.item())
# Calculate the test loss average per batch
test_loss /= len(victsd_test)
# test_loss /= len(test_dataloader)
f1_test_pos_1 = sum(f1_test_pos_1)/len(f1_test_pos_1)
f1_test_macro = sum(f1_macro)/len(f1_macro)
# Calculate the test acc average per batch
test_acc /= len(victsd_test)
print(f"Test loss: {test_loss:.4f},   Test acc: {test_acc:.4f},   F1 test score: {f1_test_pos_1:.4f}, F1 test macro: {f1_test_macro:.4f}")

Test loss: 0.0188,   Test acc: 0.7065,   F1 test score: 0.1489, F1 test macro: 0.4851


ViCTSD: Test loss: 0.0188,   Test acc: 0.7065,   F1 test score: 0.1548, F1 test macro: 0.4883
ViHSD: Test loss: 0.0131,   Test acc: 0.7261,   F1 test score: 0.4678, F1 test macro: 0.6828


In [None]:
flat_y_pred_list = sum(y_pred_list, [])
flat_y_pred_list = sum(flat_y_pred_list, [])
flat_y_pred_list = [round(x) for x in flat_y_pred_list]
flat_y_real_list = sum(y_real_list,[])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true=flat_y_real_list, y_pred=flat_y_pred_list))

              precision    recall  f1-score   support

           0       0.89      0.77      0.82       890
           1       0.11      0.25      0.16       110

    accuracy                           0.71      1000
   macro avg       0.50      0.51      0.49      1000
weighted avg       0.81      0.71      0.75      1000



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

model1 = Sequential()
model1.add(Embedding(tokenizer.vocab_size, 64))
model1.add(SimpleRNN(32))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))