# Huấn luyện trên bộ dữ liệu ViHSD
* Bao gồm dữ liệu thu thập từ mạng xã hội
* Dữ liệu có tính toxic cao - phân biệt chủng tộc, vùng miền, công kích cá nhân, chửi đổng

# Import library

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
import pyvi

In [2]:
import pandas as pd  # type: ignore
import string
import emoji_vietnamese  # type: ignore

def load_data(dataset='vihsd'):
    if dataset == 'vihsd':
        train = pd.read_csv('../Dataset_Reformat/train_vihsd.csv')
        dev = pd.read_csv('../Dataset_Reformat/dev_vihsd.csv')
        test = pd.read_csv('../Dataset_Reformat/test_vihsd.csv')
        train['label'], dev['label'], test['label'] = train['label'].replace(2,1), dev['label'].replace(2,1), test['label'].replace(2,1)
    elif dataset == 'victsd':
        train = pd.read_csv('../Dataset_Reformat/train_victsd.csv')
        dev = pd.read_csv('../Dataset_Reformat/dev_victsd.csv')
        test = pd.read_csv('../Dataset_Reformat/test_victsd.csv')
    elif dataset == 'merged':
        train = pd.read_csv('../Dataset_Reformat/train_merged.csv')
        dev = pd.read_csv('../Dataset_Reformat/dev_merged.csv')
        test = pd.read_csv('../Dataset_Reformat/test_merged.csv')
        train['label'], dev['label'], test['label'] = train['label'].replace(2,1), dev['label'].replace(2,1), test['label'].replace(2,1)
    return train, dev, test


def preprocess_data(
    data,
    url=True,
    punctuation=True,
    lowercase=True,
    stopword=False,
    special_stopwords=[],
    emoji=False
):
    # Load stopwords
    with open('./utility/Stopwords/vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as f:
        stopwords = f.read().splitlines()
    for word in special_stopwords:
        stopwords.remove(word)
    # Function to remove stopwords
    def remove_stopwords(text):
        words = text.split()
        words = [word for word in words if word not in stopwords]
        return ' '.join(words)
    if url:
        # Remove URLs
        data['text'] = data['text'].str.replace(
            r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
    if punctuation:
        # Remove punctuation
        data['text'] = data['text'].str.replace(
            '['+string.punctuation+']', '', regex=True)
    if lowercase:
        # Lowercase
        data['text'] = data['text'].str.lower()
    if stopword:
        # Remove stopword
        data['text'] = data['text'].apply(remove_stopwords)
    if emoji:
        # Remove emojis
        data['text'] = data['text'].apply(emoji_vietnamese.demojize)
    return data


# Setup data

## Split train, dev, test data

In [3]:
train, dev, test = load_data(dataset='vihsd')

## Preprocess data

* Remove url in comment
* remove punctuation
* Lowercase data
* Remove stopwords
* Remove emoji
* Tokenize data

In [4]:
special_stopwords = ["không","không_có","không_thể","chưa", "được"] 

In [5]:
dev_preprocess = preprocess_data(dev,
                                 url=True,
                                 punctuation=True,
                                 lowercase=True,
                                 stopword=True,
                                 special_stopwords=special_stopwords,
                                 emoji=True)
train_preprocess = preprocess_data(train,
                                   url=True,
                                   punctuation=True,
                                   lowercase=True,
                                   stopword=True,
                                   special_stopwords=special_stopwords,
                                   emoji=True)

In [6]:
test_preprocess = preprocess_data(test,
                                    url=True,
                                    punctuation=True,
                                    lowercase=True,
                                    stopword=True,
                                    special_stopwords=special_stopwords,
                                    emoji=True)

In [7]:
train_preprocess[:5]

Unnamed: 0,text,label
0,được fan cứng nè :bộ chọn trái tim màu đỏ: rea...,0
1,bọn mắt híp lò xo thụt việt nam t 10 r bọn t g...,1
2,đậu văn cường thằng sida,0
3,côn đồ cục súc vô nhân đề nghi vn vn ban thưởng,1
4,lý thuyết thực hành 1 câu,0


### Get text and label each DataFrame

In [8]:
X_train = train['text'].astype(str)
y_train = train['label']
X_dev = dev['text'].astype(str)
y_dev = dev['label']
X_test = test['text'].astype(str)
y_test = test['label']

In [9]:
# Để lấy dữ liệu cho nhanh chứ ko up lên git
train_preprocess.to_csv('preprocessed_train_data.csv', index=False)
dev_preprocess.to_csv('preprocessed_dev_data.csv', index=False)
test.to_csv('preprocessed_test_data.csv', index=False)


### Load data from csv

In [39]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


vocab_size=50000
embedding_dim=128
max_length=128

train_data = pd.read_csv('preprocessed_train_data.csv')
dev_data = pd.read_csv('preprocessed_dev_data.csv')
test_data = pd.read_csv('preprocessed_test_data.csv')

# X_train, y_train,_,_ = train_test_split(train_data['text'], train_data['label'], test_size=0.99, random_state=42)
# X_val, y_val,_,_ = train_test_split(dev_data['text'], dev_data['label'], test_size=0.99, random_state=42)
# X_train= X_train.astype(str)
# X_val= X_val.astype(str)

X_train, y_train = train_data['text'].astype(str), train_data['label']
X_val, y_val = dev_data['text'].astype(str), dev_data['label']
X_test, y_test = test_data['text'].astype(str), test_data['label']

# Tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post',truncating='post')

# Thực hiện thay đổi test để đưa vào tính toán val_acc
X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=max_length, padding='post',truncating='post')

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=max_length, padding='post',truncating='post')

In [40]:
def accuracy_fn(y_true, y_pred):
  y_pred_rounded = torch.round(y_pred)  
  correct = torch.eq(y_true, y_pred_rounded).sum().item()
  acc = (correct/len(y_pred))*100
  return acc

from sklearn.metrics import accuracy_score, f1_score, classification_report
def f1_score_fn(y_true, y_pred):
  y_true = y_true.int()
  y_pred = torch.round(y_pred)
  y_pred = y_pred.int()
  y_true=y_true.tolist()
  y_pred = y_pred.tolist()
  f1_score_value = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
  # print(classification_report(y_true=y_true, y_pred=y_pred,zero_division=1))  
  return f1_score_value


## DataLoader

In [41]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, x_encoded: np.ndarray, y_encoded: pd.core.series.Series):
        # Setup
        self.x_encoded = x_encoded
        self.y_encoded = y_encoded.tolist()
    
    def __getitem__(self, idx):
        return (torch.FloatTensor(self.x_encoded[idx]), self.y_encoded[idx])
        # return (self.x_encoded[idx], self.y_encoded[idx])
        # return {'text': self.x[idx], 'label': self.y_encoded[idx]}
    
    def __len__(self):
        return self.x_encoded.shape[0]


In [42]:
train_data = CustomDataset(X_train, y_train)
val_data = CustomDataset(X_val, y_val)
test_data = CustomDataset(X_test, y_test)

In [14]:
len(train_data)

22542

In [15]:
import torch
from sklearn.model_selection import StratifiedKFold

class StratifiedBatchSampler:
    """Stratified batch sampling
    Provides equal representation of target classes in each batch
    """
    def __init__(self, y, batch_size, shuffle=True):
        if torch.is_tensor(y):
            y = y.numpy()
        assert len(y.shape) == 1, 'label array must be 1D'
        n_batches = int(len(y) / batch_size)
        self.skf = StratifiedKFold(n_splits=n_batches, shuffle=shuffle)
        self.X = torch.randn(len(y),1).numpy()
        self.y = y
        self.shuffle = shuffle

    def __iter__(self):
        if self.shuffle:
            self.skf.random_state = torch.randint(0,int(1e8),size=()).item()
        for train_idx, test_idx in self.skf.split(self.X, self.y):
            yield test_idx

    def __len__(self):
        return len(self.y)

In [16]:
from torch.utils.data import DataLoader
BATCH_SIZE=32
train_dataloader = DataLoader(dataset=train_data,
                              batch_sampler=StratifiedBatchSampler(torch.tensor(train_data.y_encoded), batch_size=BATCH_SIZE))
test_dataloader = DataLoader(dataset=val_data,
                              batch_sampler=StratifiedBatchSampler(torch.tensor(val_data.y_encoded), batch_size=BATCH_SIZE))                            

In [17]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocal_size, embedding_dim, hidden_size, output_dim, length):
        super(LSTMModel, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocal_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.dropout_lstm = nn.Dropout(p=0.2)  # Dropout sau LSTM
        self.flatten = nn.Flatten(start_dim=1)

        self.fc1 = nn.Linear(in_features=length*hidden_size, out_features=length*hidden_size//4)
        self.bn1 = nn.BatchNorm1d(length*hidden_size//4) 

        self.fc2 = nn.Linear(in_features=length*hidden_size//4, out_features=length*hidden_size//16)
        self.bn2 = nn.BatchNorm1d(length*hidden_size//16) 

        self.fc3 = nn.Linear(in_features=length*hidden_size//16, out_features=length*hidden_size//64)
        self.bn3 = nn.BatchNorm1d(length*hidden_size//64) 

        self.fc4 = nn.Linear(in_features=length*hidden_size//64, out_features=output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embed(x)

        x, _ = self.lstm(x)
        x = self.dropout_lstm(x) # Áp dụng Dropout sau LSTM
        x = self.flatten(x)

        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.bn2(x) 
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc3(x)
        x = self.bn3(x) 
        x = self.relu(x)

        x = self.fc4(x)
        x = self.sigmoid(x)
        return x

In [18]:
LSTM_Model = LSTMModel(vocal_size=vocab_size, embedding_dim=embedding_dim,hidden_size=32, length=max_length, output_dim=1)
LSTM_Model

LSTMModel(
  (embed): Embedding(50000, 128)
  (lstm): LSTM(128, 32, batch_first=True)
  (dropout_lstm): Dropout(p=0.2, inplace=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=4096, out_features=1024, bias=True)
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=1024, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

In [19]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(LSTM_Model.parameters(), lr=2e-5)

In [20]:
# Write a training and evaluationg loop for model_1
torch.manual_seed(42)

# import tqdm for progress bar
from tqdm.auto import tqdm

# Train for longer
epochs = 10

# # Put data on the target device
# X_padded_sequences, y_train = torch.tensor(X_padded_sequences).to(device), torch.tensor(y_train).to(device)
# padded_val_sequences, y_test=  torch.tensor(padded_val_sequences).to(device), torch.tensor(y_test).to(device)

# Create training and test loop
for epoch in tqdm(range(epochs)):
  print(f"Epoch: {epoch}\n------")
  ### Training
  train_loss, train_acc=0,0
  f1_train = []
  cnt = 0
  f1_score_list = []
  LSTM_Model.train()
  # Add a loop to loop through the training batches
  for batch, (X, y) in enumerate(train_dataloader):
    cnt+=1
    # 1. Forward
    X = X.long()
    y_pred = LSTM_Model(X)

    # 2. Calculate the loss
    loss = loss_fn(y_pred.squeeze(), y.float().squeeze())
    train_loss += loss.item()
    current_f1 = f1_score_fn(y_true= y.float(),
                            y_pred = y_pred.squeeze(dim=1)).item()
    f1_score_list.append(current_f1)
    f1_train.append(current_f1)

    # 3.
    optimizer.zero_grad()

    # 4.
    loss.backward()

    # 5.
    optimizer.step()

    # 6. Calculate accuracy metric
    y_pred_class = torch.round(y_pred)
    train_acc += (y_pred_class==y).sum().item()/len(y_pred_class)

  # Divide total train loss by length of train dataloader
  # train_loss
  train_loss /= len(train_dataloader)
  train_acc /= len(train_dataloader)
  f1_train = sum(f1_train)/len(f1_train)

  ### Testing
  test_loss, test_acc = 0,0
  f1_test = []
  LSTM_Model.eval()
  with torch.inference_mode():
    for batch, (X_test_, y_test_) in enumerate(test_dataloader):
      # 1. Forward pass
      X_test_ = X_test_.long()
      test_pred = LSTM_Model(X_test_)

      # 2. Calculate the loss (accumulatively)
      test_loss += loss_fn(test_pred.squeeze(dim=1), y_test_.float()).item()
      # print(test_pred.shape)
      # 3. Calculate accuracy
      acc = accuracy_fn(y_true= y_test_.float(),
                        y_pred = test_pred.squeeze(dim=1))
      acc = (torch.round(test_pred)==y_test_).sum().item()/len(y_test_)
      test_acc += acc
      # 4. Calculate f1 score
      f1_test.append(f1_score_fn(y_true= y_test_.float(),
                              y_pred = test_pred.squeeze(dim=1)).item())
    # Calculate the test loss average per batch
    test_loss /= len(test_dataloader)
    # test_loss /= len(test_dataloader)
    f1_test = sum(f1_test)/len(f1_test)
    # Calculate the test acc average per batch
    test_acc /= len(test_dataloader)

  # print out what happen
  print(f"\nTrain loss: {train_loss:.4f}, Train acc: {train_acc:.4f}, F1 train score: {f1_train:.4f} | \nTest loss: {test_loss:.4f},   Test acc: {test_acc:.4f},   F1 test score: {f1_test:.4f}")
  # print(f"\nTrain loss; {train_loss:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0
------

Train loss: 0.0220, Train acc: 0.5028, F1 train score: 0.4725 | 
Test loss: 0.0232,   Test acc: 0.4548,   F1 test score: 0.4649
Epoch: 1
------

Train loss: 0.0183, Train acc: 0.7239, F1 train score: 0.5713 | 
Test loss: 0.0207,   Test acc: 0.6249,   F1 test score: 0.5560
Epoch: 2
------

Train loss: 0.0164, Train acc: 0.7799, F1 train score: 0.5552 | 
Test loss: 0.0188,   Test acc: 0.7174,   F1 test score: 0.5797
Epoch: 3
------

Train loss: 0.0151, Train acc: 0.7927, F1 train score: 0.5483 | 
Test loss: 0.0171,   Test acc: 0.7430,   F1 test score: 0.5812
Epoch: 4
------

Train loss: 0.0142, Train acc: 0.7986, F1 train score: 0.5455 | 
Test loss: 0.0163,   Test acc: 0.7470,   F1 test score: 0.5837
Epoch: 5
------

Train loss: 0.0136, Train acc: 0.8001, F1 train score: 0.5474 | 
Test loss: 0.0154,   Test acc: 0.7452,   F1 test score: 0.5947
Epoch: 6
------

Train loss: 0.0132, Train acc: 0.7982, F1 train score: 0.5617 | 
Test loss: 0.0149,   Test acc: 0.7446,   F1 test

## Lưu model

In [32]:
torch.save(LSTM_Model.state_dict(), 'LSTM.pth')

In [33]:
LSTM_Model.state_dict()

OrderedDict([('embed.weight',
              tensor([[-1.2278, -0.0290,  0.1851,  ...,  1.5826,  1.1992,  1.5305],
                      [ 0.0214,  1.5840, -0.2902,  ..., -0.6611, -0.5091,  1.3219],
                      [ 0.5880, -1.2119,  0.2408,  ...,  0.5963,  0.7509,  0.5156],
                      ...,
                      [-0.0787, -0.9901,  0.8272,  ..., -0.4542,  1.5091, -0.7681],
                      [-1.2770, -0.0990, -2.4526,  ...,  0.2013, -0.2149,  0.1755],
                      [-0.8604, -0.4840,  0.4810,  ...,  1.6619,  1.3122, -0.9310]])),
             ('lstm.weight_ih_l0',
              tensor([[ 0.0185, -0.0472, -0.0084,  ..., -0.0071,  0.1069, -0.0383],
                      [ 0.0451,  0.0761, -0.1067,  ...,  0.0952, -0.0416,  0.1564],
                      [-0.0712,  0.0067,  0.0232,  ..., -0.1420,  0.0660,  0.1597],
                      ...,
                      [-0.1057, -0.1789, -0.1091,  ..., -0.1048,  0.0615, -0.0185],
                      [ 0.1228, -0.153

## Load model

In [34]:
predict_model = LSTMModel(vocal_size=vocab_size, embedding_dim=embedding_dim,hidden_size=32, length=max_length, output_dim=1)
predict_model.load_state_dict(torch.load('LSTM.pth'))

<All keys matched successfully>

## Dự đoán

### Nhập câu muốn dự đoán

In [43]:
test_loss, test_acc = 0,0
f1_test = []
LSTM_Model.eval()
for batch, (X_test_, y_test_) in enumerate(test_dataloader):
    # 1. Forward pass
    X_test_ = X_test_.long()
    test_pred = LSTM_Model(X_test_)

    # 2. Calculate the loss (accumulatively)
    test_loss += loss_fn(test_pred.squeeze(dim=1), y_test_.float()).item()
    # print(test_pred.shape)
    # 3. Calculate accuracy
    acc = accuracy_fn(y_true= y_test_.float(),
                    y_pred = test_pred.squeeze(dim=1))
    acc = (torch.round(test_pred)==y_test_).sum().item()/len(y_test_)
    test_acc += acc
    # 4. Calculate f1 score
    f1_test.append(f1_score_fn(y_true= y_test_.float(),
                            y_pred = test_pred.squeeze(dim=1)).item())
# Calculate the test loss average per batch
test_loss /= len(test_dataloader)
# test_loss /= len(test_dataloader)
f1_test = sum(f1_test)/len(f1_test)
# Calculate the test acc average per batch
test_acc /= len(test_dataloader)
print(f"Test loss: {test_loss:.4f},   Test acc: {test_acc:.4f},   F1 test score: {f1_test:.4f}")

Test loss: 0.0135,   Test acc: 0.7427,   F1 test score: 0.6301
