## Import library

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
import pyvi

## Setup data

### Data path

In [2]:
ViCTSD_train = "..\\Dataset\\ViCTSD\\ViCTSD_train.csv"
ViCTSD_test = "..\\Dataset\\ViCTSD\\ViCTSD_test.csv"
vihsd_train = "..\\Dataset\\vihsd\\train.csv"
vihsd_test = "..\\Dataset\\vihsd\\test.csv"

In [3]:
from datasets import load_dataset
local_csv = load_dataset("csv", data_files=ViCTSD_train, sep=',')

In [4]:
local_csv

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Comment', 'Constructiveness', 'Toxicity', 'Title', 'Topic'],
        num_rows: 7000
    })
})

### Visualize data

* Visualize record
* Unique labels each path

In [5]:
ViCTSD_train_data = pd.read_csv(ViCTSD_train)[['Comment', 'Toxicity']]
ViCTSD_test_data = pd.read_csv(ViCTSD_test)[['Comment', 'Toxicity']]
vihsd_train_data = pd.read_csv(vihsd_train)
vihsd_test_data = pd.read_csv(vihsd_test)

### Get text and label each DataFrame

In [6]:
vihsd_train_data = vihsd_train_data.rename(columns={'free_text': 'Comment', 'label_id': 'Toxicity'})
vihsd_test_data = vihsd_test_data.rename(columns={'free_text': 'Comment', 'label_id': 'Toxicity'})

In [7]:
vihsd_train_data['Toxicity'] = vihsd_train_data['Toxicity'].replace(2, 1)
vihsd_test_data['Toxicity'] = vihsd_test_data['Toxicity'].replace(2, 1)

### Merge DataFrame

In [8]:
df_train_joined = pd.concat([ViCTSD_train_data, vihsd_train_data], ignore_index=True)
df_test_joined = pd.concat([ViCTSD_test_data, vihsd_test_data], ignore_index=True)

In [9]:
# Setup type for data in Comment column
df_train_joined['Comment'] = df_train_joined['Comment'].astype(str)
df_test_joined['Comment'] = df_test_joined['Comment'].astype(str)

### Check null and remove null data

In [10]:
# Lấy các cột có giá trị null
def check_null_record(dataframe: pd.DataFrame, column_name: str):
    columns_with_nulls = dataframe[column_name].isnull()
    null_comment_indices = columns_with_nulls[columns_with_nulls == True].index
    # print null dataframe
    print(dataframe.iloc[null_comment_indices])
    return null_comment_indices

In [11]:
check_null_record(df_train_joined, 'Comment')

Empty DataFrame
Columns: [Comment, Toxicity]
Index: []


Index([], dtype='int64')

In [12]:
df_train_joined.drop(check_null_record(df_train_joined, 'Comment'), inplace=True)

Empty DataFrame
Columns: [Comment, Toxicity]
Index: []


### Preprocess data function
* Tokenize: tuyệt vời -> tuyệt_vời, ngu ngốc -> ngu_ngốc
* Loại stopwords: Thật tuyệt_vời -> tuyệt_vời
* Loại link, hashtag và tag trong comment

In [13]:
from pyvi import ViTokenizer, ViPosTagger
# word seqmentation
# ML không bắt buộc seqmentation
def tokenize(text):
    """
    Thật tuyệt vời -> Thật tuyệt_vời
    """
    return ViTokenizer.tokenize(text)

In [14]:
import os
with open('Stopwords\\vietnamese-stopwords-dash.txt', 'r', encoding='utf-8') as f:
    # Đọc từng dòng trong file
    stopwords = [line.strip() for line in f]
sus_stopwords = ["không","không_có","không_thể","chưa"]
for sus_stopword in sus_stopwords:
  stopwords.remove(sus_stopword)

In [15]:
def lower_and_remove_stopwords(text):
  try:
    # Chuyển đổi văn bản sang chữ thường
    text = text.lower()

    # Tách văn bản thành danh sách các từ
    words = text.split()

    # Loại bỏ stopword
    filtered_words = [word for word in words if word not in stopwords]

    # Ghép danh sách các từ đã lọc lại thành văn bản
    filtered_text = ' '.join(filtered_words)
  except Exception as e:
    print(text)
  return filtered_text


In [16]:
import re

def remove_links_hashtag_tag(text):
    """
    Removes URLs from a text string.

    Args:
        text: The text string to process.

    Returns:
        The text string with URLs removed.
    """
    link_remover = r"(https?://[^\s]+)"
    hashtag_remover = r"# [^\s]+"
    tag_remover = r"@ [^\s]+"

    text = re.sub(link_remover, "", text)
    text = re.sub(hashtag_remover, "", text)
    text = re.sub(tag_remover, "", text)
  
    return text

In [17]:
# Calculate accuracy - out of 100 examples, what percentage does our model get right?
def accuracy_fn(y_true, y_pred):
  y_pred_rounded = torch.round(y_pred)  
  correct = torch.eq(y_true, y_pred_rounded).sum().item()
  try:
    acc = (correct/len(y_pred))*100
  except ZeroDivisionError:
    print("ZeroDivisionError")
    print(y_true, y_pred)
  return acc

#### Test một dữ liệu khi qua quá trình chuẩn bị

In [18]:
test_text = "bánh này không ngon"

In [19]:
test_text = ViTokenizer.tokenize(test_text)
test_text

'bánh này không ngon'

In [20]:
test_text = lower_and_remove_stopwords(test_text)
print(test_text)
test_text = remove_links_hashtag_tag(test_text)
test_text

bánh không ngon


'bánh không ngon'

In [21]:
text = "Đây là một ví dụ về #hashtag, http://www.example.com và @username."
result = remove_links_hashtag_tag(text)
result

'Đây là một ví dụ về #hashtag,  và @username.'

## Create train, test

X_train, X_test, y_train, y_test

In [22]:
def preprocess_data(path1, path2):
    """
    1. Get data from 2 paths
    2. Concat 2 dataframes
    3. Tokenize -> lower and remove stopwords -> remove links, hashtags, tags
    4. Get X, y from dataframe (input, output)
    """
    train_data1 = pd.read_csv(path1)[['Comment', 'Toxicity']]
    train_data2 = pd.read_csv(path2)
    train_data2 = train_data2.rename(columns={'free_text': 'Comment', 'label_id': 'Toxicity'})
    train_data2['Toxicity'] = train_data2['Toxicity'].replace(2, 1)
    df_joined = pd.concat([train_data1, train_data2], ignore_index=True)
    df_joined['Comment'] = df_joined['Comment'].astype(str)
    df_joined['Comment'] = df_joined['Comment'].apply(tokenize)
    df_joined['Comment'] = df_joined['Comment'].apply(lower_and_remove_stopwords)
    df_joined['Comment'] = df_joined['Comment'].apply(remove_links_hashtag_tag)
    X = df_joined['Comment']
    y = df_joined['Toxicity'].tolist()
    return X, y, df_joined


In [66]:
X, y, train_data = preprocess_data(ViCTSD_train, vihsd_train)
X_test, y_test, test_data = preprocess_data(ViCTSD_test, vihsd_test)

In [24]:
train_data.to_csv('preprocessed_train_data.csv', index=False)
test_data.to_csv('preprocessed_test_data.csv', index=False)


### Load data from csv

In [25]:
import pandas as pd
train_data = pd.read_csv('preprocessed_train_data.csv')
test_data = pd.read_csv('preprocessed_test_data.csv')
X, y = train_data['Comment'].astype(str), train_data['Toxicity']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size=50000
embedding_dim=32
max_length=140

# Tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded_sequences = pad_sequences(X_train_sequences, maxlen=max_length, padding='post',truncating='post')

# Thực hiện thay đổi test để đưa vào tính toán val_acc
X_val = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val, maxlen=max_length, padding='post',truncating='post')

## Transforming data 

In [26]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

class CustomDatasetv1(Dataset):
    def __init__(self, x_encoded: np.ndarray, y_encoded: pd.core.series.Series):
        # Setup
        self.x_encoded = x_encoded
        self.y_encoded = y_encoded.tolist()
    
    def __getitem__(self, idx):
        return (torch.FloatTensor(self.x_encoded[idx]), self.y_encoded[idx])
        # return (self.x_encoded[idx], self.y_encoded[idx])
        # return {'text': self.x[idx], 'label': self.y_encoded[idx]}
    
    def __len__(self):
        return self.x_encoded.shape[0]


In [27]:
train_data = CustomDatasetv1(X_train_padded_sequences, y_train)
test_data = CustomDatasetv1(X_val_padded, y_val)

In [28]:
len(train_data)

27943

In [29]:
from torch.utils.data import DataLoader
BATCH_SIZE=32
train_dataloader = DataLoader(dataset=train_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
test_dataloader = DataLoader(dataset=test_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True)                            

In [30]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x229845a8390>

In [32]:
from torch import nn

class ReshapeLayer(nn.Module):
    def forward(self, x):
        batch_size, sequence_length, embedding_dim = x.size()
        return x.view(batch_size, -1)

class model(nn.Module):
    def __init__(self, vocab_size:int, embedding_dim:int):
        super().__init__()
        self.se = nn.Sequential(
            nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim),
            ReshapeLayer(),  # Thêm Reshape Layer mới
            nn.Linear(in_features=140 * embedding_dim, out_features=20),
            nn.ReLU(),
            nn.Linear(in_features=20, out_features=20),
            nn.ReLU(),
            nn.Linear(in_features=20, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.se(x)

In [49]:
model = model(vocab_size=vocab_size, embedding_dim=embedding_dim)
model

model(
  (se): Sequential(
    (0): Embedding(50000, 32)
    (1): ReshapeLayer()
    (2): Linear(in_features=4480, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=20, bias=True)
    (5): ReLU()
    (6): Linear(in_features=20, out_features=1, bias=True)
    (7): Sigmoid()
  )
)

In [50]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [51]:
y_pred_record = []

In [52]:
# Write a training and evaluationg loop for model_1
torch.manual_seed(42)

# import tqdm for progress bar
from tqdm.auto import tqdm

# Train for longer
epochs = 10

# # Put data on the target device
# X_padded_sequences, y_train = torch.tensor(X_padded_sequences).to(device), torch.tensor(y_train).to(device)
# padded_val_sequences, y_test=  torch.tensor(padded_val_sequences).to(device), torch.tensor(y_test).to(device)

# Create training and test loop
for epoch in tqdm(range(epochs)):
  print(f"Epoch: {epoch}\n------")

  ### Training
  train_loss=0
  model.train()
  # Add a loop to loop through the training batches
  for batch, (X, y) in enumerate(train_dataloader):
    # print(f"Batch: {batch}")

    # 1. Forward
    X = X.long()
    y_pred = model(X)

    # 2. Calculate the loss
    y_pred_record.append(y_pred)
    loss = loss_fn(y_pred.squeeze(), y.float().squeeze())
    train_loss += loss

    # 3.
    optimizer.zero_grad()

    # 4.
    loss.backward()

    # 5.
    optimizer.step()

    # Print out what's happening
    # if batch %50==0:
    #   print(f"Looked at {batch*len(X)}/{len(train_dataloader.dataset)} samples.")

  # Divide total train loss by length of train dataloader
  train_loss /= len(train_dataloader)

  ### Testing
  test_loss, test_acc = 0,0
  model.eval()
  with torch.inference_mode():
    for X_test, y_test in test_dataloader:
      # 1. Forward pass
      X_test = X_test.long()
      test_pred = model(X_test)

      # 2. Calculate the loss (accumulatively)
      test_loss += loss_fn(test_pred.squeeze(dim=1), y_test.float())

      # 3. Calculate accuracy
      test_acc += accuracy_fn(y_true= y_test.float(),
                              y_pred = test_pred.squeeze(dim=1))

    # Calculate the test loss average per batch
    test_loss /= len(test_dataloader)

    # Calculate the test acc average per batch
    test_acc /= len(test_dataloader)

  # print out what happen
  print(f"\nTrain loss; {train_loss:.4f} | Test loss: {test_loss:.4f}, Test acc: {test_acc:.4f}")
  # print(f"\nTrain loss; {train_loss:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0
------

Train loss; 0.4343 | Test loss: 0.4217, Test acc: 84.9490
Epoch: 1
------

Train loss; 0.3903 | Test loss: 0.3626, Test acc: 85.5548
Epoch: 2
------

Train loss; 0.3156 | Test loss: 0.3774, Test acc: 86.0651
Epoch: 3
------

Train loss; 0.2700 | Test loss: 0.3815, Test acc: 82.8444
Epoch: 4
------

Train loss; 0.2373 | Test loss: 0.3429, Test acc: 87.0217
Epoch: 5
------

Train loss; 0.2136 | Test loss: 0.3862, Test acc: 83.9923
Epoch: 6
------

Train loss; 0.1947 | Test loss: 0.3609, Test acc: 86.7347
Epoch: 7
------

Train loss; 0.1760 | Test loss: 0.3696, Test acc: 86.6071
Epoch: 8
------

Train loss; 0.1589 | Test loss: 0.4115, Test acc: 85.6824
Epoch: 9
------

Train loss; 0.1451 | Test loss: 0.4359, Test acc: 86.9579


In [53]:
len(y_pred_record[-1])

7

In [54]:
torch.round(y_pred_record[-5])

tensor([[1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.]], grad_fn=<RoundBackward0>)

## Lưu model

In [None]:
torch.save(model.state_dict(), 'my_model.pth')

## Load model

In [33]:
predict_model = model(vocab_size=vocab_size, embedding_dim=embedding_dim)
predict_model.load_state_dict(torch.load('my_model.pth'))

<All keys matched successfully>

## Dự đoán

### Nhập câu muốn dự đoán

In [64]:
test_sentences = [  "Đồ ngu, sao mày ngu thế?",
                    "Con này xấu xí, béo ú, chẳng ra gì cả.",
                    "Đồ đàn bà con gái chỉ biết ăn bám đàn ông.",
                    "Sản phẩm này tệ hại, ai mua cũng hối hận.",
                    "Mày hát dở tệ, đừng bao giờ hát nữa nhé.",
                    "Mày dám nói tao ngu à? Thử lặp lại lần nữa xem!",
                    "Đời mày xuống lỗ rồi con ạ",
                    "Ngu thì nín :))",
                    "khôn như vợ chồng nhà này k biết bao giờ mới giỗ đầu bạn nhỉ :))",
                    "toàn cái bọn lúc vay muốn dc vc.khi xong vc rồi thì vẩy đuôi b oi.giải thích bọn nó cố tính đéo hiểu đâu.còn vay gì trả đấy đơn giản chỉ thế.",
                    "sản phẩm này bố ỉa vào mà mua",
                    "bạn thật xinh đẹp",
                    "cộng hòa xã hội chủ nghĩa việt nam muôn năm",
                    "em thật dễ thương",
                    "hôm nay ăn gì",
                    "đầu bạn nhìn như đầu buổi"
]

In [69]:
X[0]

'tuyệt_vời ... ! ! !'

In [62]:
def predict(test_sentences):
    test_seq = tokenize(test_sentences)
    test_seq = lower_and_remove_stopwords(test_seq)
    test_seq = remove_links_hashtag_tag(test_seq)
    test_seq = tokenizer.texts_to_sequences([test_seq])
    # print(test_seq)
    test_seq = pad_sequences(test_seq, maxlen=max_length, padding='post',truncating='post')
    # print(test_seq)
    print(predict_model(torch.tensor(test_seq)))

In [65]:
for sentence in test_sentences:
    predict(sentence)

tensor([[0.9287]], grad_fn=<SigmoidBackward0>)
tensor([[0.9278]], grad_fn=<SigmoidBackward0>)
tensor([[0.9272]], grad_fn=<SigmoidBackward0>)
tensor([[0.0526]], grad_fn=<SigmoidBackward0>)
tensor([[0.9063]], grad_fn=<SigmoidBackward0>)
tensor([[0.9006]], grad_fn=<SigmoidBackward0>)
tensor([[0.5593]], grad_fn=<SigmoidBackward0>)
tensor([[0.9287]], grad_fn=<SigmoidBackward0>)
tensor([[0.0004]], grad_fn=<SigmoidBackward0>)
tensor([[0.9287]], grad_fn=<SigmoidBackward0>)
tensor([[0.0118]], grad_fn=<SigmoidBackward0>)
tensor([[0.0008]], grad_fn=<SigmoidBackward0>)
tensor([[0.6395]], grad_fn=<SigmoidBackward0>)
tensor([[9.2950e-05]], grad_fn=<SigmoidBackward0>)
tensor([[0.0052]], grad_fn=<SigmoidBackward0>)
tensor([[0.0484]], grad_fn=<SigmoidBackward0>)


In [71]:
def predictV1(test_sentences):
    test_seq = tokenize(test_sentences)
    test_seq = lower_and_remove_stopwords(test_seq)
    test_seq = remove_links_hashtag_tag(test_seq)
    test_seq = tokenizer.texts_to_sequences([test_seq])
    # print(test_seq)
    test_seq = pad_sequences(test_seq, maxlen=max_length, padding='post',truncating='post')
    # print(test_seq)
    return predict_model(torch.tensor(test_seq))

In [75]:
predictV1(X_test[0])<0.5

tensor([[True]])

In [77]:
num_record = len(X_test)
is_true = 0
true_0 = 0
true_1 = 0
test_predict = []
for i in range(len(X_test)):
    if predictV1(X_test[i])<0.5:
        test_predict.append(0)
    else:
        test_predict.append(1)
    

In [79]:
torch.eq(torch.tensor(test_predict), torch.tensor(y_test)).sum().item()

6708

In [80]:
6708/num_record

0.8734375

In [None]:
# "Đồ ngu, sao mày ngu thế?", 1
# "Con này xấu xí, béo ú, chẳng ra gì cả.", 1
# "Đồ đàn bà con gái chỉ biết ăn bám đàn ông.",
# "Sản phẩm này tệ hại, ai mua cũng hối hận.",
# "Mày hát dở tệ, đừng bao giờ hát nữa nhé.", 0 
# "Mày dám nói tao ngu à? Thử lặp lại lần nữa xem!",
# "Đời mày xuống lỗ rồi con ạ", 0
# "Ngu thì nín :))",
# "khôn như vợ chồng nhà này k biết bao giờ mới giỗ đầu bạn nhỉ :))", 0 
# "toàn cái bọn lúc vay muốn dc vc.khi xong vc rồi thì vẩy đuôi b oi.g
# "sản phẩm này bố ỉa vào mà mua"

In [None]:
# import csv
# filename = "E:\\AI Project\\word2vec_vi_syllables_100dims.txt"
# chunksize = 10 ** 6
# for chunk in pd.read_csv(filename, chunksize=chunksize, on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf-8'):
#     # chunk is a DataFrame. To "process" the rows in the chunk:
#     for index, row in chunk.iterrows():
#         print(row)