In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Reshape, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam

In [4]:
import os
import sys

# Notebooks do not define __file__, so the *string* '__file__' is resolved
# against the current working directory; its dirname is that working directory
# and project_dir is therefore the working directory's parent.
project_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath('__file__')), '..'))
# Directory expected to contain the `preprocess` module imported below
# (the variable name `data_dir` is kept for compatibility with other cells).
data_dir = os.path.join(project_dir, 'preprocessing')
# Guard the append so re-running this cell does not pile up duplicate entries.
if data_dir not in sys.path:
    sys.path.append(data_dir)

from preprocess import preprocess

## Evaluation

In [150]:
def evaluation(y_pred, y_test):
    """Print accuracy, precision, recall, F1-micro and F1-macro for predictions.

    Note the argument order: predictions first, ground truth second.
    Precision and recall are computed with scikit-learn's default averaging
    (no ``average`` argument is passed); the two F1 scores use explicit
    micro and macro averaging.

    Args:
        y_pred: Predicted class labels.
        y_test: Ground-truth class labels.

    Returns:
        None. The metrics are only printed.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    report = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-micro": f1_score(y_test, y_pred, average='micro'),
        "F1-macro": f1_score(y_test, y_pred, average='macro'),
    }

    for metric_name, value in report.items():
        print(f"{metric_name}: {value}")

## Feature extraction

In [6]:
def tfidf_svd(X_train, X_test, X_dev, max_features=30000, n_components=300, random_state=42):
    """Turn raw text into dense features: word-level TF-IDF followed by truncated SVD.

    Both the TF-IDF vocabulary/IDF weights and the SVD projection are fitted on
    the training split only and then applied unchanged to the test and dev
    splits, so no information from held-out data leaks into the features.

    Args:
        X_train: Iterable of training documents (strings).
        X_test: Iterable of test documents.
        X_dev: Iterable of validation documents.
        max_features: Maximum TF-IDF vocabulary size.
        n_components: Number of SVD components, i.e. the width of the returned
            feature matrices. The default of 300 matches the input size that
            ``create_lstm_model`` uses by default.
        random_state: Seed passed to ``TruncatedSVD`` for reproducibility.

    Returns:
        Tuple ``(train_features, test_features, dev_features)`` of the
        SVD-reduced TF-IDF matrices, in that order.
    """
    tfidf_vect = TfidfVectorizer(analyzer='word', max_features=max_features)
    tfidf_vect.fit(X_train)  # learn vocabulary and idf from the training set only

    X_train_tfidf = tfidf_vect.transform(X_train)
    X_test_tfidf = tfidf_vect.transform(X_test)
    X_dev_tfidf = tfidf_vect.transform(X_dev)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X_train_tfidf)  # projection is learned from training data only

    return (
        svd.transform(X_train_tfidf),
        svd.transform(X_test_tfidf),
        svd.transform(X_dev_tfidf),
    )

## Model

In [125]:
def create_lstm_model(input_dim=300, timesteps=10, num_classes=10):
    """Build and compile the LSTM classifier used in this notebook.

    The flat feature vector is reshaped into ``timesteps`` steps of
    ``input_dim // timesteps`` features each, passed through one LSTM layer and
    three fully connected ReLU layers (512, 512, 128 units), and ends in a
    softmax layer. The model is compiled with Adam and sparse categorical
    cross-entropy, so targets are integer class ids.

    Args:
        input_dim: Length of each input feature vector.
        timesteps: Number of pseudo time steps the vector is split into;
            must divide ``input_dim`` evenly.
        num_classes: Width of the softmax output. The default of 10 is kept
            so that weight files saved with the original architecture still
            load; the label counts printed in this notebook show only classes
            0 and 1, so a smaller value may be appropriate for new models.

    Returns:
        A compiled Keras ``Model``.

    Raises:
        ValueError: If ``timesteps`` is not positive or does not divide
            ``input_dim``.
    """
    if timesteps <= 0 or input_dim % timesteps != 0:
        raise ValueError(
            f"input_dim ({input_dim}) must be divisible by a positive timesteps ({timesteps})"
        )

    input_layer = Input(shape=(input_dim,))

    # With the defaults this is the original (10, 30) reshape.
    layer = Reshape((timesteps, input_dim // timesteps))(input_layer)
    layer = LSTM(128, activation='relu')(layer)
    for units in (512, 512, 128):
        layer = Dense(units, activation='relu')(layer)

    output_layer = Dense(num_classes, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.compile(
        optimizer=optimizers.Adam(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return classifier

In [123]:
# def create_lstm_model():
#     input_layer = Input(shape=(300,))
    
#     layer = Reshape((10, 30))(input_layer)
#     layer = LSTM(128, activation='relu')(layer)
#     layer = Dropout(0.1)(layer)
#     layer = Dense(512, activation='relu')(layer)
#     layer = Dropout(0.1)(layer)
#     layer = Dense(256, activation='relu')(layer)
#     layer = Dropout(0.1)(layer)
#     layer = Dense(128, activation='relu')(layer)
#     layer = Dropout(0.1)(layer)
#     layer = Dense(64, activation='relu')(layer)
#     layer = Dropout(0.1)(layer)
    
    
#     output_layer = Dense(10, activation='softmax')(layer)
    
#     classifier = models.Model(input_layer, output_layer)
    
#     classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
#     return classifier

## ViHSD

### load data

In [140]:
# Read the three ViHSD splits (train / test / dev) from CSV, in that order.
train, test, dev = map(
    pd.read_csv,
    (
        "../data/ViHSD/train.csv",
        "../data/ViHSD/test.csv",
        "../data/ViHSD/dev.csv",
    ),
)

In [146]:
# Class distribution of the training labels.
train["label_id"].value_counts()

label_id
0    19886
1     4162
Name: count, dtype: int64

In [141]:
# Training split: comment text and its label, copied so `train` stays untouched.
X_train, y_train = train["free_text"].copy(), train["label_id"].copy()

In [142]:
# Test split: comment text and its label, copied so `test` stays untouched.
X_test, y_test = test["free_text"].copy(), test["label_id"].copy()

In [143]:
# Validation split: comment text and its label, copied so `dev` stays untouched.
X_dev, y_dev = dev["free_text"].copy(), dev["label_id"].copy()

In [144]:
# Run the project's `preprocess` function over every document in each split.
X_train, X_test, X_dev = (
    texts.apply(preprocess) for texts in (X_train, X_test, X_dev)
)

### train

In [148]:
# Fit TF-IDF + SVD on the training split and project all three splits.
(
    X_train_feature,
    X_test_feature,
    X_dev_feature,
) = tfidf_svd(X_train, X_test, X_dev)

In [159]:
# Stop when validation loss has not improved by at least 0.001 for 5 epochs,
# then restore the best weights seen during training.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
# Label 1 is the minority class in the training data, so its errors are
# weighted five times as heavily as those of label 0.
class_weights = {0: 1.0, 1: 5.0}

model_hsd = create_lstm_model()
history = model_hsd.fit(
    x=X_train_feature,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=True,
    callbacks=[early_stopping],
    validation_data=(X_dev_feature, y_dev),
    class_weight=class_weights,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 15: early stopping


In [161]:
# Persist the trained ViHSD model to disk.
model_hsd.save(filepath='../models/model_lstm_hsd.h5')

  saving_api.save_model(


In [160]:
# Evaluate the model that was just trained in memory. Building a fresh network
# here was wasted work because it was overwritten straight away. To evaluate
# from disk instead, rebuild the network with create_lstm_model() and load
# '../models/model_lstm_hsd.h5' (the ViHSD weights, not the CTSD ones).
model_eva = model_hsd

y_pred = model_eva.predict(X_test_feature)
# Pick the most probable class for every test sample.
y_pred_classes = np.argmax(y_pred, axis=1)

evaluation(y_pred_classes, y_test)

Accuracy: 0.8494011976047904
Precision: 0.5474397590361446
Recall: 0.642226148409894
F1-micro: 0.8494011976047903
F1-macro: 0.7493816662937272


In [None]:
# Accuracy: 0.8494011976047904
# Precision: 0.5474397590361446
# Recall: 0.642226148409894
# F1-micro: 0.8494011976047903
# F1-macro: 0.7493816662937272

## CTSD

### load data

In [22]:
# Read the three ViCTSD splits (train / test / validation) from CSV, in that order.
train, test, dev = map(
    pd.read_csv,
    (
        "../data/ViCTSD/ViCTSD_train.csv",
        "../data/ViCTSD/ViCTSD_test.csv",
        "../data/ViCTSD/ViCTSD_valid.csv",
    ),
)

In [23]:
# # Select the rows labelled 1
# df_label_1 = train[train['Toxicity'] == 1]

# # Replicate the label-1 rows (three copies)
# df_label_1_doubled = pd.concat([df_label_1] * 3, ignore_index=True)

# # Append them to the original DataFrame
# train = pd.concat([train, df_label_1_doubled], ignore_index=True)

In [24]:
# Class distribution of the training labels.
train['Toxicity'].value_counts()

Toxicity
0    6241
1     759
Name: count, dtype: int64

In [25]:
# Training split: comment text and its toxicity label, copied so `train` stays untouched.
X_train, y_train = train["Comment"].copy(), train["Toxicity"].copy()

In [26]:
# Test split: comment text and its toxicity label, copied so `test` stays untouched.
X_test, y_test = test["Comment"].copy(), test["Toxicity"].copy()

In [27]:
# Validation split: comment text and its toxicity label, copied so `dev` stays untouched.
X_dev, y_dev = dev["Comment"].copy(), dev["Toxicity"].copy()

In [28]:
# Run the project's `preprocess` function over every document in each split.
X_train, X_test, X_dev = (
    texts.apply(preprocess) for texts in (X_train, X_test, X_dev)
)

In [29]:
# def augment_data(X, y, num_augmented_samples=10000):
#     augmented_X = []
#     augmented_y = []
#     for _ in range(num_augmented_samples):
#         idx = np.random.choice(len(X))
#         sample = X[idx]
#         label = y[idx]
#         # Apply some random transformations, e.g. change a few words in the sample
#         augmented_sample = sample.copy()
#         change_idx = np.random.choice(len(sample), size=5, replace=False)
#         augmented_sample[change_idx] = np.random.randint(1, 5000, size=5)
#         augmented_X.append(augmented_sample)
#         augmented_y.append(label)
#     return np.array(augmented_X), np.array(augmented_y)
# augmented_X, augmented_y = augment_data(X_train_feature, y_train)

### train

In [31]:
# Fit TF-IDF + SVD on the training split and project all three splits.
(
    X_train_feature,
    X_test_feature,
    X_dev_feature,
) = tfidf_svd(X_train, X_test, X_dev)

In [135]:
# Stop when validation loss has not improved by at least 0.001 for 5 epochs,
# then restore the best weights seen during training.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
# Label 1 is the minority class in the training data, so its errors are
# weighted five times as heavily as those of label 0.
class_weights = {0: 1.0, 1: 5.0}

model_ctsd = create_lstm_model()
history = model_ctsd.fit(
    x=X_train_feature,
    y=y_train,
    batch_size=32,
    epochs=100,
    verbose=True,
    callbacks=[early_stopping],
    validation_data=(X_dev_feature, y_dev),
    class_weight=class_weights,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 14: early stopping


In [137]:
# Persist the trained ViCTSD model to disk.
model_ctsd.save(filepath='../models/model_lstm_ctsd.h5')

  saving_api.save_model(


In [139]:
# Rebuild the architecture and load the weights saved for the ViCTSD model,
# so the evaluation reflects what is on disk. Assigning model_ctsd directly
# would evaluate the in-memory model instead.
model_eva = create_lstm_model()
model_eva.load_weights('../models/model_lstm_ctsd.h5')

y_pred = model_eva.predict(X_test_feature)
# Pick the most probable class for every test sample.
y_pred_classes = np.argmax(y_pred, axis=1)

# evaluation() takes (y_pred, y_test); the previous call passed the model and
# the feature matrix as well, which raises a TypeError on a fresh run.
evaluation(y_pred_classes, y_test)

Accuracy: 0.879
Precision: 0.39622641509433965
Recall: 0.19090909090909092
F1-micro: 0.879
F1-macro: 0.5959002240916939


In [138]:
from collections import Counter

# Tally how often each predicted class occurs in y_pred_classes.
element_counts = Counter()
element_counts.update(y_pred_classes)
print(element_counts)

Counter({0: 947, 1: 53})


In [None]:
# Accuracy: 0.879
# Precision: 0.39622641509433965
# Recall: 0.19090909090909092
# F1-micro: 0.879
# F1-macro: 0.5959002240916939