In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [32]:
import os
import sys

# project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)), '..')
project_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath('__file__')), '..'))
data_dir = os.path.join(project_dir, 'preprocessing')
sys.path.append(data_dir)

from preprocess import preprocess

In [3]:
def evaluation(y_pred, y_test):
    # Tính toán các chỉ số đánh giá
    accuracy  = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-micro: {f1_micro}")
    print(f"F1-macro: {f1_macro}")

In [4]:
def tfidf_svd(X_train, X_test, X_dev):
    tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
    tfidf_vect.fit(X_train) # learn vocabulary and idf from training set
    X_train_tfidf =  tfidf_vect.transform(X_train)
    X_test_tfidf =  tfidf_vect.transform(X_test)
    X_dev_tfidf =  tfidf_vect.transform(X_dev)

    svd = TruncatedSVD(n_components=300, random_state=42)
    svd.fit(X_train_tfidf)

    X_train_tfidf_svd = svd.transform(X_train_tfidf)
    X_test_tfidf_svd = svd.transform(X_test_tfidf)
    X_dev_tfidf_svd = svd.transform(X_dev_tfidf)

    return X_train_tfidf_svd, X_test_tfidf_svd, X_dev_tfidf_svd

## Model

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# from xgboost.xgb import XGBClassifier

In [54]:
def naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test):
    model = GaussianNB()
    model.fit(X_train_feature, y_train)

    y_pred = model.predict(X_test_feature)

    print("---Naive Bayes---")
    evaluation(y_pred, y_test)

    return model, y_pred

In [55]:
def RandomForest_model(X_train_feature, y_train, X_test_feature, y_test):
    model = RandomForestClassifier(n_estimators=10)
    model = model.fit(X_train_feature, y_train)

    y_pred = model.predict(X_test_feature)

    print("---RandomForest---")
    evaluation(y_pred, y_test)

    return model, y_pred

In [None]:
# def XGB_model(X_train_feature, y_train, X_test_feature, y_test):
#     model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
#     model.fit(X_train_feature, y_train)

#     y_pred = model.predict(X_test_feature)

#     print("---XGB---")
#     evaluation(y_pred, y_test)

#     return model

## ViHSD

In [47]:
train = pd.read_csv("../data/ViHSD/train.csv")
test = pd.read_csv("../data/ViHSD/test.csv")
dev = pd.read_csv("../data/ViHSD/dev.csv")

In [48]:
X_train = train["free_text"].copy()
y_train = train["label_id"].copy()

In [49]:
X_test = test["free_text"].copy()
y_test = test["label_id"].copy()

In [50]:
X_dev = dev["free_text"].copy()
y_dev = dev["label_id"].copy()

In [51]:
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)
X_dev = X_dev.apply(preprocess)

In [52]:
X_train_feature, X_test_feature, X_dev_feature = tfidf_svd(X_train, X_test, X_dev)

In [60]:
naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test)
RandomForest_model(X_train_feature, y_train, X_test_feature, y_test)

---Naive Bayes---
Accuracy: 0.537125748502994
Precision: 0.23627556512378903
Recall: 0.7756183745583038
F1-micro: 0.537125748502994
F1-macro: 0.49947967071143207
---RandomForest---
Accuracy: 0.8550898203592814
Precision: 0.6607843137254902
Recall: 0.29770318021201414
F1-micro: 0.8550898203592814
F1-macro: 0.6639335384374871


(RandomForestClassifier(n_estimators=10),
 array([0, 1, 0, ..., 0, 0, 0], dtype=int64))

## CTSD

In [61]:
train = pd.read_csv("../data/ViCTSD/ViCTSD_train.csv")
test = pd.read_csv("../data/ViCTSD/ViCTSD_test.csv")
dev = pd.read_csv("../data/ViCTSD/ViCTSD_valid.csv")

In [62]:
X_train = train["Comment"].copy()
y_train = train["Toxicity"].copy()

In [63]:
X_test = test["Comment"].copy()
y_test = test["Toxicity"].copy()

In [64]:
X_dev = dev["Comment"].copy()
y_dev = dev["Toxicity"].copy()

In [69]:
from preprocess import preprocess
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)
X_dev = X_dev.apply(preprocess)

In [70]:
X_train_feature, X_test_feature, X_dev_feature = tfidf_svd(X_train, X_test, X_dev)

In [71]:
naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test)
RandomForest_model(X_train_feature, y_train, X_test_feature, y_test)

---Naive Bayes---
Accuracy: 0.748
Precision: 0.20168067226890757
Recall: 0.43636363636363634
F1-micro: 0.748
F1-macro: 0.5616598480420807
---RandomForest---
Accuracy: 0.891
Precision: 0.5454545454545454
Recall: 0.05454545454545454
F1-micro: 0.891
F1-macro: 0.5205819870777053


(RandomForestClassifier(n_estimators=10),
 array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [72]:
y_test

0      1
1      1
2      0
3      0
4      1
      ..
995    1
996    0
997    0
998    0
999    0
Name: Toxicity, Length: 1000, dtype: int64