In [99]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

import joblib

In [2]:
import os
import sys

# A notebook kernel does not define __file__, so the *string* '__file__' is
# resolved against the current working directory: its dirname is the working
# directory, and '..' steps one level up to what is treated as the project dir.
project_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath('__file__')), '..'))
data_dir = os.path.join(project_dir, 'preprocessing')

# Guard the insertion so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if data_dir not in sys.path:
    sys.path.append(data_dir)

from preprocess import preprocess

In [36]:
def evaluation(y_pred, y_test):
    """Print and return classification metrics for a set of predictions.

    Note the argument order: predictions first, ground truth second. The
    metric functions themselves are called as ``metric(y_test, y_pred)``.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels the predictions are scored against.

    Returns:
        dict: The computed scores keyed by ``"f1_micro"``, ``"f1_macro"``,
        ``"accuracy"``, ``"precision_macro"`` and ``"recall_macro"``, so
        callers can compare or store results instead of reading stdout.
    """
    # Compute the evaluation metrics
    f1_micro = f1_score(y_test, y_pred, average='micro')
    print("F1 - micro: " + str(f1_micro))

    f1_macro = f1_score(y_test, y_pred, average='macro')
    print("F1 - macro: " + str(f1_macro))

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))

    precision_macro = precision_score(y_test, y_pred, average='macro')
    print("Precision - macro: " + str(precision_macro))

    recall_macro = recall_score(y_test, y_pred, average='macro')
    print("Recall - macro: " + str(recall_macro))

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "accuracy": accuracy,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
    }

In [24]:
def tfidf_svd(X_train, X_test, X_dev, max_features=30000, n_components=300,
              random_state=42, return_transformers=False):
    """Build TF-IDF features for three splits and project them with TruncatedSVD.

    Both transformers are fitted on the training split only and then applied
    to every split, so no statistics from the test/dev text are used.

    Args:
        X_train: Training documents; used to fit the vectorizer and the SVD.
        X_test: Test documents (transform only).
        X_dev: Dev documents (transform only).
        max_features: ``max_features`` passed to ``TfidfVectorizer``.
        n_components: ``n_components`` passed to ``TruncatedSVD``. Lower it
            for corpora whose vocabulary is too small for the default.
        random_state: ``random_state`` passed to ``TruncatedSVD``.
        return_transformers: When True, the fitted vectorizer and SVD are
            appended to the result so they can be persisted next to a model
            and reused on new text.

    Returns:
        tuple: ``(train_features, test_features, dev_features)``; with
        ``return_transformers=True`` the tuple is
        ``(train_features, test_features, dev_features, vectorizer, svd)``.
    """
    vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features)
    vectorizer.fit(X_train)  # learn vocabulary and idf from the training set
    train_tfidf = vectorizer.transform(X_train)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(train_tfidf)

    # Use transform() for every split (including train) so all three matrices
    # come from the same projection call.
    features = tuple(
        svd.transform(tfidf_matrix)
        for tfidf_matrix in (
            train_tfidf,
            vectorizer.transform(X_test),
            vectorizer.transform(X_dev),
        )
    )

    if return_transformers:
        return features + (vectorizer, svd)
    return features

## Model

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

In [71]:
def naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test):
    """Fit a GaussianNB classifier, predict the test split and print its scores.

    Args:
        X_train_feature: Feature matrix used for fitting.
        y_train: Labels for ``X_train_feature``.
        X_test_feature: Feature matrix to predict on.
        y_test: Ground-truth labels passed to ``evaluation``.

    Returns:
        tuple: ``(fitted_model, predictions_on_test)``.
    """
    classifier = GaussianNB(var_smoothing=1e-09)
    classifier.fit(X_train_feature, y_train)
    predictions = classifier.predict(X_test_feature)

    # Header first, then the metric lines printed by evaluation().
    print("---Naive Bayes---")
    evaluation(y_pred=predictions, y_test=y_test)

    return classifier, predictions

In [89]:
def RandomForest_model(X_train_feature, y_train, X_test_feature, y_test):
    """Fit a RandomForestClassifier, predict the test split and print its scores.

    Args:
        X_train_feature: Feature matrix used for fitting.
        y_train: Labels for ``X_train_feature``.
        X_test_feature: Feature matrix to predict on.
        y_test: Ground-truth labels passed to ``evaluation``.

    Returns:
        tuple: ``(fitted_model, predictions_on_test)``.
    """
    # Hyperparameters are fixed; random_state pins the forest's randomness.
    forest_params = {
        'max_depth': None,
        'n_estimators': 100,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt',
        'random_state': 42,
    }
    forest = RandomForestClassifier(**forest_params).fit(X_train_feature, y_train)
    predictions = forest.predict(X_test_feature)

    print("---RandomForest---")
    evaluation(y_pred=predictions, y_test=y_test)

    return forest, predictions

In [54]:
# Disabled XGBoost variant of the model helpers, kept for reference only.
# NOTE(review): unlike naive_bayes_model / RandomForest_model, which return
# (model, y_pred), this version returns only the model - align before enabling.
# def XGB_model(X_train_feature, y_train, X_test_feature, y_test):
#     model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
#     model.fit(X_train_feature, y_train)

#     y_pred = model.predict(X_test_feature)

#     print("---XGB---")
#     evaluation(y_pred, y_test)

#     return model

## ViHSD

In [92]:
# Load the three ViHSD splits (train / test / dev) from their CSV files.
train, test, dev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/ViHSD/train.csv",
        "../data/ViHSD/test.csv",
        "../data/ViHSD/dev.csv",
    )
)

In [93]:
# Training split: text column as model input, label column as target.
X_train, y_train = (train[column].copy() for column in ("free_text", "label_id"))

In [94]:
# Test split: text column as model input, label column as target.
X_test, y_test = (test[column].copy() for column in ("free_text", "label_id"))

In [95]:
# Dev split: text column as model input, label column as target.
X_dev, y_dev = (dev[column].copy() for column in ("free_text", "label_id"))

In [96]:
# Clean every split with the project's `preprocess` function. Each series is
# derived from the raw frame rather than from the previous X_* value, so
# re-running this cell never applies `preprocess` to already-processed text.
X_train = train["free_text"].apply(preprocess)
X_test = test["free_text"].apply(preprocess)
X_dev = dev["free_text"].apply(preprocess)

In [97]:
# Turn the preprocessed text of each split into TF-IDF + SVD feature matrices.
X_train_feature, X_test_feature, X_dev_feature = tfidf_svd(
    X_train=X_train, X_test=X_test, X_dev=X_dev
)

In [100]:
# Train and score Gaussian Naive Bayes on the ViHSD features.
model_nb, y_pre_nb = naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test)

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('../models', exist_ok=True)
# NOTE(review): only the classifier is saved here; the TF-IDF/SVD transformers
# fitted inside tfidf_svd are not, so this file alone cannot score raw text.
joblib.dump(model_nb, '../models/nb_hsd.joblib')

---Naive Bayes---
F1 - micro: 0.537125748502994
F1 - macro: 0.49947967071143207
Accuracy: 0.537125748502994
Precision - macro: 0.5752902791880754
Recall - macro: 0.6320413430109472


['../models/nb_hsd.joblib']

In [None]:
# loaded_model = joblib.load('../models/nb_hsd.joblib')

In [104]:
# Train and score the random forest on the ViHSD features.
model_rf, y_pre_rf = RandomForest_model(X_train_feature, y_train, X_test_feature, y_test)

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('../models', exist_ok=True)
joblib.dump(model_rf, '../models/rf_hsd.joblib')

---RandomForest---
F1 - micro: 0.8672155688622755
F1 - macro: 0.6918934386722521
Accuracy: 0.8672155688622755
Precision - macro: 0.8091607585897532
Recall - macro: 0.6546232558258029


['../models/rf_hsd.joblib']

## ViCTSD

In [105]:
# Load the three ViCTSD splits (train / test / valid) from their CSV files.
train, test, dev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/ViCTSD/ViCTSD_train.csv",
        "../data/ViCTSD/ViCTSD_test.csv",
        "../data/ViCTSD/ViCTSD_valid.csv",
    )
)

In [106]:
# Training split: comment column as model input, toxicity column as target.
X_train, y_train = (train[column].copy() for column in ("Comment", "Toxicity"))

In [107]:
# Test split: comment column as model input, toxicity column as target.
X_test, y_test = (test[column].copy() for column in ("Comment", "Toxicity"))

In [108]:
# Validation split: comment column as model input, toxicity column as target.
X_dev, y_dev = (dev[column].copy() for column in ("Comment", "Toxicity"))

In [109]:
# Clean every split with the project's `preprocess` function. Each series is
# derived from the raw frame rather than from the previous X_* value, so
# re-running this cell never applies `preprocess` to already-processed text.
X_train = train["Comment"].apply(preprocess)
X_test = test["Comment"].apply(preprocess)
X_dev = dev["Comment"].apply(preprocess)

In [110]:
# Turn the preprocessed text of each split into TF-IDF + SVD feature matrices.
X_train_feature, X_test_feature, X_dev_feature = tfidf_svd(
    X_train=X_train, X_test=X_test, X_dev=X_dev
)

In [111]:
# Train and score Gaussian Naive Bayes on the ViCTSD features.
model_nb, y_pre_nb = naive_bayes_model(X_train_feature, y_train, X_test_feature, y_test)

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('../models', exist_ok=True)
# NOTE(review): only the classifier is saved here; the TF-IDF/SVD transformers
# fitted inside tfidf_svd are not, so this file alone cannot score raw text.
joblib.dump(model_nb, '../models/nb_ctsd.joblib')

---Naive Bayes---
F1 - micro: 0.773
F1 - macro: 0.5809264295116408
Accuracy: 0.773
Precision - macro: 0.573285967392666
Recall - macro: 0.625485188968335


['../models/nb_ctsd.joblib']

In [112]:
# Train and score the random forest on the ViCTSD features.
model_rf, y_pre_rf = RandomForest_model(X_train_feature, y_train, X_test_feature, y_test)

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('../models', exist_ok=True)
joblib.dump(model_rf, '../models/rf_ctsd.joblib')

---RandomForest---
F1 - micro: 0.892
F1 - macro: 0.5212765957446809
Accuracy: 0.892
Precision - macro: 0.7474747474747474
Recall - macro: 0.5250255362614913


['../models/rf_ctsd.joblib']

In [114]:
# Disabled hyperparameter search for GaussianNB, kept for reference only.
# NOTE(review): the final print shows the parameters of the unfitted base
# `model`, not the search result; `grid_search.best_params_` is presumably
# what was intended - confirm before re-enabling.
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
#     # 'priors': [[0.3,0.7], [0.2,0.5,0.7]]
# }

# # Initialize the model
# model = GaussianNB()

# # Initialize GridSearchCV
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')

# grid_search.fit(X_dev_feature, y_dev)

# print("parameters:", model.get_params())


In [115]:
# Disabled hyperparameter search for RandomForestClassifier, kept for
# reference only.
# NOTE(review): `grid_search` is built but never fitted, and the print shows
# the base model's default parameters rather than a search result. Also,
# 'auto' is no longer accepted for max_features in recent scikit-learn
# releases - drop it from the grid before re-enabling.
# model = RandomForestClassifier(random_state=42)

# # Define the parameter grid to search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=2)
# print("parameters:", model.get_params())
