In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyvi import ViTokenizer, ViPosTagger
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score, f1_score, classification_report
import string
import emoji_vietnamese
import pickle

In [11]:
train = pd.read_csv('../Dataset_Cleaned/clean_train_victsd.csv')
dev = pd.read_csv('../Dataset_Cleaned/dev_victsd.csv')
test = pd.read_csv('../Dataset_Cleaned/test_victsd.csv')

In [12]:
train['text'] = train['text'].astype(str).apply(ViTokenizer.tokenize)

## Training

In [13]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']

In [14]:
# Define the parameter distribution
param_dist = {
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(1, 20),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'n_estimators': randint(50, 500),
}

# Initialize a RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=XGBClassifier(
), param_distributions=param_dist, cv=3, scoring='f1')

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

## Evaluate 

In [15]:
def preprocess_data(data):
    data['text'] = data['text'].str.replace(
        r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
    data['text'] = data['text'].str.replace(
        '['+string.punctuation+']', '', regex=True)
    data['text'] = data['text'].str.lower()
    data['text'] = data['text'].apply(emoji_vietnamese.demojize)
    data['text'] = data['text'].astype(str).apply(ViTokenizer.tokenize)
    return data

In [16]:
dev = preprocess_data(dev)
test = preprocess_data(test)
X_dev = vectorizer.transform(dev['text'])
y_dev = dev['label']
X_test = vectorizer.transform(test['text'])
y_test = test['label']

In [17]:
# Get the best model
best_model = random_search.best_estimator_
# Make predictions on the development set
y_dev_pred = best_model.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
# Make predictions on the test set
y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1768
           1       0.62      0.15      0.24       232

    accuracy                           0.89      2000
   macro avg       0.76      0.57      0.59      2000
weighted avg       0.87      0.89      0.86      2000

              precision    recall  f1-score   support

           0       0.90      0.98      0.94       890
           1       0.56      0.16      0.25       110

    accuracy                           0.89      1000
   macro avg       0.73      0.57      0.60      1000
weighted avg       0.87      0.89      0.87      1000



In [18]:
# Let's assume X_new is your new example
X_new = vectorizer.transform(['mày sống chỉ tổ chật đất mà thôi'])

# Use the model to make a prediction
prediction = best_model.predict(X_new)

print(prediction)

[0]
