In [1]:
# Import libraries
import pandas as pd
# Load & preprocess data
import sys
sys.path.append('Models')
from utility.utility import preprocess_data, load_data
# Tokenizer
from pyvi import ViTokenizer
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model
from sklearn.ensemble import RandomForestClassifier
# Grid search
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load data
train = load_data('train','merged')
dev = load_data('dev','merged')
test = load_data('test','merged')

In [3]:
# Preprocess data
train = preprocess_data(train, url = True, lowercase = True ,punctuation = True, stopword = False, emoji = False)
train.dropna(subset = 'text',inplace = True)

In [4]:
# Tokenize & Vectorize
train['text'] = train['text'].astype(str).apply(ViTokenizer.tokenize)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']

## Training

In [5]:
# Define the parameter grid
param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [50, 100],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'class_weight': ['balanced']
}

# Create the RandomForestClassifier
clf = RandomForestClassifier(bootstrap=True)

# Use StratifiedKFold for better handling of imbalanced classes
cv = StratifiedKFold(n_splits=5)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(best_params)

KeyboardInterrupt: 

In [None]:
# # Create a new model with the best parameters
# clf = RandomForestClassifier(n_estimators= 400, max_depth= 100, min_samples_split= 3, min_samples_leaf= 10, max_features= 'sqrt', bootstrap= True, class_weight= 'balanced')

# Create a new model with the best parameters
clf = RandomForestClassifier(**best_params)

# # Fit the model to the training data
clf.fit(X_train, y_train)

In [None]:
# Training performance
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      6241
           1       0.61      0.80      0.69       759

    accuracy                           0.92      7000
   macro avg       0.79      0.87      0.82      7000
weighted avg       0.93      0.92      0.93      7000



## Evaluate 

In [None]:
# Evaluate the model on the dev set
dev = preprocess_data(dev, url=True, punctuation=True, stopword=False, emoji=False)
X_dev = vectorizer.transform(dev['text'])
y_dev = dev['label']
# Make predictions on the development set
y_dev_pred = clf.predict(X_dev)
# Compute accuracy
print('Accuracy on the development set:', accuracy_score(y_dev, y_dev_pred))
# Compute F1 score
print(classification_report(y_dev, y_dev_pred))

Accuracy on the development set: 0.8625
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1768
           1       0.41      0.41      0.41       232

    accuracy                           0.86      2000
   macro avg       0.66      0.66      0.66      2000
weighted avg       0.86      0.86      0.86      2000



In [None]:
# Evaluate the model on the test set
test = preprocess_data(test, url=True, punctuation=True, stopword=False, emoji=False)
X_test = vectorizer.transform(test['text'])
y_test = test['label']
# Make predictions on the test set
y_test_pred = clf.predict(X_test)
# Compute accuracy
print('Accuracy on the test set:', accuracy_score(y_test, y_test_pred))
# Compute F1 score
print(classification_report(y_test, y_test_pred))

Accuracy on the test set: 0.855
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       890
           1       0.36      0.42      0.39       110

    accuracy                           0.85      1000
   macro avg       0.64      0.66      0.65      1000
weighted avg       0.86      0.85      0.86      1000



In [None]:
# Test on new data
texts = [
    'mày sống chỉ tổ chật đất, đừng tồn tại trên đời nữa', 
    'đm mày',
    'cái vòi này đẹp vãi',
    'đi chết đi con chó'
]
index = 0
for text in texts:
    index += 1;
    text = ViTokenizer.tokenize(text)
    X_new = vectorizer.transform([text])  # put text in a list
    # Use the model to make a prediction
    prediction = clf.predict(X_new)
    print(f'Sentence {index}: {prediction}')

Sentence 1: [0]
Sentence 2: [0]
Sentence 3: [1]
Sentence 4: [0]
