In [1]:
# Import libraries
import pandas as pd
# Load & preprocess data
import sys
sys.path.append('Models')
from utility.utility import preprocess_data, load_data
# Tokenizer
from pyvi import ViTokenizer
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model
from sklearn import svm
# Grid search
from sklearn.model_selection import GridSearchCV
# Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load data
train = load_data('train','victsd')
dev = load_data('dev','victsd')
test = load_data('test','vihsd')

In [3]:
# Preprocess data
train = preprocess_data(train, url = True, lowercase = True ,punctuation = True, stopword = False, emoji = False)
train.dropna(subset = 'text',inplace = True)

In [4]:
# Tokenize & Vectorize
train['text'] = train['text'].astype(str).apply(ViTokenizer.tokenize)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']

## Training

In [5]:
clf = svm.SVC(
    C = 0.3,
    gamma = 'scale',
    class_weight = 'balanced',
    kernel = 'rbf'
)
clf.fit(X_train, y_train)

In [6]:
# Training performance
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      6241
           1       0.76      0.99      0.86       759

    accuracy                           0.97      7000
   macro avg       0.88      0.98      0.92      7000
weighted avg       0.97      0.97      0.97      7000



## Evaluate 

In [7]:
# Evaluate the model on the dev set
dev = preprocess_data(dev, url=True, punctuation=True, stopword=False, emoji=False)
X_dev = vectorizer.transform(dev['text'])
y_dev = dev['label']
# Make predictions on the development set
y_dev_pred = clf.predict(X_dev)
# Compute accuracy
print('Accuracy on the development set:', accuracy_score(y_dev, y_dev_pred))
# Compute F1 score
print(classification_report(y_dev, y_dev_pred))

Accuracy on the development set: 0.8735
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1768
           1       0.45      0.38      0.41       232

    accuracy                           0.87      2000
   macro avg       0.68      0.66      0.67      2000
weighted avg       0.87      0.87      0.87      2000



In [8]:
# Evaluate the model on the test set
test = preprocess_data(test, url=True, punctuation=True, stopword=False, emoji=False)
X_test = vectorizer.transform(test['text'])
y_test = test['label']
y_test = y_test.replace(2,1)
# Make predictions on the test set
y_test_pred = clf.predict(X_test)
# Compute accuracy
print('Accuracy on the test set:', accuracy_score(y_test, y_test_pred))
# Compute F1 score
print(classification_report(y_test, y_test_pred))

Accuracy on the test set: 0.8100299401197605
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5548
           1       0.31      0.10      0.15      1132

    accuracy                           0.81      6680
   macro avg       0.58      0.53      0.52      6680
weighted avg       0.75      0.81      0.77      6680



In [9]:
# Test on new data
texts = [
    'mày sống chỉ tổ chật đất, đừng tồn tại trên đời nữa', 
    'đm mày',
    'cái vòi này đẹp vãi',
    'đi chết đi con chó'
]
index = 0
for text in texts:
    index += 1;
    text = ViTokenizer.tokenize(text)
    X_new = vectorizer.transform([text])  # put text in a list
    # Use the model to make a prediction
    prediction = clf.predict(X_new)
    print(f'Sentence {index}: {prediction}')

Sentence 1: [0]
Sentence 2: [0]
Sentence 3: [1]
Sentence 4: [0]
