In [1]:
! pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.0-py3-none-any.whl.metadata (14 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.0-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.6

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import pandas as pd
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split
import pickle

Load data

In [4]:
train_set = pd.read_csv("/kaggle/input/vnese-articles/train_set.csv").astype("str")

X, y = train_set["article"], train_set["label"]

train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10253 entries, 0 to 10252
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   article  10253 non-null  object
 1   label    10253 non-null  object
dtypes: object(2)
memory usage: 160.3+ KB


Define a list of stop words

In [5]:
with open("/kaggle/input/vnese-articles/vietnamese-stopwords-dash.txt", "r") as f:
    stop_words = f.readlines()
    stop_words = [stop_word.strip() for stop_word in stop_words]
print(stop_words[:10])

['a_lô', 'a_ha', 'ai', 'ai_ai', 'ai_nấy', 'ai_đó', 'alô', 'amen', 'anh', 'anh_ấy']


In [9]:
X = X.apply(lambda x : word_tokenize(x, format="text"))
print(X)

0        một khoảnh_khắc thiên_nhiên kỳ_thú được các du...
1        hoa_hậu lê_hoàng_phương hóa hằng nga trai đẹp ...
2        giá vàng nhẫn_k liên_tục lập_đỉnh mới có nơi b...
3        hôm_nay ban kỷ_luật liên_đoàn bóng_đá việt_nam...
4        mỹ vừa tiến_hành các biện_pháp nặng cân nhằm t...
                               ...                        
10248    lãnh_đạo huyện nói theo hình_ảnh từ camera cô_...
10249    bên dưới nền_móng lịch_sử của lâu_đài cổ_zerze...
10250    năm_học tình_trạng giáo_viên thừa thiếu cục_bộ...
10251    là một trong thành_trì được xây_dựng thời nhà ...
10252    phim chúng_ta của năm sau tập tiểu_tam đến tận...
Name: article, Length: 10253, dtype: object


Convert text to BOW

In [22]:
train_countvectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)

X_BOW = train_countvectorizer.fit_transform(X)
vocabulary = train_countvectorizer.vocabulary_
print(len(vocabulary.items()))

pickle.dump(vocabulary, open("/kaggle/working/vocabulary.pkl", 'wb'))

10000


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y, random_state=42, test_size=0.1)

print(X_train.shape)
print(X_test.shape)

(9227, 10000)
(1026, 10000)


In [12]:
max_value = X_BOW.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
feature_names = np.array(train_countvectorizer.get_feature_names_out())

print("Features with lowest tfidf:\n{}".format(feature_names[sorted_by_tfidf[:20]]))
print("Features with highest tfidf: \n{}".format(feature_names[sorted_by_tfidf[-20:]]))

Features with lowest tfidf:
['quyết_đấu' 'quân_tuổi' 'cội_nguồn' 'cảm_kích' 'dũng_mãnh' 'rặng' 'hòng'
 'lạnh_lùng' 'lựa' 'quyết_đoán' 'rong_ruổi' 'gắt_gao' 'bảo_chứng'
 'duyên_nợ' 'quý_phú_nhuận' 'bà_trương' 'nhiệt_thành' 'tổng_quan' 'dù_vậy'
 'thương_lượng']
Features with highest tfidf: 
['quảng_ngãi' 'smartwatch' 'windows' 'cụ' 'sinner' 'hát_lý' 'angelababy'
 'đen' 'jolie' 'lightstick' 'twice' 'acbs' 'gấu_trúc' 'hiền_hồ' 'gà'
 'bánh_mì' 'lưu_bích' 'tào_tháo' 'nokia' 'hắn']


Naive Bayes

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score

scores = cross_val_score(MultinomialNB(), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

pickle.dump(model, open("/kaggle/working/bayes_model.pkl", 'wb'))

Mean cross-validation accuracy: 0.83
Accuracy: 0.8255360623781677
F1: 0.8255360623781677


Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

scores = cross_val_score(DecisionTreeClassifier(), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

Mean cross-validation accuracy: 0.59
Accuracy: 0.6045831301803998
F1: 0.6045831301803998


SVM

In [21]:
from sklearn.svm import SVC

scores = cross_val_score(SVC(C=10, gamma=1e-5), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = SVC(C=10, gamma=1e-5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

Mean cross-validation accuracy: 0.10
Accuracy: 0.10238907849829351
F1: 0.10238907849829351


In [31]:
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

Mean cross-validation accuracy: 0.83
Accuracy: 0.8186250609458801
F1: 0.81862506094588


In [25]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(n_estimators=10), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = RandomForestClassifier(n_estimators=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

Mean cross-validation accuracy: 0.65
Accuracy: 0.6704046806435885
F1: 0.6704046806435885


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

scores = cross_val_score(GradientBoostingClassifier(n_estimators=10), X_train, y_train, cv=10)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

model = GradientBoostingClassifier(n_estimators=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='micro')}")

Mean cross-validation accuracy: 0.75
Accuracy: 0.7450024378352024
F1: 0.7450024378352024


Neural Network

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Khởi tạo mô hình Sequential
model = Sequential()

# Thêm tầng ẩn đầu tiên với 128 units và hàm kích hoạt là ReLU
model.add(Dense(8, input_shape=(10000 ,), activation='relu'))

model.add(Dropout(0.2))

# Thêm tầng đầu ra với số units bằng số lớp và hàm kích hoạt là softmax (vì đây là bài toán phân loại)
model.add(Dense(10, activation='softmax'))

# Biên dịch mô hình với hàm mất mát là categorical_crossentropy (do đây là bài toán phân loại) và tối ưu hóa là adam
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# In ra tóm tắt của mô hình
model.summary()


In [20]:
model.fit(
    x=X_train.toarray(),
    y=y_train.astype("float32"),
    epochs=30,
    batch_size=64,
    validation_split=0.2
)

model.save("/kaggle/working/nn_model.h5")

Epoch 1/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.3124 - loss: 2.2434 - val_accuracy: 0.5726 - val_loss: 1.9988
Epoch 2/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5298 - loss: 1.8942 - val_accuracy: 0.6831 - val_loss: 1.5988
Epoch 3/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6060 - loss: 1.5285 - val_accuracy: 0.7254 - val_loss: 1.2698
Epoch 4/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6567 - loss: 1.2328 - val_accuracy: 0.7633 - val_loss: 1.0466
Epoch 5/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6784 - loss: 1.0706 - val_accuracy: 0.7882 - val_loss: 0.9009
Epoch 6/30
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7095 - loss: 0.9590 - val_accuracy: 0.8093 - val_loss: 0.8009
Epoch 7/30
[1m116/116[0m 

In [21]:
model.evaluate(
    x=X_test.toarray(),
    y=y_test.astype("float32"),
    batch_size=64
)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8206 - loss: 0.4950 


[0.44919195771217346, 0.8265107274055481]