# 1. Load data

In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
valid = pd.read_csv('data/valid.csv')
test = pd.read_csv('data/test.csv')

# 2. Data preprocessing

load vncorenlp

In [2]:
from vncorenlp import VnCoreNLP

rdrsegmenter = VnCoreNLP("VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner", max_heap_size='-Xmx2g')

load stopwords

In [3]:
with open("data/stopwords.txt", encoding='utf-8') as f:
    stopwords = {line.strip().lower() for line in f if line.strip()}

Tiền xử lý

In [4]:
import re
from itertools import chain

def preprocessing(sentence):
    # Loại bỏ emoji và một số kí tự khác
    s = re.sub(r'[^\w\s.,!?]|_', '', str(sentence))

    # Loại bỏ khoảng trắng đầu và cuối
    if not s.strip():
        return []

    s = s.strip()

    # NER và chuyển về chữ thường, loại bỏ stopwords
    tagged = rdrsegmenter.ner(s)
    tagged = list(chain.from_iterable(tagged))

    output = []
    for word, tag in tagged:
        if word.lower() in stopwords:
            continue

        if tag != 'O':
            output.append(f'<{tag}>')
        else:
            output.append(word.lower())
    
    return output

# 3. TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    tokenizer=preprocessing,
    token_pattern=None
)

Hiển thị thanh tiến trình để theo dõi

In [7]:
from tqdm import tqdm

x_train = tfidf.fit_transform(tqdm(train['content']))
x_valid = tfidf.transform(tqdm(valid['content']))
x_test = tfidf.transform(tqdm(test['content']))

100%|██████████| 46774/46774 [11:46<00:00, 66.17it/s] 
100%|██████████| 8065/8065 [01:56<00:00, 69.13it/s]
100%|██████████| 11760/11760 [02:40<00:00, 73.50it/s]


In [30]:
x_train.shape

(46774, 24235)

Lưu lại TF-IDF vectorizer để transform dữ liệu mới

In [9]:
import pickle

with open('embedding_models/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

Lưu vector

In [16]:
from spicy import sparse

sparse.save_npz('embedded/tfidf/train.npz', x_train)
sparse.save_npz('embedded/tfidf/valid.npz', x_valid)
sparse.save_npz('embedded/tfidf/test.npz', x_test)

# 4. Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(
    tokenizer=preprocessing,
    token_pattern=None
)

In [26]:
from tqdm import tqdm

x_train = bow.fit_transform(tqdm(train['content']))
x_valid = bow.transform(tqdm(valid['content']))
x_test = bow.transform(tqdm(test['content']))

100%|██████████| 46774/46774 [18:27<00:00, 42.23it/s]
100%|██████████| 8065/8065 [03:13<00:00, 41.78it/s]
100%|██████████| 11760/11760 [04:07<00:00, 47.54it/s]


Lưu lại BoW vectorizer để transform dữ liệu mới

In [27]:
import pickle

with open('embedding_models/bow.pkl', 'wb') as f:
    pickle.dump(bow, f)

Lưu vector

In [28]:
from spicy import sparse

sparse.save_npz('embedded/bow/train.npz', x_train)
sparse.save_npz('embedded/bow/valid.npz', x_valid)
sparse.save_npz('embedded/bow/test.npz', x_test)

# 5. multilingual-e5-large

Link: https://huggingface.co/intfloat/multilingual-e5-large

In [None]:
from sentence_transformers import SentenceTransformer

model_finetuned = SentenceTransformer('intfloat/multilingual-e5-large')

In [None]:
x_train = model_finetuned.encode(
    train["content"].tolist(),
    show_progress_bar=True
)

x_valid = model_finetuned.encode(
    valid["content"].tolist(),
    show_progress_bar=True
)

x_test = model_finetuned.encode(
    train["content"].tolist(),
    show_progress_bar=True
)

In [None]:
import numpy as np

np.save('multilingual-e5-large/train.npy', x_train)
np.save('multilingual-e5-large/valid.npy', x_valid)
np.save('multilingual-e5-large/test.npy', x_test)