# Build a Vietnamese Text Corpus from Wikipedia

## Preparation

### Installing dependencies
- `gensim`: For training Word2Vec.
- `underthesea`: For data processing.
- `Cython`: Required by `gensim`, may not be explicitly installed.

In [None]:
!pip install gensim underthesea Cython

In [None]:
import urllib.request
from underthesea import text_normalize, word_tokenize
import os
import sys
import pickle
import json

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

### Download Wikidump

In [None]:
TEXT_DUMP = "https://dumps.wikimedia.org/viwiki/20240120/viwiki-20240120-pages-articles.xml.bz2" # @param {type:"string"}

In [None]:
DATA_DIR = "/content/data/"

In [None]:
!mkdir -p $DATA_DIR

In [None]:
WIKIDUMP_FILE = os.path.join(DATA_DIR, "wikidump.bz2")

In [None]:
!curl -L -o $WIKIDUMP_FILE -C - $TEXT_DUMP

### Download the dictionary

The dictionaries come from two main sources:

- Vietnamese Hunspell library: the open-source spell-checking library, Vietnamese-language version.
- UTS Dictionary: provided by Underthesea.

In [None]:
DICTIONARY_FILE = os.path.join(DATA_DIR, "dictionary.txt")

In [None]:
!curl -L "https://raw.githubusercontent.com/1ec5/hunspell-vi/master/dictionaries/vi-DauCu.dic" > $DICTIONARY_FILE
!curl -L "https://raw.githubusercontent.com/1ec5/hunspell-vi/master/dictionaries/vi-DauMoi.dic" >> $DICTIONARY_FILE
!curl -L "https://huggingface.co/datasets/undertheseanlp/UTS_Dictionary/resolve/main/data/data.txt?download=true" >> $DICTIONARY_FILE

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 39910  100 39910    0     0  85433      0 --:--:-- --:--:-- --:--:-- 85460
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 39916  100 39916    0     0  80335      0 --:--:-- --:--:-- --:--:-- 80313
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1174  100  1174    0     0   3210      0 --:--:-- --:--:-- --:--:--  3216
100  952k  100  952k    0     0  1458k      0 --:--:-- --:--:-- --:--:-- 11.3M


## Making the corpus

### Building vocab

From the dictionaries, we build a vocab set. To keep the vocab set simple, we only keep entries made of two words or fewer.

In [None]:
def make_vocab():
    """Build the vocabulary set from the downloaded dictionary file.

    Reads ``DICTIONARY_FILE`` line by line, keeps the entries made of at most
    two whitespace-separated words, normalizes each one with
    ``text_normalize`` and lower-cases it.

    Returns:
        set: The normalized, lower-cased dictionary entries.
    """
    vocab = set()
    # Decode explicitly as UTF-8 so that reading Vietnamese diacritics does
    # not depend on the platform's default encoding.
    with open(DICTIONARY_FILE, "r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            # Blank lines would otherwise pass the "<= 2 words" check and put
            # an empty string into the vocabulary.
            if not entry or len(entry.split()) > 2:
                continue
            vocab.add(text_normalize(entry).lower())
    return vocab


VOCAB = make_vocab()

In [None]:
print("Vocab count: {}".format(len(VOCAB)))
print(list(VOCAB)[:20])

Vocab count: 58070
['sacarin', 'gà vịt', 'khổng tử', 'tốt vía', 'kết dính', 'bổ khí', 'aceton', 'mượt mà', 'lãnh dục', 'giảm biên', 'hoảng', 'làm rẫy', 'uẩn khúc', 'kem', 'công kênh', 'cuối tuần', 'váy xòe', 'nghỉu', 'thiêu thân', 'tín điều']


### Define the tokenizer function

In [None]:
def tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool):
    """Split ``text`` into words and keep only those present in ``VOCAB``.

    Args:
        text: Raw text to tokenize.
        token_min_len: Minimum length (in characters) of a token to keep.
        token_max_len: Maximum length (in characters) of a token to keep.
        lower: When true, tokens are lower-cased before the vocab lookup.

    Returns:
        list: The kept tokens, in their original order.
    """
    kept = []
    for raw in word_tokenize(text):
        # The length filter is applied to the token as produced by the
        # tokenizer, before any lower-casing.
        if len(raw) < token_min_len or len(raw) > token_max_len:
            continue
        candidate = raw.lower() if lower else raw
        if candidate in VOCAB:
            kept.append(candidate)
    return kept

### Building the corpus

In [None]:
CORPUS_FILE = os.path.join(DATA_DIR, "vi-wiki-corpus-token.bin")

In [None]:
# Wrap the downloaded dump; tokenization is delegated to `tokenizer_func`
# with lower-casing enabled.
# NOTE(review): passing an explicit empty `dictionary` presumably keeps
# WikiCorpus from scanning the whole dump to build its own Dictionary —
# confirm against the gensim documentation.
corpus = WikiCorpus(WIKIDUMP_FILE, tokenizer_func=tokenizer_func, dictionary={}, lower=True)

In [None]:
def make_corpus():
    """Tokenize every article of ``corpus`` and pickle it into ``CORPUS_FILE``.

    Each article's token list is written as its own pickle record, one after
    another, so the file can be read back one record at a time.

    The records are first written to a temporary sibling file which is renamed
    onto ``CORPUS_FILE`` only once every article has been processed. An
    interrupted run therefore never leaves a truncated corpus file behind
    that a plain "does the file exist?" check would mistake for a complete one.

    Raises:
        Whatever the tokenization or the file writes raise; the temporary
        file is removed before the exception propagates.
    """
    tmp_path = CORPUS_FILE + ".tmp"
    print("Tokenizing corpus...")
    try:
        with open(tmp_path, "wb") as f:
            for index, text in enumerate(corpus.get_texts()):
                if index % 1000 == 0:
                    print('Processed {} articles'.format(index))
                pickle.dump(text, f)
    except BaseException:
        # BaseException on purpose: interrupting the cell raises
        # KeyboardInterrupt, which must also clean up the partial file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    # Publish the finished corpus in a single rename step.
    os.replace(tmp_path, CORPUS_FILE)
    print('Processing complete!')

In [None]:
# Set to True to rebuild the corpus even when the corpus file already exists.
OVERRIDE = False

# Build when the file is missing or a rebuild was requested; otherwise reuse it.
if OVERRIDE or not os.path.isfile(CORPUS_FILE):
    make_corpus()
else:
    print("Corpus existed...")

### Preview corpus

In [None]:
# Print a short preview of the first few records stored in the corpus file.
if not os.path.isfile(CORPUS_FILE):
    print("Cannot find corpus")
else:
    read_limit = 5    # number of records to preview
    preview_len = 10  # number of tokens shown per record
    print("First {} line(s) of corpus".format(read_limit))
    with open(CORPUS_FILE, "rb") as f:
        for _ in range(read_limit):
            try:
                data = pickle.load(f)
            except EOFError:
                # Fewer than `read_limit` records in the file.
                break
            if len(data) > preview_len:
                # Slice the record that was just loaded; loading again here
                # would skip it and print the following record instead.
                print("{} ...".format(data[:preview_len]))
            else:
                print(data)

First 5 line(s) of corpus
['tiếng', 'việt', 'cũng', 'gọi là', 'tiếng', 'hay', 'việt ngữ', 'là', 'ngôn ngữ', 'của'] ...
['còn', 'được', 'người', 'việt', 'gọi', 'vắn tắt', 'là', 'hay', 'phiên', 'là'] ...
['thành phố', 'viết', 'tắt', 'hay', 'sài gòn', 'là', 'thành phố', 'lớn', 'nhất', 'và'] ...
['là', 'tổ chức', 'tiêu chuẩn', 'quốc tế', 'chính', 'cho', 'được', 'thành lập', 'vào', 'năm'] ...
['lào', 'tên', 'chính thức', 'là', 'cộng hòa', 'lào', 'là', 'quốc gia', 'có', 'chủ quyền'] ...


## Build Word2Vec

`MAX_SENTENCE` is the maximum number of sentences to process. Set `MAX_SENTENCE` to `-1` to use the whole corpus.

In [None]:
MAX_SENTENCE = -1 # @param {type: "integer"}

In [None]:
class MySentences(object):
    """Restartable iterable over the pickled sentences in ``CORPUS_FILE``.

    Every call to ``__iter__`` reopens the corpus file and starts again from
    the first record, so the object can be iterated several times (for
    example once per training pass).

    At most ``MAX_SENTENCE`` sentences are yielded per pass; a value of ``-1``
    means no limit.
    """

    def __init__(self):
        # Number of sentences yielded by the most recent (or current) pass.
        self.count = 0

    def __iter__(self):
        """Yield the stored sentences one by one, up to ``MAX_SENTENCE``."""
        # Restart the counter for every pass; a counter carried over from a
        # previous pass would make every later pass over a limited corpus
        # come out empty.
        self.count = 0
        yielded = 0
        with open(CORPUS_FILE, "rb") as f:
            while MAX_SENTENCE == -1 or yielded < MAX_SENTENCE:
                try:
                    sentence = pickle.load(f)
                except EOFError:
                    # End of the corpus file.
                    break
                yielded += 1
                self.count = yielded
                yield sentence


class MyCallbacks(CallbackAny2Vec):
    """Training callback that prints the loss at the end of every epoch."""

    def __init__(self):
        # Index of the epoch that is about to finish, starting at 0.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the epoch number and latest loss, then reset the loss."""
        latest_loss = model.get_latest_training_loss()
        print(f"Epoch: {self.epoch}, Loss: {latest_loss}")
        # Zero the model's running loss so the next reading starts from 0
        # instead of carrying over this epoch's value.
        model.running_training_loss = 0.0
        self.epoch = self.epoch + 1


callbacks = [MyCallbacks()]
sentences = MySentences()

In [None]:
OUTPUT_DIR = "/content/out"
!mkdir -p $OUTPUT_DIR

MODEL_PATH = os.path.join(OUTPUT_DIR, "vi-word2vec.model")

In [None]:
# Train the Word2Vec model on the sentence iterable and persist it.
# Loss computation is switched on so the callbacks can print it per epoch.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=15,
    min_count=10,
    sample=1e-3,
    sg=1,
    epochs=20,
    workers=12,
    compute_loss=True,
    callbacks=callbacks,
)
model.save(MODEL_PATH)

Epoch: 0, Loss: 94236304.0
Epoch: 1, Loss: 93933872.0
Epoch: 2, Loss: 93315536.0
Epoch: 3, Loss: 92566296.0
Epoch: 4, Loss: 92068112.0
Epoch: 5, Loss: 91679168.0
Epoch: 6, Loss: 91088648.0
Epoch: 7, Loss: 90658784.0
Epoch: 8, Loss: 90247440.0
Epoch: 9, Loss: 89908392.0
Epoch: 10, Loss: 89519416.0
Epoch: 11, Loss: 89144616.0
Epoch: 12, Loss: 88493720.0
Epoch: 13, Loss: 87958984.0
Epoch: 14, Loss: 87538344.0
Epoch: 15, Loss: 87411360.0
Epoch: 16, Loss: 86728184.0
Epoch: 17, Loss: 86242384.0
Epoch: 18, Loss: 85243752.0
Epoch: 19, Loss: 83609160.0


In [None]:
model = Word2Vec.load(MODEL_PATH)

In [None]:
model.wv.most_similar(["hành động", "con người"], topn=10)

[('bản chất', 0.8088125586509705),
 ('tâm trí', 0.7885621786117554),
 ('khía cạnh', 0.787632167339325),
 ('tinh thần', 0.784379780292511),
 ('tưởng tượng', 0.7830929160118103),
 ('nhận thức', 0.7775944471359253),
 ('đạo đức', 0.77630615234375),
 ('loài người', 0.7752376198768616),
 ('siêu nhiên', 0.7739694118499756),
 ('ý thức', 0.7722036242485046)]

# Export vectors and metadata

- The exported files can be viewed using the [embedding projector](https://projector.tensorflow.org/).

In [None]:
METADATA_PATH = os.path.join(OUTPUT_DIR, "metadata.tsv")
VECTOR_PATH = os.path.join(OUTPUT_DIR, "vectors.tsv")

In [None]:
# Export every word and its vector as two line-aligned TSV files:
# line i of the metadata file is the word whose vector is line i of the
# vector file.
keys = model.wv.index_to_key

with open(METADATA_PATH, "w", encoding="utf-8") as metadata:
    # One word per line. `writelines` is given an iterable of lines here;
    # handing it a single string would write that string one character at a time.
    metadata.writelines(key + "\n" for key in keys)

with open(VECTOR_PATH, "w", encoding="utf-8") as vectors:
    for key in keys:
        # Normalize vector `norm=True`
        components = model.wv.get_vector(key, norm=True).tolist()
        vectors.write("\t".join(map(str, components)) + "\n")

# Select start words

In [None]:
# The categories we want to extract; each one is a list of seed words.
CATEGORY = [
    ["gia đình"],
    ["việc làm"],
    ["động vật", "chó"],
    ["đồ ăn"],
    ["tính cách"],
]
# Number of words per category
NO_WORDS = 10

# For every category, keep only the words (not the scores) of the entries
# most similar to its seed words.
words_list = [
    {
        "category": seeds,
        "words": [
            similar_word
            for similar_word, _score in model.wv.most_similar(seeds, topn=NO_WORDS)
        ],
    }
    for seeds in CATEGORY
]

# Show the result and store the same JSON text next to the other outputs.
words_list_json = json.dumps(words_list, indent=4, ensure_ascii=False)
print(words_list_json)

categories_path = os.path.join(OUTPUT_DIR, "categories.json")
with open(categories_path, "w", encoding='utf8') as outfile:
    outfile.write(words_list_json)

[
    {
        "category": [
            "gia đình"
        ],
        "words": [
            "cha mẹ",
            "cha",
            "ông bà",
            "mẹ",
            "chị gái",
            "cha dượng",
            "vợ",
            "bà",
            "người thân",
            "bố mẹ"
        ]
    },
    {
        "category": [
            "việc làm"
        ],
        "words": [
            "lao động",
            "khuyến khích",
            "tuyển dụng",
            "lợi ích",
            "thất nghiệp",
            "công bằng",
            "phúc lợi",
            "trợ cấp",
            "nghề nghiệp",
            "thúc đẩy"
        ]
    },
    {
        "category": [
            "động vật",
            "chó"
        ],
        "words": [
            "thú",
            "mèo",
            "gấu",
            "thỏ",
            "gấu mèo",
            "khỉ",
            "hươu",
            "săn",
            "sói",
            "lửng"
        ]
    },
    {
        "category": [
 