In [1]:
import os
import json
import py_vncorenlp
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
json_dir = "/workspace/Vi-VLM-TTDN/data/wiki_corpus/extracted"
embedding_model_name = "dangvantuan/vietnamese-embedding"
vncorenlp_path = "/workspace/Vi-VLM-TTDN/modules/vncorenlp"
chunk_size = 1
overlap = 0

In [3]:
embed_model = SentenceTransformer(embedding_model_name)
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
segmenter = py_vncorenlp.VnCoreNLP(save_dir=vncorenlp_path, annotators=["wseg"])

2025-04-01 12:30:44 INFO  WordSegmenter:24 - Loading Word Segmentation model


In [4]:
corpus = []
list_chunks = []
metadata = []
embeddings = []

In [5]:
# load all doc to corpus list
for root, dirs, files in os.walk(json_dir):
    for filename in tqdm(files):
        file_path = os.path.join(root, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    if isinstance(data, dict):
                        corpus.append(data)
                except json.JSONDecodeError as e:
                    print(f"[WARNING] Skipping malformed line in file {filename} : {e}")

print(f"The keys in each article dictionary: {corpus[2].keys()}")
print(f"The number of articles: {len(corpus)}")

0it [00:00, ?it/s]
100%|██████████| 100/100 [00:00<00:00, 283.64it/s]
100%|██████████| 100/100 [00:00<00:00, 257.29it/s]
100%|██████████| 100/100 [00:00<00:00, 252.50it/s]
100%|██████████| 100/100 [00:00<00:00, 233.45it/s]
100%|██████████| 100/100 [00:00<00:00, 224.77it/s]
100%|██████████| 100/100 [00:00<00:00, 219.44it/s]
100%|██████████| 100/100 [00:00<00:00, 199.47it/s]
100%|██████████| 100/100 [00:00<00:00, 147.27it/s]
100%|██████████| 100/100 [00:00<00:00, 155.93it/s]
100%|██████████| 100/100 [00:00<00:00, 143.59it/s]
100%|██████████| 100/100 [00:00<00:00, 174.37it/s]
100%|██████████| 100/100 [00:00<00:00, 135.80it/s]
100%|██████████| 100/100 [00:00<00:00, 233.26it/s]
100%|██████████| 100/100 [00:00<00:00, 230.38it/s]
100%|██████████| 100/100 [00:00<00:00, 221.97it/s]
100%|██████████| 100/100 [00:00<00:00, 220.17it/s]
100%|██████████| 100/100 [00:00<00:00, 234.61it/s]
100%|██████████| 100/100 [00:00<00:00, 236.43it/s]
100%|██████████| 100/100 [00:00<00:00, 234.23it/s]
100%|███████

The keys in each article dictionary: dict_keys(['id', 'revid', 'url', 'title', 'text'])
The number of articles: 1591518





In [6]:
# Take 100 first particle to load
part_article = corpus[0:100]

In [7]:
def chunk_by_sentence(segmented_sentences: list, overlap: int = 6):
    assert 0 <= overlap < chunk_size
    step = chunk_size - overlap
    chunks = []
    last_index = 0
    for i in range(0, len(segmented_sentences) - chunk_size + 1, step):
        chunk = " ".join(segmented_sentences[i : i + chunk_size])
        chunks.append(chunk)
        last_index = i + step       
    if last_index < len(segmented_sentences):
        chunks.append(" ".join(segmented_sentences[-chunk_size:]))
    return chunks 

In [8]:
for doc_id, doc in enumerate(tqdm(part_article, desc="Processing article")):
    title = doc.get("title", f"{doc_id}")
    text = doc.get("text", "")
    # if not text.strip():
    #     continue
    
    # Return a list of segmented sentences
    segmented_text = segmenter.word_segment(text)
    chunks_text = chunk_by_sentence(segmented_text, overlap)
    list_chunks.append(chunks_text)

Processing article: 100%|██████████| 100/100 [00:19<00:00,  5.21it/s]


In [13]:
for chunk_list in tqdm(list_chunks, desc="Embedding chunks"):
    for chunk in tqdm(chunk_list):
        embedding_chunk = embed_model.encode(chunk, convert_to_numpy=True, normalize_embeddings=True)
        embeddings.append(embedding_chunk)

0it [00:00, ?it/s]  0%|          | 0/100 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 34.52it/s]
100%|██████████| 10/10 [00:00<00:00, 133.80it/s]
0it [00:00, ?it/s]  3%|▎         | 3/100 [00:00<00:03, 26.67it/s]
100%|██████████| 175/175 [00:01<00:00, 152.77it/s]
100%|██████████| 47/47 [00:00<00:00, 160.54it/s]
0it [00:00, ?it/s]  6%|▌         | 6/100 [00:01<00:28,  3.33it/s]
100%|██████████| 169/169 [00:01<00:00, 161.95it/s]
100%|██████████| 554/554 [00:03<00:00, 162.14it/s]:35,  2.62it/s]
100%|██████████| 597/597 [00:03<00:00, 164.24it/s]:31,  1.00s/it]
100%|██████████| 205/205 [00:01<00:00, 165.68it/s]2:23,  1.60s/it]
100%|██████████| 74/74 [00:00<00:00, 167.69it/s]<02:14,  1.51s/it]
100%|██████████| 21/21 [00:00<00:00, 163.99it/s]<01:48,  1.24s/it]
0it [00:00, ?it/s] 13%|█▎        | 13/100 [00:11<01:21,  1.06it/s]
100%|██████████| 107/107 [00:00<00:00, 172.19it/s]
100%|██████████| 1056/1056 [00:06<00:00, 170.55it/s]56,  1.50it/s]
100%|██████████| 199/199 [00:01<00:00, 170.02i

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [39]:
len(list_chunks[17][25])

107

In [1]:
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize

sentences = ["Hà Nội là thủ đô của Việt Nam", "Đà Nẵng là thành phố du lịch"]
tokenizer_sent = [tokenize(sent) for sent in sentences]

model = SentenceTransformer('dangvantuan/vietnamese-embedding')
embeddings = model.encode(tokenizer_sent)
print(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


[[ 0.29940698  0.16307998 -0.2462025  ... -0.00254658  0.03522903
   0.3837834 ]
 [ 0.20620279  0.16631396 -0.14708327 ...  0.31187764 -0.6178886
  -0.3789772 ]]


In [3]:
tokenizer_sent[0]

'Hà_Nội là thủ_đô của Việt_Nam'

In [2]:
len(tokenizer_sent[0])

29