In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import py_vncorenlp
import re
from collections import Counter
import os
from collections import defaultdict
import numpy as np
from math import log
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import math
import re
import unicodedata
from dotenv import load_dotenv
from gensim.models import Word2Vec

# Read variables from a local .env file into the process environment.
load_dotenv()

# Connection string comes from the environment so that no credentials
# are stored in the notebook itself.
MONGO_URI = os.environ.get("MONGO_URI")

# Open the client, select the "nlp" database and grab the collection
# that holds the scraped articles.
client = MongoClient(MONGO_URI)
db = client["nlp"]
article_collection = db["article"]

In [2]:
# Fetch every document of the collection (empty filter) into memory,
# then turn the list of documents into a DataFrame.
article_records = list(article_collection.find({}))
df_article = pd.DataFrame(article_records)

In [3]:
def fix_spacing(text):
    """Normalise spacing around punctuation and collapse whitespace.

    A single space is inserted after ``. , ! ? ; :`` when the mark is glued
    to the following word (``"chào.Tôi"`` -> ``"chào. Tôi"``). Two cases are
    deliberately left untouched so that meaningful tokens are not broken:

    * punctuation inside a run of punctuation (``"..."``, ``"?!"``) -- only
      the last mark of the run gets the trailing space;
    * ``.``, ``,`` or ``:`` sitting between two digits (``"3.5"``,
      ``"1,000"``, ``"10:30"``).

    Afterwards every run of whitespace is collapsed to one space and the
    result is stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    spaced = re.sub(
        # punctuation mark ...
        r'([.,!?;:])'
        # ... followed by something that is neither whitespace nor more punctuation ...
        r'(?=[^\s.,!?;:])'
        # ... unless it is a numeric separator, i.e. digit + [.,:] + digit.
        r'(?!(?<=\d[.,:])\d)',
        r'\1 ',
        text,
    )
    return re.sub(r'\s+', ' ', spaced).strip()

def is_writer_signature(sentence):
    """Heuristically decide whether a trailing sentence is an author byline.

    The sentence is stripped and trailing periods are removed, then it is
    treated as a signature when any of the following holds:

    * it has at most three words and is title-cased (``"Minh Anh"``);
    * it is a single ASCII capital letter or one/two capitalised ASCII words;
    * it contains the phrase ``"Thực hiện"``;
    * it is written entirely in upper case.

    A candidate without any alphabetic character (``"5"``, ``"2020"``,
    ``"!!!"``, the empty string) is never a signature. Such strings trivially
    equal their own ``title()`` / ``upper()`` form, so without this guard the
    tail of a number or a stray symbol would be mistaken for a byline.

    Args:
        sentence: The candidate sentence (normally the last one of an article).

    Returns:
        True if the sentence looks like a writer signature, False otherwise.
    """
    candidate = sentence.strip().rstrip('.')

    # A name needs at least one letter; digits/punctuation alone are content.
    if not any(ch.isalpha() for ch in candidate):
        return False

    # Short, title-cased fragment such as "Minh Anh" or "Nguyễn Văn A".
    if len(candidate.split()) <= 3 and candidate == candidate.title():
        return True

    # Initial letter or one/two capitalised ASCII words.
    if re.fullmatch(r'[A-Z]\.?', candidate) or re.fullmatch(r'[A-Z][a-z]+(\s[A-Z][a-z]+)?', candidate):
        return True

    # Explicit credit line, or a fragment written fully in capitals.
    if "Thực hiện" in candidate or candidate.upper() == candidate:
        return True

    return False

def remove_writer_name(text):
    """Strip a trailing author signature from an article.

    The last period-delimited fragment of ``text`` is passed to
    ``is_writer_signature``. If it is a signature, everything up to and
    including the period that precedes it is returned; the body is sliced
    out of the original string rather than re-assembled, so ellipses,
    numbers and the original spacing inside the article are preserved.

    Args:
        text: The article text.

    Returns:
        ``text`` unchanged when there is no content or the last fragment is
        not a signature; the article without its final fragment when it is;
        an empty string when the signature is the only content.
    """
    # Ignore trailing periods/whitespace so the last real fragment ends the string.
    trimmed = re.sub(r'[\s.]+$', '', text)
    if not trimmed:
        # Nothing but periods and whitespace: no sentences to inspect.
        return text

    # Position of the period that ends the body (-1 if there is a single fragment).
    cut = trimmed.rfind('.')
    last_fragment = trimmed[cut + 1:].strip()
    if not is_writer_signature(last_fragment):
        return text

    body = trimmed[:cut + 1]
    # The signature was the only real content (body is empty or just periods).
    if re.fullmatch(r'[\s.]*', body):
        return ''
    return body.strip()

def _clean_content(raw):
    """Clean one raw ``content`` value.

    Steps, in order: replace runs of control/invisible whitespace
    (``\\n \\r \\t \\xa0 \\u200b \\u202f``) with a single space and strip,
    normalise spacing with ``fix_spacing``, then drop a trailing writer
    signature with ``remove_writer_name``.

    Missing values (``None`` or a float NaN) are returned as ``None`` instead
    of being passed through ``str()``: otherwise a NaN would become the
    literal text "nan", enter the corpus as a word, and a later ``.dropna()``
    would have nothing left to drop.
    """
    if raw is None or (isinstance(raw, float) and raw != raw):
        return None
    text = re.sub(r'[\n\r\t\xa0\u200b\u202f]+', ' ', str(raw)).strip()
    return remove_writer_name(fix_spacing(text))

# Clean df_article['content']: strip \n, \xa0, ... characters, fix spacing
# and remove the writer name, in a single pass over the column.
df_article['content'] = df_article['content'].apply(_clean_content)

In [4]:
import py_vncorenlp
import os
# Remember where the notebook is running: the working directory is restored
# right after the segmenter is created (presumably because VnCoreNLP switches
# into its model directory while loading -- confirm against py_vncorenlp).
original_cwd = os.getcwd()
try:
    # Word-segmentation only; model files are expected under ./vncorenlp.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(
        annotators=["wseg"],
        save_dir=os.path.join(original_cwd, "vncorenlp"),
    )
finally:
    # Restore the working directory even if loading fails, so that the
    # relative paths used by later cells still resolve.
    os.chdir(original_cwd)

In [5]:
# Build the stopword set: one entry per non-blank line, trimmed and lower-cased.
with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = {
        entry.strip().lower()
        for entry in stopword_file
        if entry.strip()
    }

# Extra corpus-specific token to discard.
stopwords.add('sto')

In [41]:
def is_noise_token(w):
    """Return True when ``w`` is non-empty and consists solely of
    non-word characters and/or underscores (e.g. ``","``, ``"..."``, ``"_"``)."""
    only_symbols = re.fullmatch(r'[\W_]+', w)
    return bool(only_symbols)

def segment_article(article):
    """Word-segment an article and return cleaned tokens per sentence.

    ``rdrsegmenter.word_segment`` is called on the article; each returned
    sentence is split on whitespace into tokens. Tokens made only of
    symbols (see ``is_noise_token``) and stopwords are dropped; surviving
    tokens are lower-cased and their underscores replaced by spaces.

    The stopword test is done on both the underscore-joined and the
    space-joined lower-case form of the token. Checking only the
    underscore form would let multi-word stopwords through whenever the
    stopword list stores them with spaces (NOTE(review): the list format
    is not visible here -- checking both forms is correct either way).

    Args:
        article: Raw article text.

    Returns:
        A list with one list of token strings per segmented sentence
        (a sentence may yield an empty list).
    """
    tokenized_sentences = []

    for sentence in rdrsegmenter.word_segment(article):
        kept_tokens = []
        for raw_token in sentence.split():
            if is_noise_token(raw_token):
                continue

            lowered = raw_token.lower()
            # Compound words arrive joined by "_"; the output uses spaces.
            normalized = lowered.replace("_", " ")
            if lowered in stopwords or normalized in stopwords:
                continue

            kept_tokens.append(normalized)
        tokenized_sentences.append(kept_tokens)

    return tokenized_sentences

In [42]:
all_tokenized_sentences = []
failed_articles = 0

# Select the non-null articles once, so the progress-bar total is the number
# of texts actually iterated (not the length of the un-filtered column).
article_texts = df_article['content'].dropna()

for text in tqdm(article_texts, total=len(article_texts)):
    try:
        all_tokenized_sentences.extend(segment_article(text))
    except Exception as e:
        # Best effort: one bad article must not abort the run, but keep
        # count so the loss is visible in the summary below.
        failed_articles += 1
        print(f"Error segmenting text: {e}")

if failed_articles:
    print(f"Skipped {failed_articles} of {len(article_texts)} articles because segmentation failed")


100%|██████████| 6347/6347 [00:46<00:00, 136.29it/s]


In [None]:
# Training hyper-parameters, gathered in one place so they are easy to tune.
word2vec_params = {
    "vector_size": 300,  # dimensionality of the embeddings
    "window": 6,         # context window size
    "min_count": 5,      # ignore words seen fewer than 5 times
    "sg": 1,             # 1 = skip-gram
    "workers": 4,        # training threads
}

# Train on the token lists produced by the segmentation step.
model = Word2Vec(sentences=all_tokenized_sentences, **word2vec_params)

In [None]:
# Uncomment to persist the trained model to disk (it can be reloaded later with Word2Vec.load):
#model.save("word2vec_vi_bao_st.model")