In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import py_vncorenlp
import re
from collections import Counter
import os
from collections import defaultdict
import numpy as np
from math import log
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import math
import re
import unicodedata
from dotenv import load_dotenv

load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
client = MongoClient(MONGO_URI)
db = client["nlp"]

article_collection = db["article"]      

In [2]:
tf_idf_collection = db["article_tf_idf"]
list_tf_idf = list(tf_idf_collection.find({}))


rows = []
for doc in list_tf_idf:
    article_id = doc['articleId']
    tf_idf = doc.get('tf_idf', {})
    tf_idf['articleId'] = article_id
    rows.append(tf_idf)

df_tf_idf_full = pd.DataFrame(rows)

df_tf_idf_full.set_index('articleId', inplace=True)
df_tf_idf_full = df_tf_idf_full.fillna(0)

In [49]:
article_collection = db["article"]
list_articles = list(article_collection.find({}))

df_articles = pd.DataFrame(list_articles)

def fix_spacing(text):
    text = re.sub(r'([.,!?;:])(?=\S)', r'\1 ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def is_writer_signature(sentence):
    sentence = sentence.strip().rstrip('.')
    
    if len(sentence.split()) <= 3 and sentence == sentence.title():
        return True
    
    if re.fullmatch(r'[A-Z]\.?', sentence) or re.fullmatch(r'[A-Z][a-z]+(\s[A-Z][a-z]+)?', sentence):
        return True

    if "Thực hiện" in sentence or sentence.upper() == sentence:
        return True

    return False

def remove_writer_name(text):
    sentences = [s.strip() for s in text.strip().split('.') if s.strip()]
    if not sentences:
        return text
    last = sentences[-1]
    if is_writer_signature(last):
        return '. '.join(sentences[:-1]) + '.' if len(sentences) > 1 else ''
    return text

# clear stuff like \n \xa0... from df['content]
df_articles['content'] = df_articles['content'].apply(lambda x: re.sub(r'[\n\r\t\xa0\u200b\u202f]+', ' ', str(x)).strip())
# fix spacing
df_articles['content'] = df_articles['content'].apply(fix_spacing)
# remove writer name
df_articles['content'] = df_articles['content'].apply(remove_writer_name)

In [37]:
def expand_query(token, model, topn=5):
    expanded_tokens = {token}  

    if token in model.wv:
        similar_words = model.wv.most_similar(token, topn=topn)
        for word, _ in similar_words:
            expanded_tokens.add(word.replace('_', ' ')) 

    return expanded_tokens


In [5]:
def rank_documents_by_query(query, tf_idf, word_model, tokenizer, stopwords, expansion_weight=0.25):
    # Tokenize query
    segmented = tokenizer.word_segment(query)
    query_tokens = []
    for sentence in segmented:
        words = sentence.split()
        words = [w.replace("_", " ") for w in words]
        words = [w.lower() for w in words if w.lower() not in stopwords]
        query_tokens.extend(words)

    word_counts = {}

    for token in query_tokens:
        word_counts[token] = word_counts.get(token, 0) + 1 
        expanded_tokens = expand_query(token, word_model, topn=5)
        for expanded in expanded_tokens:
            if expanded != token and expanded not in stopwords:
                word_counts[expanded] = word_counts.get(expanded, 0) + expansion_weight 

    total_terms = sum(word_counts.values())
    if total_terms == 0:
        return []

    word_list = tf_idf.columns
    query_vector = np.zeros(len(word_list))

    for i, term in enumerate(word_list):
        if term in word_counts:
            query_vector[i] = word_counts[term] / total_terms

    cosin_sim = cosine_similarity([query_vector], tf_idf.values)[0]

    article_ids = tf_idf.index.tolist()
    ranked = sorted(zip(article_ids, cosin_sim), key=lambda x: x[1], reverse=True)

    return ranked


In [40]:
def search_articles(query):
    results = rank_documents_by_query(query, df_tf_idf_full, word2vec_model, rdrsegmenter, stopwords)
    result_ids = results[:10]
    result_articles = list(article_collection.find({"id": {"$in": [item[0] for item in result_ids]}}))
    return result_articles

In [1]:
import py_vncorenlp
import os
original_cwd = os.getcwd()
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos", "ner", "parse"], save_dir=os.path.join(original_cwd, "vncorenlp"))
os.chdir(original_cwd)

In [2]:
rdrsegmenter.annotate_text('tình hình giao thông ở thành phố sóc trăng')

{0: [{'index': 1,
   'wordForm': 'tình_hình',
   'posTag': 'N',
   'nerLabel': 'O',
   'head': 0,
   'depLabel': 'root'},
  {'index': 2,
   'wordForm': 'giao_thông',
   'posTag': 'N',
   'nerLabel': 'O',
   'head': 1,
   'depLabel': 'nmod'},
  {'index': 3,
   'wordForm': 'ở',
   'posTag': 'E',
   'nerLabel': 'O',
   'head': 1,
   'depLabel': 'loc'},
  {'index': 4,
   'wordForm': 'thành_phố',
   'posTag': 'N',
   'nerLabel': 'O',
   'head': 3,
   'depLabel': 'pob'},
  {'index': 5,
   'wordForm': 'sóc_trăng',
   'posTag': 'N',
   'nerLabel': 'B-LOC',
   'head': 4,
   'depLabel': 'nmod'}]}

In [7]:
with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(line.strip().lower() for line in f if line.strip())
stopwords.add('sto')

In [None]:
from gensim.models import Word2Vec


word2vec_model = Word2Vec.load("word2vec_vi_bao_st.model")

In [83]:

def expand_query_enhanced(token, model, topn=5, similarity_threshold=0.5):
    expanded_tokens = [(token, 1.0)]
    
    if token in model.wv:
        similar_words = model.wv.most_similar(token, topn=topn)
        for word, similarity in similar_words:
            if similarity > similarity_threshold:
                clean_word = word.replace('_', ' ')
                expanded_tokens.append((clean_word, similarity))
    
    return expanded_tokens

def should_expand_token(token, stopwords, min_length=3):
    if token.lower() in stopwords:
        return False
    if len(token) < min_length:
        return False
    if token.isnumeric():
        return False
    return True

def rank_documents_by_query_enhanced(query, tf_idf, word_model, tokenizer, stopwords, 
                                   base_expansion_weight=0.3, 
                                   adaptive_expansion=True,
                                   similarity_threshold=0.7):
    
    segmented = tokenizer.word_segment(query)
    query_tokens = []
    for sentence in segmented:
        words = sentence.split()
        words = [w.replace("_", " ") for w in words]
        words = [w.lower() for w in words if w.lower() not in stopwords]
        query_tokens.extend(words)
    
    if not query_tokens:
        return []
    
    if adaptive_expansion:
        if len(query_tokens) <= 2:
            expansion_weight = base_expansion_weight * 1.5  
        elif len(query_tokens) >= 6:
            expansion_weight = base_expansion_weight * 0.5  
        else:
            expansion_weight = base_expansion_weight
    else:
        expansion_weight = base_expansion_weight
    
    word_counts = {}
    expansion_stats = {'original_terms': 0, 'expanded_terms': 0}
    
    for token in query_tokens:
        word_counts[token] = word_counts.get(token, 0) + 1.0
        expansion_stats['original_terms'] += 1
        
        if should_expand_token(token, stopwords):
            expanded_tokens = expand_query_enhanced(
                token, word_model, 
                topn=5, 
                similarity_threshold=similarity_threshold
            )
            
            for expanded_token, similarity in expanded_tokens[1:]: 
                if expanded_token not in stopwords and expanded_token != token:
                    weight = expansion_weight * similarity
                    word_counts[expanded_token] = word_counts.get(expanded_token, 0) + weight
                    expansion_stats['expanded_terms'] += 1
    
    total_weight = sum(word_counts.values())
    if total_weight == 0:
        return []
    
    word_list = tf_idf.columns.tolist()
    col_index = {col.lower(): idx for idx, col in enumerate(word_list)}

    query_vector = np.zeros(len(word_list))
    for tok, wt in word_counts.items():
        idx = col_index.get(tok) 
        if idx is not None:
            query_vector[idx] = wt / total_weight
    
    cosine_sim = cosine_similarity([query_vector], tf_idf.values)[0]
    
    article_ids = tf_idf.index.tolist()
    ranked = sorted(zip(article_ids, cosine_sim), key=lambda x: x[1], reverse=True)
    
    return ranked, expansion_stats, word_counts

def search_articles_enhanced(query, top_k=10):
    """
    Enhanced article search with improved query expansion
    """
    results, stats, query_tokens = rank_documents_by_query_enhanced(
        query, df_tf_idf_full, word2vec_model, rdrsegmenter, stopwords
    )
    
    # Print expansion statistics for debugging
    print(f"Query expansion stats: {stats}")
    print(f"Query tokens: {query_tokens}")
    
    result_ids = results[:top_k]
    result_articles = list(article_collection.find({
        "id": {"$in": [item[0] for item in result_ids]}
    }))
    
    return result_articles

In [85]:
for article in search_articles_enhanced('VNPT'):
    print(article['title'])
    print('https://baosoctrang.org.vn' + article['pageUrl'])
    print('-' * 80)

Query expansion stats: {'original_terms': 1, 'expanded_terms': 0}
Query tokens: {'vnpt': 1.0}
Thành phố Sóc Trăng phát sóng wifi miễn phí tại các công viên
https://baosoctrang.org.vn/thanh-pho-soc-trang-tren-duong-phat-trien/202410/thanh-pho-soc-trang-phat-song-wifi-mien-phi-tai-cac-cong-vien-c7d3d14/
--------------------------------------------------------------------------------
Thành phố Sóc Trăng phát động hưởng ứng Ngày Chuyển đổi số Quốc gia
https://baosoctrang.org.vn/thoi-su/202410/thanh-pho-soc-trang-phat-ong-huong-ung-ngay-chuyen-oi-so-quoc-gia-27231f4/
--------------------------------------------------------------------------------
Sóc Trăng phát động hưởng ứng Ngày Chuyển đổi số quốc gia
https://baosoctrang.org.vn/chuyen-doi-so/202410/soc-trang-phat-ong-huong-ung-ngay-chuyen-oi-so-quoc-gia-6a56292/
--------------------------------------------------------------------------------
Tổng kết công tác văn hóa, thể thao, du lịch và truyền thanh năm 2021
https://baosoctrang.org.vn/h

In [91]:
expand_query_enhanced('tai nạn trên quốc lộ', word2vec_model, topn=5, similarity_threshold=0.1)

[('tai nạn trên quốc lộ', 1.0)]

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer_gpt_vi = GPT2Tokenizer.from_pretrained("NlpHUST/gpt2-vietnamese", cache_dir="./transformers_cache")
model_gpt_vi = GPT2LMHeadModel.from_pretrained("NlpHUST/gpt2-vietnamese", cache_dir="./transformers_cache")

In [24]:

text = "tai nạn giao"
input_ids = tokenizer_gpt_vi.encode(text, return_tensors='pt')

sample_outputs = model_gpt_vi.generate(input_ids,pad_token_id=tokenizer_gpt_vi.eos_token_id,
                                   do_sample=True,
                                   max_length=len(text.split(' ')) + 3,
                                   min_length=len(text.split(' ')) + 1,
                                   top_k=5,
                                   num_beams=1,
                                   early_stopping=True,
                                   no_repeat_ngram_size=3,
                                   num_return_sequences=10)

for i, sample_output in enumerate(sample_outputs):
    print(">> Generated text {}\n\n{}".format(i+1, tokenizer_gpt_vi.decode(sample_output.tolist())))
    print('\n---')

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


>> Generated text 1

tai nạn giao thông mới xảy

---
>> Generated text 2

tai nạn giao thông, tai

---
>> Generated text 3

tai nạn giao thông đường sắt

---
>> Generated text 4

tai nạn giao thông nghiêm trọng

---
>> Generated text 5

tai nạn giao thông” năm

---
>> Generated text 6

tai nạn giao thông, tai

---
>> Generated text 7

tai nạn giao thông” năm

---
>> Generated text 8

tai nạn giao thông đường sắt

---
>> Generated text 9

tai nạn giao thông nghiêm trọng

---
>> Generated text 10

tai nạn giao thông” trong

---


>> Generated text 1

tai nạn giao thông mới nhất 24h qua, cập nhật tin nóng thời sự trưa ngày 24/

---
