In [1]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import py_vncorenlp
import re
from collections import Counter
import os
from collections import defaultdict
import numpy as np
from math import log
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import math
import re
import unicodedata
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The connection string comes from the environment so it is not hard-coded here.
# NOTE(review): os.getenv returns None when MONGO_URI is unset; presumably
# MongoClient(None) then falls back to its default host -- confirm this is intended.
MONGO_URI = os.getenv("MONGO_URI")
client = MongoClient(MONGO_URI)
# All collections used by this notebook live in the "nlp" database.
db = client["nlp"]

# Collection queried by the search_* helpers further down the notebook.
article_collection = db["article"]

In [2]:
# Read every stored per-article TF-IDF document into memory.
tf_idf_collection = db["article_tf_idf"]
list_tf_idf = list(tf_idf_collection.find({}))


# One row per article: the document's 'tf_idf' mapping plus the article id.
# (assumes 'tf_idf' is a flat term -> score dict -- TODO confirm against the writer)
rows = []
for doc in list_tf_idf:
    article_id = doc['articleId']
    # Documents without a 'tf_idf' field contribute a row with no term scores.
    tf_idf = doc.get('tf_idf', {})
    # NOTE: no copy is made, so this writes the id into the dict held by `doc`
    # itself; a term literally named 'articleId' would be overwritten.
    tf_idf['articleId'] = article_id
    rows.append(tf_idf)

# Columns are the union of the keys seen across all rows.
df_tf_idf_full = pd.DataFrame(rows)

# Index by article id, then replace the NaN left by absent terms with 0.
df_tf_idf_full.set_index('articleId', inplace=True)
df_tf_idf_full = df_tf_idf_full.fillna(0)

In [3]:
# Re-binds the same "article" collection handle that the setup cell created.
article_collection = db["article"]
# Materialise the whole collection; every document is kept in memory.
list_articles = list(article_collection.find({}))

# One row per article document, one column per document field.
df_articles = pd.DataFrame(list_articles)

In [4]:
def expand_query(token, model, topn=5):
    """Return the token together with its nearest Word2Vec neighbours.

    Args:
        token: Query term to expand.
        model: Object exposing ``model.wv`` with membership testing and
            ``most_similar(token, topn=...)``.
        topn: Number of neighbours requested from the model.

    Returns:
        A set containing ``token`` plus each neighbour with ``_`` replaced by
        a space. Only ``{token}`` when the token is unknown to the model.
    """
    vectors = model.wv
    if token not in vectors:
        return {token}

    neighbours = vectors.most_similar(token, topn=topn)
    return {token} | {neighbour.replace('_', ' ') for neighbour, _score in neighbours}


def expand_query_enhanced(token, model, topn=5, similarity_threshold=0.5):
    """Expand a token into weighted neighbours above a similarity cut-off.

    Args:
        token: Query term to expand.
        model: Object exposing ``model.wv`` with membership testing and
            ``most_similar(token, topn=...)``.
        topn: Number of neighbours requested from the model.
        similarity_threshold: Neighbours are kept only when their similarity
            is strictly greater than this value.

    Returns:
        A list of ``(term, weight)`` pairs. The first entry is always
        ``(token, 1.0)``; the rest are neighbours (``_`` replaced by a space)
        paired with their similarity, in the order the model returned them.
    """
    weighted_terms = [(token, 1.0)]

    vectors = model.wv
    if token not in vectors:
        return weighted_terms

    weighted_terms.extend(
        (neighbour.replace('_', ' '), score)
        for neighbour, score in vectors.most_similar(token, topn=topn)
        if score > similarity_threshold
    )
    return weighted_terms

def should_expand_token(token, stopwords, min_length=3):
    """Decide whether a query token is worth expanding.

    Args:
        token: Candidate query term.
        stopwords: Container of lower-cased stopwords.
        min_length: Tokens shorter than this many characters are rejected.

    Returns:
        False for stopwords (compared case-insensitively), for tokens shorter
        than ``min_length`` and for purely numeric tokens. True otherwise.
    """
    rejected = (
        token.lower() in stopwords
        or len(token) < min_length
        or token.isnumeric()
    )
    return not rejected


In [5]:
def rank_documents_by_query(query, tf_idf, word_model, tokenizer, stopwords, expansion_weight=0.25):
    """Rank articles against a query using TF-IDF cosine similarity.

    Each non-stopword query token counts 1; every Word2Vec neighbour returned
    by ``expand_query`` adds ``expansion_weight``.

    Args:
        query: Raw query string.
        tf_idf: DataFrame whose columns are terms and whose index holds the
            article ids.
        word_model: Model forwarded to ``expand_query``.
        tokenizer: Object with ``word_segment(query)`` returning sentences in
            which multi-word tokens are joined by ``_``.
        stopwords: Container of lower-cased stopwords.
        expansion_weight: Count contributed by each expanded term.

    Returns:
        A list of ``(article_id, similarity)`` pairs sorted by similarity in
        descending order, or ``[]`` when no query term survives filtering.
    """
    # Tokenize: restore spaces inside segmented words, lower-case, drop stopwords.
    query_tokens = []
    for sentence in tokenizer.word_segment(query):
        for raw_word in sentence.split():
            lowered = raw_word.replace("_", " ").lower()
            if lowered not in stopwords:
                query_tokens.append(lowered)

    # Accumulate weighted term counts for originals and their expansions.
    word_counts = {}
    for token in query_tokens:
        word_counts[token] = word_counts.get(token, 0) + 1
        for expanded in expand_query(token, word_model, topn=5):
            if expanded == token or expanded in stopwords:
                continue
            word_counts[expanded] = word_counts.get(expanded, 0) + expansion_weight

    total_terms = sum(word_counts.values())
    if total_terms == 0:
        return []

    # Project the weighted counts onto the TF-IDF vocabulary.
    vocabulary = tf_idf.columns
    query_vector = np.zeros(len(vocabulary))
    for position, term in enumerate(vocabulary):
        if term in word_counts:
            query_vector[position] = word_counts[term] / total_terms

    scores = cosine_similarity([query_vector], tf_idf.values)[0]

    scored_articles = list(zip(tf_idf.index.tolist(), scores))
    scored_articles.sort(key=lambda pair: pair[1], reverse=True)
    return scored_articles


In [6]:
import py_vncorenlp
import os
# Remember the notebook's working directory before creating the segmenter.
original_cwd = os.getcwd()
# Word-segmentation-only VnCoreNLP pipeline; its files are expected under ./vncorenlp.
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=os.path.join(original_cwd, "vncorenlp"))
# NOTE(review): presumably the constructor changes the process working directory,
# which is why it is restored here -- confirm against py_vncorenlp.
os.chdir(original_cwd)

In [7]:
# One stopword per line; blank lines are skipped and entries are lower-cased.
with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(line.strip().lower() for line in f if line.strip())
# Extra corpus-specific stopword.
# NOTE(review): presumably an abbreviation seen in the articles -- confirm why 'sto' is excluded.
stopwords.add('sto')

In [8]:
def rank_documents_by_query_enhanced(query, tf_idf, word_model, tokenizer, stopwords, 
                                   base_expansion_weight=0.3, 
                                   adaptive_expansion=True,
                                   similarity_threshold=0.5):
    """Rank articles against a query with similarity-weighted query expansion.

    Args:
        query: Raw query string.
        tf_idf: DataFrame whose columns are terms and whose index holds the
            article ids.
        word_model: Model forwarded to ``expand_query_enhanced``.
        tokenizer: Object with ``word_segment(query)`` returning sentences in
            which multi-word tokens are joined by ``_``.
        stopwords: Container of lower-cased stopwords.
        base_expansion_weight: Base weight of an expanded term before it is
            scaled by its similarity.
        adaptive_expansion: When True the base weight is multiplied by 1.5 for
            queries of at most 2 tokens and by 0.5 for queries of 6 or more.
        similarity_threshold: Minimum similarity forwarded to
            ``expand_query_enhanced``.

    Returns:
        Always a ``(ranked, expansion_stats)`` tuple. ``ranked`` is a list of
        ``(article_id, similarity)`` pairs sorted by similarity in descending
        order (empty when no query term survives filtering), and
        ``expansion_stats`` counts the original and expanded terms used.
    """
    # Created up front so every exit path returns the same (ranked, stats)
    # shape; callers unpack two values even for an empty query.
    expansion_stats = {'original_terms': 0, 'expanded_terms': 0}

    # Tokenize: restore spaces inside segmented words, lower-case, drop stopwords.
    query_tokens = []
    for sentence in tokenizer.word_segment(query):
        lowered_words = (w.replace("_", " ").lower() for w in sentence.split())
        query_tokens.extend(w for w in lowered_words if w not in stopwords)

    if not query_tokens:
        return [], expansion_stats

    # Short queries lean more on expansion, long queries less.
    expansion_weight = base_expansion_weight
    if adaptive_expansion:
        if len(query_tokens) <= 2:
            expansion_weight = base_expansion_weight * 1.5
        elif len(query_tokens) >= 6:
            expansion_weight = base_expansion_weight * 0.5

    word_counts = {}
    for token in query_tokens:
        # The original token always counts fully.
        word_counts[token] = word_counts.get(token, 0) + 1.0
        expansion_stats['original_terms'] += 1

        if not should_expand_token(token, stopwords):
            continue

        expanded_tokens = expand_query_enhanced(
            token, word_model,
            topn=5,
            similarity_threshold=similarity_threshold
        )

        # Entry 0 is the token itself; the rest are (neighbour, similarity).
        for expanded_token, similarity in expanded_tokens[1:]:
            if expanded_token not in stopwords and expanded_token != token:
                weight = expansion_weight * similarity
                word_counts[expanded_token] = word_counts.get(expanded_token, 0) + weight
                expansion_stats['expanded_terms'] += 1

    total_weight = sum(word_counts.values())
    if total_weight == 0:
        return [], expansion_stats

    # Project the weighted counts onto the TF-IDF vocabulary.
    word_list = tf_idf.columns
    query_vector = np.zeros(len(word_list))
    for i, term in enumerate(word_list):
        if term in word_counts:
            query_vector[i] = word_counts[term] / total_weight

    cosine_sim = cosine_similarity([query_vector], tf_idf.values)[0]

    article_ids = tf_idf.index.tolist()
    ranked = sorted(zip(article_ids, cosine_sim), key=lambda x: x[1], reverse=True)

    return ranked, expansion_stats

def search_articles_enhanced(query, top_k=10):
    """Search articles with the enhanced query expansion.

    Uses the notebook globals ``df_tf_idf_full``, ``word2vec_model``,
    ``rdrsegmenter``, ``stopwords`` and ``article_collection``.

    Args:
        query: Raw query string.
        top_k: Maximum number of ranked article ids to look up.

    Returns:
        The matching article documents, best match first.
    """
    results, stats = rank_documents_by_query_enhanced(
        query, df_tf_idf_full, word2vec_model, rdrsegmenter, stopwords
    )

    # Print expansion statistics for debugging
    print(f"Query expansion stats: {stats}")

    top_ids = [article_id for article_id, _score in results[:top_k]]
    if not top_ids:
        return []

    # A `$in` lookup returns documents in storage order, not in the order of
    # the id list, so remember each id's rank and re-sort afterwards.
    rank_of = {}
    for position, article_id in enumerate(top_ids):
        rank_of.setdefault(article_id, position)

    result_articles = list(article_collection.find({"id": {"$in": top_ids}}))
    result_articles.sort(key=lambda doc: rank_of.get(doc.get("id"), len(top_ids)))

    return result_articles

In [9]:
from gensim.models import Word2Vec


# Pre-trained Word2Vec model used for query expansion by the search_* helpers.
# The path is relative, so the file must sit in the notebook's working directory.
word2vec_model = Word2Vec.load("word2vec_vi_bao_st.model")

In [10]:
def search_articles(query):
    """Search articles with the basic query expansion.

    Uses the notebook globals ``df_tf_idf_full``, ``word2vec_model``,
    ``rdrsegmenter``, ``stopwords`` and ``article_collection``.

    Args:
        query: Raw query string.

    Returns:
        Up to the 10 best-matching article documents, best match first.
    """
    results = rank_documents_by_query(query, df_tf_idf_full, word2vec_model, rdrsegmenter, stopwords)
    top_ids = [article_id for article_id, _score in results[:10]]
    if not top_ids:
        return []

    # A `$in` lookup returns documents in storage order, not in the order of
    # the id list, so remember each id's rank and re-sort afterwards.
    rank_of = {}
    for position, article_id in enumerate(top_ids):
        rank_of.setdefault(article_id, position)

    result_articles = list(article_collection.find({"id": {"$in": top_ids}}))
    result_articles.sort(key=lambda doc: rank_of.get(doc.get("id"), len(top_ids)))
    return result_articles

In [11]:
# Demo: print the title of each article returned for a sample query,
# separated by a rule of 80 dashes.
for article in search_articles_enhanced('tai nạn trên quốc lộ'):
    print(article['title'])
    # print('https://baosoctrang.org.vn' + article['pageUrl'])
    print('-' * 80)

Query expansion stats: {'original_terms': 2, 'expanded_terms': 10}
Tích cực thực hiện chương trình phòng, chống tai nạn, thương tích trẻ em
--------------------------------------------------------------------------------
Tăng cường kiểm soát tốc độ phương tiện tuyến Quốc lộ Quản lộ Phụng Hiệp
--------------------------------------------------------------------------------
Tạo môi trường an toàn cho trẻ em vùng sông nước TX. Ngã Năm
--------------------------------------------------------------------------------
Xây dựng ngôi nhà an toàn cho trẻ
--------------------------------------------------------------------------------
Huyện Châu Thành phấn đấu giảm tai nạn giao thông cả 3 tiêu chí
--------------------------------------------------------------------------------
Xe cộ lúa cán cháu bé 6 tuổi tử vong tại chỗ
--------------------------------------------------------------------------------
Nhiều hoạt động trong phòng, chống tai nạn thương tích trẻ em
-----------------------------------

In [12]:
import re
from collections import defaultdict, Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class ContextAwareQueryExpander:
    """Expand a query using Word2Vec neighbours and phrase co-occurrence.

    On construction the expander scans the TF-IDF vocabulary for multi-word
    terms that occur in enough documents, then derives word-to-word strengths
    from the words that share such a phrase. ``expand_query`` turns a token
    list into a ``term -> weight`` mapping.

    Args:
        word_model: Object exposing ``word_model.wv`` with membership testing
            and ``most_similar(word, topn=...)``.
        tf_idf_matrix: DataFrame whose columns are vocabulary terms and whose
            rows are documents.
        min_phrase_freq: Minimum number of documents with a positive score
            for a multi-word term to be kept as a phrase.
    """

    def __init__(self, word_model, tf_idf_matrix, min_phrase_freq=5):
        self.word_model = word_model
        self.tf_idf_matrix = tf_idf_matrix
        self.min_phrase_freq = min_phrase_freq
        self._build_phrase_vocabulary()
        self._build_context_relationships()
    
    def _build_phrase_vocabulary(self):
        """
        Extract common phrases from TF-IDF vocabulary.

        Fills ``self.phrase_vocab`` with ``phrase -> {'words', 'frequency',
        'avg_score'}`` for every 2-4 word term scored above zero in at least
        ``min_phrase_freq`` documents.
        """
        self.phrase_vocab = {}
        vocabulary = self.tf_idf_matrix.columns
        
        # Find multi-word terms that appear frequently enough
        for term in vocabulary:
            if ' ' in term and len(term.split()) <= 4:  # 2-4 word phrases
                # Document frequency = number of rows with a positive score
                phrase_scores = self.tf_idf_matrix[term]
                non_zero_count = (phrase_scores > 0).sum()
                
                if non_zero_count >= self.min_phrase_freq:
                    words = term.split()
                    self.phrase_vocab[term] = {
                        'words': words,
                        'frequency': non_zero_count,
                        'avg_score': phrase_scores.mean()
                    }
    
    def _build_context_relationships(self):
        """
        Build relationships between words based on co-occurrence in phrases.

        ``self.context_relationships[w1][w2]`` accumulates, over every phrase
        containing both words, ``avg_score * frequency / (distance + 1)``.
        """
        self.context_relationships = defaultdict(lambda: defaultdict(float))
        
        for phrase, info in self.phrase_vocab.items():
            words = info['words']
            phrase_strength = info['avg_score'] * info['frequency']
            
            # Create bidirectional relationships between words in phrases
            for i, word1 in enumerate(words):
                for j, word2 in enumerate(words):
                    if i != j:
                        # Distance penalty: closer words have stronger relationship
                        distance_penalty = 1.0 / (abs(i - j) + 1)
                        strength = phrase_strength * distance_penalty
                        self.context_relationships[word1][word2] += strength
    
    def detect_query_phrases(self, query_tokens):
        """
        Dynamically detect phrases in the query based on vocabulary.

        Args:
            query_tokens: Query tokens; a token may itself contain spaces.

        Returns:
            A list of dicts with keys ``phrase``, ``positions``,
            ``importance`` and ``frequency`` (plus ``constructed`` for phrases
            assembled from consecutive tokens). ``positions`` is a
            ``(start, end)`` pair of whitespace-word indices into the joined
            query text.
        """
        detected_phrases = []
        query_text = ' '.join(query_tokens)
        
        # Sort phrases by length (longer first) to avoid partial matches
        sorted_phrases = sorted(self.phrase_vocab.keys(), 
                              key=lambda x: len(x.split()), reverse=True)
        
        # Word indices (not token indices) already claimed by a phrase
        used_positions = set()
        
        for phrase in sorted_phrases:
            # Find all occurrences of this phrase in query
            pattern = r'\b' + re.escape(phrase) + r'\b'
            matches = list(re.finditer(pattern, query_text, re.IGNORECASE))
            
            for match in matches:
                # Convert the character offset into a word index
                start_pos = len(query_text[:match.start()].split())
                end_pos = start_pos + len(phrase.split())
                
                # Check if this position is already used by a longer phrase
                pos_range = set(range(start_pos, end_pos))
                if not pos_range.intersection(used_positions):
                    detected_phrases.append({
                        'phrase': phrase,
                        'positions': (start_pos, end_pos),
                        'importance': self.phrase_vocab[phrase]['avg_score'],
                        'frequency': self.phrase_vocab[phrase]['frequency']
                    })
                    used_positions.update(pos_range)
        
        # Also try to construct phrases from consecutive query tokens
        self._add_constructed_phrases(query_tokens, detected_phrases, used_positions)
        
        return detected_phrases
    
    def _add_constructed_phrases(self, query_tokens, detected_phrases, used_positions):
        """
        Try to construct meaningful phrases from consecutive tokens even if not in vocab.

        Appends to ``detected_phrases`` and updates ``used_positions`` in
        place. Only runs of 4 and then 3 consecutive tokens are tried.
        """
        # `used_positions` holds whitespace-word indices (see
        # detect_query_phrases), while a token may span several words.
        # Translate token indices into word offsets so both overlap checks
        # use the same coordinate system.
        word_offsets = [0]
        for token in query_tokens:
            word_offsets.append(word_offsets[-1] + len(token.split()))

        # Try consecutive combinations, longest first
        for length in range(min(len(query_tokens), 4), 2, -1):  # 4-token runs, then 3-token runs
            for i in range(len(query_tokens) - length + 1):
                candidate_phrase = ' '.join(query_tokens[i:i+length])
                start_pos, end_pos = word_offsets[i], word_offsets[i + length]
                pos_range = set(range(start_pos, end_pos))
                
                # Skip if positions already used by a detected phrase
                if pos_range.intersection(used_positions):
                    continue
                
                # Check if this constructed phrase might be meaningful
                if self._is_meaningful_phrase(candidate_phrase, query_tokens):
                    detected_phrases.append({
                        'phrase': candidate_phrase,
                        'positions': (start_pos, end_pos),
                        'importance': 3.0,  # Higher importance for constructed phrases
                        'frequency': 15,  # Assume reasonable frequency
                        'constructed': True
                    })
                    used_positions.update(pos_range)
                    print(f"Constructed phrase: '{candidate_phrase}'")  # Debug
                    # Continue to try other non-overlapping phrases
    
    def _is_meaningful_phrase(self, phrase, query_tokens=None):
        """
        Heuristics to determine if a constructed phrase might be meaningful.

        Returns True for any phrase when the query has at most 4 tokens;
        otherwise True only if at least one word of the phrase is known to
        the Word2Vec model.
        """
        words = phrase.split()
        
        # For short queries, any multi-word combination could be meaningful
        if query_tokens and len(query_tokens) <= 4:
            return True
            
        # At least one word should be in our word2vec model
        model_coverage = sum(1 for word in words if word in self.word_model.wv)
        if model_coverage == 0:
            return False
            
        # For 3+ word phrases, be more permissive
        if len(words) >= 3:
            return True
            
        return model_coverage >= 1
    
    def get_contextual_expansions(self, word, context_words, topn=5):
        """
        Get expansions for a word considering its context.

        Args:
            word: Token to expand.
            context_words: Neighbouring tokens used to score candidates.
            topn: Number of expansions to return; ``2 * topn`` candidates are
                requested from the model before re-ranking.

        Returns:
            Up to ``topn`` ``(candidate, combined_score)`` pairs, best first,
            where the score is ``0.7 * similarity`` plus a context bonus
            capped at 0.3. Empty when the word is unknown to the model.
        """
        expansions = []
        relationships = self.context_relationships
        
        # Get Word2Vec expansions
        if word in self.word_model.wv:
            similar_words = self.word_model.wv.most_similar(word, topn=topn*2)
            
            for candidate, similarity in similar_words:
                candidate = candidate.replace('_', ' ')
                
                # Calculate context relevance. Use .get() rather than [] so
                # lookups do not insert empty entries into the shared
                # defaultdict on every query.
                candidate_links = relationships.get(candidate, {})
                context_score = 0.0
                for context_word in context_words:
                    context_score += candidate_links.get(context_word, 0.0)
                    context_score += relationships.get(context_word, {}).get(candidate, 0.0)
                
                # Normalize context score
                context_score = context_score / (len(context_words) + 1)
                
                # Combined score: word2vec similarity + context relevance
                combined_score = similarity * 0.7 + min(context_score * 0.3, 0.3)
                
                expansions.append((candidate, combined_score))
        
        # Sort by combined score and return top candidates
        expansions.sort(key=lambda x: x[1], reverse=True)
        return expansions[:topn]
    
    def expand_query(self, query_tokens, stopwords, base_weight=1.0, 
                    phrase_weight=3.0, expansion_weight=0.3):
        """
        Expand query with context awareness - prioritize longer, more specific phrases.

        Args:
            query_tokens: Query tokens; a token may itself contain spaces.
            stopwords: Container of lower-cased stopwords.
            base_weight: Weight of a plain query token.
            phrase_weight: Base weight of a detected phrase, scaled by
                ``1 + log(frequency)``.
            expansion_weight: Multiplier applied to each expansion's score.

        Returns:
            A plain dict mapping each term (phrase, token or expansion) to
            its accumulated weight.
        """
        word_weights = defaultdict(float)
        
        # Detect phrases first
        detected_phrases = self.detect_query_phrases(query_tokens)
        
        # Sort phrases by length (longer = more specific = higher priority)
        detected_phrases.sort(key=lambda x: len(x['phrase'].split()), reverse=True)
        
        # Maps each individual word of a phrase to the phrases that cover it.
        # NOTE(review): keys are whitespace-split words, so a query token
        # that itself contains a space never matches a key here -- confirm
        # whether that is intended.
        phrase_coverage = {}
        
        for phrase_info in detected_phrases:
            phrase = phrase_info['phrase']
            frequency = phrase_info.get('frequency', 10)
            is_constructed = phrase_info.get('constructed', False)
            
            # Higher weight for longer, more specific phrases
            phrase_words = phrase.split()
            phrase_length_bonus = len(phrase_words) * 0.5
            phrase_final_weight = phrase_weight * (1 + np.log(frequency)) + phrase_length_bonus
            
            # Even higher bonus for constructed phrases (like "giáo dục mầm non")
            if is_constructed:
                phrase_final_weight *= 1.5
            
            word_weights[phrase] += phrase_final_weight
            
            # Track phrase coverage
            for i, word in enumerate(phrase_words):
                phrase_coverage.setdefault(word, []).append({
                    'phrase': phrase, 
                    'length': len(phrase_words),
                    'position': i,
                    'weight': phrase_final_weight
                })
        
        # Process individual tokens with phrase context awareness
        for i, token in enumerate(query_tokens):
            if token.lower() in stopwords:
                continue
            
            # Determine token weight based on phrase membership
            token_weight = base_weight
            
            if token in phrase_coverage:
                # Token is part of one or more phrases
                # Reduce individual token weight if it's part of a longer phrase
                max_phrase_length = max(pc['length'] for pc in phrase_coverage[token])
                if max_phrase_length >= 3:
                    # Significantly reduce weight for tokens in 3+ word phrases
                    token_weight *= 0.3
                elif max_phrase_length == 2:
                    # Moderately reduce for 2-word phrases
                    token_weight *= 0.6
                    
                # But still give some boost for being in a phrase
                token_weight *= 1.2
            
            word_weights[token] += token_weight
            
            # Get context for expansion
            context_words = []
            
            # Add surrounding words as context (window of 2 tokens either side)
            for j in range(max(0, i-2), min(len(query_tokens), i+3)):
                if j != i and query_tokens[j].lower() not in stopwords:
                    context_words.append(query_tokens[j])
            
            # Add words from detected phrases as context (prioritize longer phrases)
            for phrase_info in detected_phrases:
                phrase_words = phrase_info['phrase'].split()
                if token in phrase_words:
                    context_words.extend([w for w in phrase_words if w != token])
            
            # Get contextual expansions (reduce expansion for tokens in long phrases)
            if context_words:
                expansion_factor = expansion_weight
                if token in phrase_coverage:
                    max_phrase_length = max(pc['length'] for pc in phrase_coverage[token])
                    if max_phrase_length >= 3:
                        expansion_factor *= 0.5  # Less expansion for specific phrase tokens
                
                expansions = self.get_contextual_expansions(token, context_words)
                
                for expanded_word, score in expansions:
                    if expanded_word not in stopwords and expanded_word != token:
                        expansion_final_weight = expansion_factor * score
                        word_weights[expanded_word] += expansion_final_weight
        
        return dict(word_weights)

def rank_documents_context_aware(query, tf_idf, word_model, tokenizer, stopwords, 
                                expander=None):
    """
    Rank documents using context-aware query expansion.

    Args:
        query: Raw query string.
        tf_idf: DataFrame whose columns are terms and whose index holds the
            article ids.
        word_model: Word2Vec model, used only to build an expander when none
            is supplied.
        tokenizer: Object with ``word_segment(query)`` returning sentences in
            which multi-word tokens are joined by ``_``.
        stopwords: Container of lower-cased stopwords, forwarded to the
            expander.
        expander: Optional pre-built ``ContextAwareQueryExpander``.

    Returns:
        A ``(ranked, stats)`` tuple. ``ranked`` is a list of
        ``(article_id, similarity)`` pairs sorted by similarity in descending
        order. ``stats`` always has the keys ``original_tokens``,
        ``detected_phrases``, ``matched_terms`` and ``total_terms``, including
        on the early-exit paths where ``ranked`` is empty.
    """
    # Tokenize query (stopwords are kept here; the expander skips them)
    query_tokens = []
    for sentence in tokenizer.word_segment(query):
        query_tokens.extend(w.replace("_", " ").lower() for w in sentence.split())
    
    # Fully-keyed stats so callers can index it on every return path
    stats = {
        'original_tokens': query_tokens,
        'detected_phrases': [],
        'matched_terms': [],
        'total_terms': 0
    }
    
    if not query_tokens:
        return [], stats
    
    # Build the expander only once there is something to expand
    if expander is None:
        expander = ContextAwareQueryExpander(word_model, tf_idf)
    
    # Expand query with context awareness
    word_weights = expander.expand_query(query_tokens, stopwords)
    stats['total_terms'] = len(word_weights)
    
    # Normalize weights
    total_weight = sum(word_weights.values())
    if total_weight == 0:
        return [], stats
    
    # Create and fill the query vector over the TF-IDF vocabulary
    word_list = tf_idf.columns
    query_vector = np.zeros(len(word_list))
    matched_terms = []
    for i, term in enumerate(word_list):
        if term in word_weights:
            query_vector[i] = word_weights[term] / total_weight
            matched_terms.append((term, word_weights[term]))
    
    # Calculate similarity
    cosine_sim = cosine_similarity([query_vector], tf_idf.values)[0]
    
    # Rank documents
    article_ids = tf_idf.index.tolist()
    ranked = sorted(zip(article_ids, cosine_sim), key=lambda x: x[1], reverse=True)
    
    # Statistics for debugging
    stats['detected_phrases'] = (
        expander.detect_query_phrases(query_tokens)
        if hasattr(expander, 'detect_query_phrases') else []
    )
    stats['matched_terms'] = sorted(matched_terms, key=lambda x: x[1], reverse=True)[:10]
    
    return ranked, stats

# Global expander instance (initialize once for efficiency)
_global_expander = None

def get_expander(tf_idf, word_model):
    """Return a cached ``ContextAwareQueryExpander`` for the given inputs.

    The expander is built on first use and reused while the same ``tf_idf``
    and ``word_model`` objects are passed. If either object differs (for
    example because a cell reloaded it), the expander is rebuilt so it never
    serves results from stale data.

    Args:
        tf_idf: TF-IDF DataFrame handed to the expander.
        word_model: Word2Vec model handed to the expander.

    Returns:
        The cached or freshly built expander.
    """
    global _global_expander
    cached = _global_expander
    # Identity comparison: cheap, and it does not require the inputs to
    # support equality.
    is_stale = (
        cached is None
        or getattr(cached, 'tf_idf_matrix', None) is not tf_idf
        or getattr(cached, 'word_model', None) is not word_model
    )
    if is_stale:
        print("Building context-aware expander (one-time setup)...")
        _global_expander = ContextAwareQueryExpander(word_model, tf_idf)
        print(f"Found {len(_global_expander.phrase_vocab)} common phrases")
    return _global_expander

def search_articles_context_aware(query, top_k=10, debug=True):
    """
    Context-aware article search.

    Uses the notebook globals ``df_tf_idf_full``, ``word2vec_model``,
    ``rdrsegmenter``, ``stopwords`` and ``article_collection``.

    Args:
        query: Raw query string.
        top_k: Maximum number of ranked article ids to look up.
        debug: When True, print the tokens, detected phrases and top matched
            terms.

    Returns:
        The matching article documents, best match first.
    """
    expander = get_expander(df_tf_idf_full, word2vec_model)
    
    results, stats = rank_documents_context_aware(
        query, df_tf_idf_full, word2vec_model, rdrsegmenter, stopwords, expander
    )
    
    # Debug output. Read stats with .get() so an empty stats dict (e.g. for
    # an empty query) cannot raise KeyError.
    if debug:
        detected_phrases = stats.get('detected_phrases', [])
        print(f"\nQuery: '{query}'")
        print(f"Original tokens: {stats.get('original_tokens', [])}")
        print(f"Detected phrases: {[p['phrase'] for p in detected_phrases]}")
        print(f"Phrase details: {detected_phrases}")
        print(f"Top matched terms: {stats.get('matched_terms', [])[:5]}")
        print("-" * 50)
    
    top_ids = [article_id for article_id, _score in results[:top_k]]
    if not top_ids:
        return []
    
    # A `$in` lookup returns documents in storage order, not in the order of
    # the id list, so remember each id's rank and re-sort afterwards.
    rank_of = {}
    for position, article_id in enumerate(top_ids):
        rank_of.setdefault(article_id, position)
    
    result_articles = list(article_collection.find({"id": {"$in": top_ids}}))
    result_articles.sort(key=lambda doc: rank_of.get(doc.get("id"), len(top_ids)))
    
    return result_articles

# Example usage:
# results = search_articles_context_aware("tai nạn giao thông")
# results = search_articles_context_aware("bệnh viện nhi đồng") 
# results = search_articles_context_aware("giáo dục mầm non")

In [13]:
# Example usage:
# results = search_articles_context_aware("tai nạn giao thông")
# results = search_articles_context_aware("bệnh viện nhi đồng") 
# results = search_articles_context_aware("giáo dục mầm non")

In [14]:
# Demo: run the context-aware search (debug output enabled by default) and
# print each hit's title and public URL, separated by a rule of 80 dashes.
results = search_articles_context_aware("giáo dục mầm non")
for article in results:
    print(article['title'])
    # 'pageUrl' is concatenated directly after the site root.
    print('https://baosoctrang.org.vn' + article['pageUrl'])
    print('-' * 80)

Building context-aware expander (one-time setup)...
Found 5837 common phrases

Query: 'giáo dục mầm non'
Original tokens: ['giáo dục', 'mầm non']
Detected phrases: ['giáo dục', 'mầm non']
Phrase details: [{'phrase': 'giáo dục', 'positions': (0, 2), 'importance': 0.0017599859007624604, 'frequency': 1195}, {'phrase': 'mầm non', 'positions': (2, 4), 'importance': 0.0003358951851875883, 'frequency': 81}]
Top matched terms: [('giáo dục', 26.25770439309683), ('mầm non', 18.183347464017316), ('tiểu học', 0.16712696850299835), ('mẫu giáo', 0.1639937442541122), ('công lập', 0.15711144804954527)]
--------------------------------------------------
Thành phố Sóc Trăng họp mặt kỷ niệm 42 năm ngày Nhà giáo Việt Nam
https://baosoctrang.org.vn/giao-duc-dao-tao/202411/thanh-pho-soc-trang-hop-mat-ky-niem-42-nam-ngay-nha-giao-viet-nam-9913c13/
--------------------------------------------------------------------------------
Khảo sát tình hình xây dựng và phát triển trường mầm non trên địa bàn TP. Sóc Trăn