In [29]:
import pandas as pd
from tqdm import tqdm
from pymongo import MongoClient
import py_vncorenlp
import re
from collections import Counter
import os
from collections import defaultdict
import numpy as np
from math import log
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import math
import re
import unicodedata
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# os.getenv returns None when MONGO_URI is not set.
# NOTE(review): confirm how MongoClient behaves when given None before relying on it.
MONGO_URI = os.getenv("MONGO_URI")
client = MongoClient(MONGO_URI)
# Database "nlp" and its "article" collection.
db = client["nlp"]

article_collection = db["article"]      

In [34]:
# Fetch every document of the "article_df" collection (empty filter) into a DataFrame.
df_collection = db["article_df"]
list_df = pd.DataFrame(df_collection.find({}))

In [40]:
# The 'articleId' column of list_df as a plain Python list.
# NOTE(review): these values are later merged into the spelling dictionary set —
# confirm that 'articleId' really holds vocabulary terms.
words = list_df['articleId'].to_list()

In [5]:
def load_dict(path):
    """Read a UTF-8 dictionary file into a set of entries.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Duplicate lines collapse into one entry.

    Args:
        path: location of the dictionary file, one entry per line.

    Returns:
        A set of the non-empty, stripped lines.
    """
    entries = set()
    with open(path, encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                entries.add(entry)
    return entries

def ngrams(s, n=2):
    """Return the set of all length-`n` substrings of `s`.

    When `s` is shorter than `n`, the result is the single-element set `{s}`.
    """
    if len(s) < n:
        return {s}
    grams = set()
    for start in range(len(s) - n + 1):
        grams.add(s[start:start + n])
    return grams

def edit_dist(a, b):
    """Levenshtein distance between `a` and `b`, with a length shortcut.

    If the lengths differ by more than one, the function returns 2 right away
    without computing anything (a "more than one edit" sentinel, not the true
    distance). Otherwise it returns the exact insert/delete/substitute
    distance.
    """
    len_a, len_b = len(a), len(b)
    if abs(len_a - len_b) > 1:
        return 2

    # Only the previous DP row is needed to compute the current one.
    prev_row = list(range(len_b + 1))
    for row, ch_a in enumerate(a, start=1):
        cur_row = [row]
        for col, ch_b in enumerate(b, start=1):
            substitute = prev_row[col - 1] + (0 if ch_a == ch_b else 1)
            delete = prev_row[col] + 1
            insert = cur_row[col - 1] + 1
            cur_row.append(min(delete, insert, substitute))
        prev_row = cur_row
    return prev_row[-1]

def spell_correcting(s, dict_path, dictionary=None):
    """Suggest dictionary entries that are exactly one edit away from `s`.

    An entry is returned when all of the following hold:
      * it contains as many spaces as `s` has whitespace-separated tokens
        minus one (i.e. the same number of words),
      * `edit_dist(s, entry) == 1`,
      * it shares at least one character bigram with `s`.

    Args:
        s: the word or phrase to correct.
        dict_path: path of a UTF-8 dictionary file (one entry per line). It is
            read with `load_dict` unless `dictionary` is supplied.
        dictionary: optional pre-loaded collection of entries (for example the
            set returned by `load_dict`). Pass it when correcting many strings
            so the file is not re-read on every call; `dict_path` is then
            ignored.

    Returns:
        A sorted list of matching entries; empty when nothing matches or when
        `s` contains no tokens (in that case the dictionary is not read).
    """
    tokens = s.split()
    if not tokens:
        # No entry can have a space count of -1, so the result is always empty.
        return []

    entries = load_dict(dict_path) if dictionary is None else dictionary
    wanted_spaces = len(tokens) - 1  # loop-invariant, computed once
    s_grams = ngrams(s)
    return sorted(
        entry for entry in entries
        if entry.count(' ') == wanted_spaces
           and edit_dist(s, entry) == 1
           and (s_grams & ngrams(entry))
    )


In [2]:
# Quick check of spell_correcting on a two-token phrase against vietDict.txt.
sentence = "ta nạn"
fixed = spell_correcting(sentence, "vietDict.txt") 
print("Original :", sentence)
print("Corrected:", fixed)  

Original : ta nạn
Corrected: ['tai nạn', 'tị nạn']


In [3]:
from underthesea import word_tokenize

In [4]:
# Show how underthesea's word_tokenize splits `sentence`.
word_tokenize(sentence)

['ta', 'nạn']

In [1]:
import py_vncorenlp
import os
# Remember the working directory so it can be put back after the segmenter is
# built; the relative data paths used elsewhere in this notebook depend on it.
original_cwd = os.getcwd()
try:
    # Word-segmentation-only VnCoreNLP pipeline, with models under ./vncorenlp.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=os.path.join(original_cwd, "vncorenlp"))
finally:
    # Restore the working directory even if construction raises, so a failed
    # cell does not leave the kernel in a different directory.
    os.chdir(original_cwd)

In [2]:
# Build the stopword set: one lower-cased, stripped entry per non-blank line,
# plus the extra token 'sto'.
with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {raw.strip().lower() for raw in f if raw.strip()}
stopwords.add('sto')

In [3]:
from query_processing import beam_search_kenlm, load_vocab_from_file, generate_progressive_suggestions
import kenlm
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Bound detokenize method, used to join a token list back into one string.
detokenize = TreebankWordDetokenizer().detokenize

# KenLM language model loaded from a binary file given by relative path.
kenMlModel = kenlm.Model("vi_model_6gramVinToken.binary")

In [13]:
query = "diễn biến cơn bão số 10"
# NOTE(review): beam_search_kenlm comes from query_processing; beamResult[0][0]
# is presumably the token list of the top-ranked hypothesis — confirm.
beamResult = beam_search_kenlm(query.lower().split(), kenMlModel, force=True)
add_tonal = detokenize(beamResult[0][0])
print(add_tonal)

# Segment into words; the segmenter joins the syllables of a multi-syllable
# word with "_", which is turned back into a space here.
# Comprehensions are used so the loop variables stay local and do not rebind
# notebook-level names such as `words` and `sentence` that other cells read.
segmented = rdrsegmenter.word_segment(add_tonal)
query_tokens = [
    token.replace("_", " ")
    for segmented_sentence in segmented
    for token in segmented_sentence.split()
]
print(query_tokens)

# One list of spelling candidates per query token (empty list when none).
query_tokens_corrected = [
    spell_correcting(token, "vietDict.txt") for token in query_tokens
]

print(query_tokens_corrected)

diễn biến con bảo số 10
['diễn biến', 'con', 'bảo', 'số', '10']
[['diễn tiến'], ['bon', 'co', 'coi', 'cong', 'gon', 'lon', 'non', 'son', 'đon'], ['bả', 'bản', 'bảy', 'cảo', 'hảo', 'rảo', 'sảo', 'tảo', 'xảo', 'đảo', 'ảo'], ['sốc', 'sốt'], []]


In [7]:
# Try a three-token phrase against vietDictNew.txt.
# NOTE(review): vietDictNew.txt is written by a later cell of this notebook, so
# on a fresh top-to-bottom run this cell needs the file to exist already.
spell_correcting('đua ghe ngo', "vietDictNew.txt")

[]

In [55]:
# Read the base dictionary into a set, one entry per line as split by
# str.splitlines (lines are kept verbatim: not stripped, blank lines retained).
with open("vietDict.txt", encoding='utf-8') as f_in:
    current_words = {entry for entry in f_in.read().splitlines()}

In [46]:
# Merge every item of `words` into the dictionary set in one call; items that
# are already present are ignored by the set.
current_words.update(words)

In [51]:
# Write the merged dictionary: entries in sorted order, separated by newlines
# (no trailing newline). The file is only written, so plain 'w' is used rather
# than the read/write 'w+' mode.
with open("vietDictNew.txt", encoding='utf-8', mode='w') as f_out:
    f_out.write('\n'.join(sorted(current_words)))

In [56]:
# Number of entries currently held in the dictionary set.
len(current_words)

23429

In [53]:
# Number of items in the `words` list.
len(words)

31462