In [1]:
import pandas as pd
import re
import os
import json
import pickle
from tqdm import tqdm

In [2]:
with open('./law_dict.json','r') as f:
    dg = json.loads(f.read())
dg = {k.lower(): v for k, v in dg.items()}
clean_answers = pd.read_json('./clean_answers.jsonl',lines=True)

In [3]:
# drop invalid label
drop_idx = set()
for row in clean_answers.itertuples():
    for ans in row.label:
        try:
            X = dg[ans['law_id']][ans['dieu']]
        except:
            drop_idx.add(row.Index)
clean_answers = clean_answers.drop(index=drop_idx)

In [4]:
clean_answers = clean_answers[clean_answers['label'].map(len)!=0]

In [5]:
clean_answers.head()


Unnamed: 0,short_title,linhvuc,cauhoi,traloi,url,askDate,answerDate,answers,label
0,Thẻ BHYT hết hạn gia hạn lại,Bảo hiểm y tế,\n\nNội dung câu hỏi:\n\n\nKính gửi Quý cơ qua...,\n\nCâu trả lời:\n\n\nQua rà soát dữ liệu mã s...,https://baohiemxahoi.gov.vn/hoidap/pages/defau...,23/02/2023,03/03/2023,"[146/2018/nđ-cp > điều 12 > khoản 5, 2089/vbhn...","[{'law_id': '146/2018/nđ-cp', 'dieu': '12', 'i..."
2,Khi nào tôi nhận được tiền thai sản,"Ốm đau, thai sản",\n\nNội dung câu hỏi:\n\n\nTôi tên Vũ Thị Tuyế...,"\n\nCâu trả lời:\n\n\nTại Điều 102, Luật Bảo h...",https://baohiemxahoi.gov.vn/hoidap/pages/defau...,22/02/2023,24/02/2023,[58/2014/qh13 > điều 102],"[{'law_id': '58/2014/qh13', 'dieu': '102', 'id..."
3,LÃNH BHXH 1 LẦN,BHXH tự nguyên,\n\nNội dung câu hỏi:\n\n\nMSBH: 7911379604\nT...,\n\nCâu trả lời:\n\n\n* Về điều kiện hưởng BHX...,https://baohiemxahoi.gov.vn/hoidap/pages/defau...,01/03/2023,10/03/2023,[115/2015/nđ-cp > điều 8 > khoản 1],"[{'law_id': '115/2015/nđ-cp', 'dieu': '8', 'id..."
4,Hỗ trợ về tài khoản VssID,Sổ BHXH - Thẻ BHYT,\n\nNội dung câu hỏi:\n\n\nTôi tên là: Trần Th...,\n\nCâu trả lời:\n\n\nQua kiểm tra theo số CCC...,https://baohiemxahoi.gov.vn/hoidap/pages/defau...,22/02/2023,03/03/2023,"[2089/vbhn-bhxh > điều 27 > khoản 143, 2089/vb...","[{'law_id': '2089/vbhn-bhxh', 'dieu': '27', 'i..."
5,Việc cập nhập app Vssid,"Thu - nộp BHXH, BHYT, BHTN",\n\nNội dung câu hỏi:\n\n\nTôi tên trần tuyết ...,\n\nCâu trả lời:\n\n\n1. Theo khoản 5 Điều 3 q...,https://baohiemxahoi.gov.vn/hoidap/pages/defau...,19/02/2023,24/02/2023,"[58/2014/qh13 > điều 3 > khoản 5, 58/2014/qh13...","[{'law_id': '58/2014/qh13', 'dieu': '3', 'id':..."


In [6]:
clean_answers.to_json('final_answers.jsonl',lines=True,force_ascii=False,orient='records')

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import rank_bm25

In [9]:
import re
import py_vncorenlp
vncore_model = py_vncorenlp.VnCoreNLP(save_dir='C:/Users/ngoph/Desktop/luanvantotnghiep/code/paper/VnCoreNLP')

def format_text(text,word_segmentation=False):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    if word_segmentation:
        text = vncore_model.word_segment(text)
        text = ' '.join(text)
    return text

In [11]:
TOP=[5,10,20,50,100]
acc={
    'bm25':{},
    'tdidf':{},
    'bm25_ws':{},
    'tdidf_ws':{},
}
for top in TOP:
    acc['tdidf'][top]=0
    acc['bm25'][top]=0
    acc['bm25_ws'][top]=0
    acc['tdidf_ws'][top]=0

In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

word_segment=True

ids = []
contents = []
for doc_key in dg:
    for pag_key in dg[doc_key]:
        id ='{}_d{}'.format(doc_key,pag_key)
        content = dg[doc_key][pag_key]
        ids.append(id)
        contents.append(format_text(content,word_segment))
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(contents)
bm25 = rank_bm25.BM25Okapi([doc.split(" ") for doc in contents])

for sample in tqdm(clean_answers.itertuples(),total=len(clean_answers)):
    # tdidf
    q_vec = vectorizer.transform([format_text(sample.cauhoi,word_segment)])
    cosine_sim = cosine_similarity(q_vec, doc_vectors)
    top_indexes = np.argsort(cosine_sim.flatten())[::-1]
    # bm25
    query = format_text(sample.cauhoi,word_segment).split(' ')
    bm25_top_indexes = bm25.get_top_n(query,ids,n=100)

    for top in TOP:
        top_ids = [ids[i] for i in top_indexes][:top]
        acc['tdidf'+ ('_ws' if word_segment else '')][top]+=all(x['id'] in top_ids for x in sample.label)

        top_ids = bm25_top_indexes[:top]
        acc['bm25'+('_ws' if word_segment else '')][top]+=all(x['id'] in top_ids for x in sample.label)

100%|██████████| 4227/4227 [02:40<00:00, 26.32it/s]


In [17]:
acc

{'bm25': {5: 382, 10: 683, 20: 1148, 50: 1897, 100: 2607},
 'tdidf': {5: 430, 10: 769, 20: 1372, 50: 2139, 100: 2801},
 'bm25_ws': {5: 419, 10: 739, 20: 1203, 50: 1902, 100: 2639},
 'tdidf_ws': {5: 410, 10: 734, 20: 1294, 50: 2034, 100: 2834}}

In [18]:
for key in acc:
    for key2 in acc[key]:
        acc[key][key2]/=len(clean_answers)
acc

{'bm25': {5: 0.09037142181215993,
  10: 0.16158031700969955,
  20: 0.271587414241779,
  50: 0.4487816418263544,
  100: 0.6167494677075941},
 'tdidf': {5: 0.10172699313934232,
  10: 0.18192571563756801,
  20: 0.3245800804352969,
  50: 0.5060326472675657,
  100: 0.662644901821623},
 'bm25_ws': {5: 0.09912467471019636,
  10: 0.174828483558079,
  20: 0.28459900638750885,
  50: 0.44996451383960256,
  100: 0.6243198485923823},
 'tdidf_ws': {5: 0.09699550508634966,
  10: 0.17364561154483085,
  20: 0.3061272770286255,
  50: 0.48119233498935415,
  100: 0.6704518571090607}}

In [23]:
df = pd.DataFrame(acc)
print(df.rename_axis('TopN').reset_index().to_markdown(index=False))

|   TopN |      bm25 |    tdidf |   bm25_ws |   tdidf_ws |
|-------:|----------:|---------:|----------:|-----------:|
|      5 | 0.0903714 | 0.101727 | 0.0991247 |  0.0969955 |
|     10 | 0.16158   | 0.181926 | 0.174828  |  0.173646  |
|     20 | 0.271587  | 0.32458  | 0.284599  |  0.306127  |
|     50 | 0.448782  | 0.506033 | 0.449965  |  0.481192  |
|    100 | 0.616749  | 0.662645 | 0.62432   |  0.670452  |
