In [1]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm

c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.xwydx2ikjw2nmtwsfyngfuwkqu3lytcz.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


# Load data

In [2]:
topk = [5,10,20,50,100]
run_results = []

In [3]:
engines = {}
law_df = pd.read_csv('./documents/data.csv')
for i in range(len(law_df)):
    path = law_df['path'][i]
    so_hieu_van_ban = law_df['so_hieu_van_ban'][i]
    if so_hieu_van_ban not in engines:
        engines[so_hieu_van_ban] = lawquery.Engine(
            os.path.join('documents',path, 'tree.json.gz'),
        )
print("Number of engines:",len(engines))

documents = []
metadatas = []
ids = []

# take all
for engine in engines:
    results = engines[engine].query(node_type='điều')
    for result in results:
        documents.append(result.content)
        metadatas.append({'law_id': engine, 'node_type': result.node_type, 'node_id': result.node_id})
        ids.append(result.id)

qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')

# take only use
# for index, row in qa_df.iterrows():
#     for answer in row['new_answers']:
#         q = engines[answer['law_id']].query(node_type=answer['node_type'], node_id = answer['node_id'])
#         if len(q)!=0:
#             if q[0].id in ids:
#                 continue
#             documents.append(q[0].content)
#             metadatas.append({'law_id': answer['law_id'], 'node_type': q[0].node_type, 'node_id': q[0].node_id})
#             ids.append(q[0].id)
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 761


# TDIDF

In [4]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def format_text(text,word_segmentation=False):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text_normalize(text)
    if word_segmentation:
        text = word_tokenize(text, format="text")
    return text

In [5]:
def run(word_segmentation):
    tdidf_docs= [format_text(doc,word_segmentation) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(tdidf_docs)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        query_vector = tfidf.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-100:-1]
        result = { 'metadatas':[]}
        for j in related_docs_indices:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    # print as table
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [6]:
rr= run(False)
rr['name'] = 'tdidf'
run_results.append(rr)
rr= run(True)
rr['name'] = 'tdidf_ws'
run_results.append(rr)

100%|██████████| 4259/4259 [00:08<00:00, 504.32it/s]


{5: 0.10002347969006808, 10: 0.18689833294200517, 20: 0.3204977694294435, 50: 0.5022305705564687, 100: 0.6609532754167645}


100%|██████████| 4259/4259 [00:23<00:00, 180.92it/s]

{5: 0.10213665179619628, 10: 0.1878375205447288, 20: 0.3207325663301244, 50: 0.5088048837755341, 100: 0.6778586522657901}





# BM 25

In [7]:
from rank_bm25 import BM25Okapi,BM25L,BM25Plus

def run_bm25(word_segmentation,bm25_algo):
    docs= [format_text(doc,word_segmentation) for doc in documents]
    tokenized_corpus = [doc.split(" ") for doc in docs]
    bm25 = bm25_algo(tokenized_corpus)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        tokenized_query = query.split(" ")
        doc_scores = bm25.get_scores(tokenized_query)
        result = { 'metadatas':[]}
        for j in doc_scores.argsort()[:-100:-1]:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [12]:
print("BM25Okapi")
rr=run_bm25(False,BM25Okapi)
rr['name'] = 'bm25okapi'
run_results.append(rr)
rr=run_bm25(True,BM25Okapi)
rr['name'] = 'bm25okapi_ws'
run_results.append(rr)

print("BM25L")
rr=run_bm25(False,BM25L)
rr['name'] = 'bm25l'
run_results.append(rr)
rr=run_bm25(True,BM25L)
rr['name'] = 'bm25l_ws'
run_results.append(rr)

print("BM25Plus")
rr=run_bm25(False,BM25Plus)
rr['name'] = 'bm25plus'
run_results.append(rr)
rr=run_bm25(True,BM25Plus)
rr['name'] = 'bm25plus_ws'
run_results.append(rr)
print("Done")

BM25Okapi


100%|██████████| 4259/4259 [00:54<00:00, 77.51it/s] 


{5: 0.06527353838929326, 10: 0.1347734209908429, 20: 0.23479690068091102, 50: 0.4242779995304062, 100: 0.6114111293730923}


100%|██████████| 4259/4259 [00:59<00:00, 71.66it/s]


{5: 0.08804883775534163, 10: 0.16130547076778587, 20: 0.27588635830007047, 50: 0.45362761211552005, 100: 0.6344212256398215}
BM25L


100%|██████████| 4259/4259 [01:00<00:00, 70.74it/s] 


{5: 0.009861469828598262, 10: 0.029114815684432964, 20: 0.06856069499882601, 50: 0.19394223996243248, 100: 0.40220709086640055}


100%|██████████| 4259/4259 [00:55<00:00, 76.62it/s] 


{5: 0.010800657431321907, 10: 0.03005400328715661, 20: 0.07396102371448697, 50: 0.20356891289034984, 100: 0.41864287391406435}
BM25Plus


100%|██████████| 4259/4259 [00:59<00:00, 71.60it/s]


{5: 0.065977929091336, 10: 0.12725992016905377, 20: 0.23808405729044377, 50: 0.4144165297018079, 100: 0.6076543789621976}


100%|██████████| 4259/4259 [01:01<00:00, 69.66it/s] 

{5: 0.0852312749471707, 10: 0.15449636064803945, 20: 0.2702512326837286, 50: 0.44658370509509276, 100: 0.6311340690302888}
Done





# Chroma DB

## WITH  Instructor Embedding

In [13]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from InstructorEmbedding import INSTRUCTOR

  from tqdm.autonotebook import trange


In [14]:
def run_instructor(model_name):
    model = INSTRUCTOR(model_name)
    instructor_for_q = 'Đại diện cho câu hỏi pháp luật để truy xuất điều luật trong văn bản pháp luật:'
    instructor_for_r = 'Đại diện cho điều luật trong văn bản pháp luật để truy xuất:'
    texts_with_instructions = []
    for doc in documents:
        texts_with_instructions.append([instructor_for_r,doc])
    customized_embeddings = model.encode(texts_with_instructions,show_progress_bar=True)
    chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                        persist_directory="chroma"))
    chroma_client.reset()
    collection = chroma_client.get_or_create_collection(name="law_documents")
    collection.upsert(documents=documents,metadatas=metadatas,ids=ids,embeddings=customized_embeddings.tolist())
    chroma_client.persist()
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        question = row['cauhoi']
        query_embeddings = model.encode([[instructor_for_q,question]],show_progress_bar=False).tolist()
        result = collection.query(query_embeddings=query_embeddings, n_results=100,include=["metadatas"])
        for k in topk:
            if all(r in result['metadatas'][0][:k] for r in row['new_answers']):
                acc[k] += 1
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [15]:
rr=run_instructor('hkunlp/instructor-base')
rr['name'] = 'instructor-base'
run_results.append(rr)
rr=run_instructor('hkunlp/instructor-large')
rr['name'] = 'instructor-large'
run_results.append(rr)
rr=run_instructor('hkunlp/instructor-xl')
rr['name'] = 'instructor-xl'
run_results.append(rr)

load INSTRUCTOR_Transformer
max_seq_length  512


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 4259/4259 [01:56<00:00, 36.52it/s]


{5: 0.014557407842216482, 10: 0.02582765907490021, 20: 0.04578539563277765, 50: 0.09485794787508806, 100: 0.17140173749706503}
load INSTRUCTOR_Transformer
max_seq_length  512


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

100%|██████████| 4259/4259 [05:04<00:00, 13.99it/s]


{5: 0.008217891523831886, 10: 0.017140173749706503, 20: 0.03733270720826485, 50: 0.10518901150504813, 100: 0.20239492838694528}
load INSTRUCTOR_Transformer
max_seq_length  512


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

100%|██████████| 4259/4259 [15:56<00:00,  4.45it/s]  

{5: 0.020427330359239257, 10: 0.03639351960554121, 20: 0.06691711669405964, 50: 0.13970415590514204, 100: 0.24583235501291384}





In [16]:
result_df = pd.DataFrame(run_results)
# rename col topk
for k in topk:
    result_df.rename(columns={k: 'top'+str(k)}, inplace=True)
result_df = result_df[['name','top5','top10','top20','top50','top100']]
result_df.to_csv('result.csv',index=False)
result_df

Unnamed: 0,name,top5,top10,top20,top50,top100
0,tdidf,0.100023,0.186898,0.320498,0.502231,0.660953
1,tdidf_ws,0.102137,0.187838,0.320733,0.508805,0.677859
2,bm25okapi,0.065274,0.134773,0.234797,0.424278,0.611411
3,bm25okapi_ws,0.088049,0.161305,0.275886,0.453628,0.634421
4,bm25l,0.009861,0.029115,0.068561,0.193942,0.402207
5,bm25l_ws,0.010801,0.030054,0.073961,0.203569,0.418643
6,bm25plus,0.065978,0.12726,0.238084,0.414417,0.607654
7,bm25plus_ws,0.085231,0.154496,0.270251,0.446584,0.631134
8,instructor-base,0.014557,0.025828,0.045785,0.094858,0.171402
9,instructor-large,0.008218,0.01714,0.037333,0.105189,0.202395
