In [1]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm

c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.xwydx2ikjw2nmtwsfyngfuwkqu3lytcz.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


# Load data

In [2]:
# Cut-offs k at which top-k retrieval accuracy is measured by every run below.
topk = [5, 10, 20, 50]
# Accumulates one accuracy dict per evaluated retriever; turned into the
# results table at the end of the notebook.
run_results = []

In [3]:
# Build one lawquery engine per distinct 'so_hieu_van_ban' value in the index
# CSV; each engine is loaded from the 'tree.json.gz' under that row's 'path'.
law_df = pd.read_csv('./documents/data.csv')
engines = {}
for path, so_hieu_van_ban in zip(law_df['path'], law_df['so_hieu_van_ban']):
    if so_hieu_van_ban in engines:
        # Only the first row seen for an identifier creates its engine.
        continue
    tree_file = os.path.join('documents', path, 'tree.json.gz')
    engines[so_hieu_van_ban] = lawquery.Engine(tree_file)
print("Number of engines:",len(engines))

# Parallel lists: documents[i], metadatas[i] and ids[i] describe the same node.
documents = []
metadatas = []
ids = []

# Index every node of type 'điều' from every loaded law.
for law_id, law_engine in engines.items():
    for node in law_engine.query(node_type='điều'):
        text = node.name + '\n' + node.content
        documents.append(text)
        metadatas.append({
            'law_id': law_id,
            'node_type': node.node_type,
            'node_id': node.node_id,
        })
        ids.append(node.id)

# Question/answer pairs used for evaluation (one JSON record per line).
qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')

# Disabled alternative: index only the nodes referenced by the answers.
# for index, row in qa_df.iterrows():
#     for answer in row['new_answers']:
#         q = engines[answer['law_id']].query(node_type=answer['node_type'], node_id = answer['node_id'])
#         if len(q)!=0:
#             if q[0].id in ids:
#                 continue
#             documents.append(q[0].content)
#             metadatas.append({'law_id': answer['law_id'], 'node_type': q[0].node_type, 'node_id': q[0].node_id})
#             ids.append(q[0].id)
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 761


# TF-IDF

In [None]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def format_text(text, word_segmentation=False):
    """Clean a piece of text before it is vectorised or tokenised.

    Every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is passed through
    underthesea's ``text_normalize``.

    Args:
        text: The raw string to clean.
        word_segmentation: When true, the normalised text is additionally
            passed through ``word_tokenize(..., format="text")``.

    Returns:
        The cleaned (and optionally word-segmented) string.
    """
    without_punctuation = re.sub(r'\W', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    normalized = text_normalize(single_spaced)
    if not word_segmentation:
        return normalized
    return word_tokenize(normalized, format="text")

In [None]:
def run(word_segmentation):
    """Evaluate TF-IDF + cosine-similarity retrieval on the questions in ``qa_df``.

    Every entry of ``documents`` and every question (``'cauhoi'`` column) is
    preprocessed with ``format_text``. Documents are vectorised with a default
    ``TfidfVectorizer`` and ranked by cosine similarity to the question. A
    question counts as a hit at cut-off ``k`` only when every entry of its
    ``'new_answers'`` is found among the metadata of the ``k`` best documents.

    Args:
        word_segmentation: Forwarded to ``format_text`` for both documents
            and questions.

    Returns:
        dict mapping each ``k`` in ``topk`` to the fraction of questions that
        were hits at that cut-off. The dict is also printed.

    Raises:
        ZeroDivisionError: If ``qa_df`` has no rows.
    """
    corpus = [format_text(doc, word_segmentation) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(corpus)

    # Retrieve enough candidates for the largest cut-off regardless of the
    # order in which the cut-offs are listed (topk[-1] assumed ascending order).
    max_k = max(topk)
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = format_text(row['cauhoi'], word_segmentation)
        query_vector = tfidf.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        # argsort is ascending: reverse it, then keep the max_k best documents.
        related_docs_indices = cosine_similarities.argsort()[::-1][:max_k]
        retrieved = [metadatas[j] for j in related_docs_indices]
        for k in topk:
            if all(r in retrieved[:k] for r in row['new_answers']):
                acc[k] += 1
    # Turn hit counts into accuracies.
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [None]:
# Evaluate TF-IDF without and then with word segmentation, labelling each result.
for use_ws, label in ((False, 'tdidf'), (True, 'tdidf_ws')):
    rr = run(use_ws)
    rr['name'] = label
    run_results.append(rr)

# BM25

In [None]:
from rank_bm25 import BM25Okapi

def run_bm25(word_segmentation,bm25_algo):
    """Evaluate a BM25-style ranker on the questions in ``qa_df``.

    Every entry of ``documents`` and every question (``'cauhoi'`` column) is
    preprocessed with ``format_text`` and split on whitespace into tokens.
    ``bm25_algo`` is instantiated on the tokenised corpus and its
    ``get_scores`` ranks the documents for each question. A question counts
    as a hit at cut-off ``k`` only when every entry of its ``'new_answers'``
    is found among the metadata of the ``k`` best documents.

    Args:
        word_segmentation: Forwarded to ``format_text`` for both documents
            and questions.
        bm25_algo: Callable that takes the tokenised corpus and returns an
            object exposing ``get_scores(tokenized_query)`` (for example
            ``BM25Okapi``).

    Returns:
        dict mapping each ``k`` in ``topk`` to the fraction of questions that
        were hits at that cut-off. The dict is also printed.

    Raises:
        ZeroDivisionError: If ``qa_df`` has no rows.
    """
    docs = [format_text(doc, word_segmentation) for doc in documents]
    # split() with no argument never yields empty-string tokens, unlike
    # split(" ") on text with leading/trailing spaces or on an empty string.
    tokenized_corpus = [doc.split() for doc in docs]
    bm25 = bm25_algo(tokenized_corpus)

    # Retrieve enough candidates for the largest cut-off regardless of the
    # order in which the cut-offs are listed (topk[-1] assumed ascending order).
    max_k = max(topk)
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = format_text(row['cauhoi'], word_segmentation)
        tokenized_query = query.split()
        doc_scores = bm25.get_scores(tokenized_query)
        # argsort is ascending: reverse it, then keep the max_k best documents.
        retrieved = [metadatas[j] for j in doc_scores.argsort()[::-1][:max_k]]
        for k in topk:
            if all(r in retrieved[:k] for r in row['new_answers']):
                acc[k] += 1
    # Turn hit counts into accuracies.
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [None]:
print("BM25Okapi")
# Evaluate BM25Okapi without and then with word segmentation, labelling each result.
for use_ws, label in ((False, 'bm25okapi'), (True, 'bm25okapi_ws')):
    rr = run_bm25(use_ws, BM25Okapi)
    rr['name'] = label
    run_results.append(rr)

# Chroma DB

## With Instructor embeddings

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from InstructorEmbedding import INSTRUCTOR

In [None]:
def run_instructor(model_name):
    """Evaluate dense retrieval with an INSTRUCTOR model and a Chroma collection.

    Each entry of ``documents`` is embedded together with the passage
    instruction and upserted (with ``metadatas`` and ``ids``) into the
    ``law_documents`` collection. Each question (``'cauhoi'`` column of
    ``qa_df``) is embedded with the question instruction and used to query
    the collection. A question counts as a hit at cut-off ``k`` only when
    every entry of its ``'new_answers'`` is found among the metadata of the
    first ``k`` results.

    Side effects: calls ``chroma_client.reset()`` before creating the
    collection and ``chroma_client.persist()`` after filling it, using the
    ``chroma`` persist directory.
    NOTE(review): reset() presumably clears any existing collections -
    confirm against the installed chromadb version.

    Args:
        model_name: Model identifier passed to ``INSTRUCTOR``.

    Returns:
        dict mapping each ``k`` in ``topk`` to the fraction of questions that
        were hits at that cut-off. The dict is also printed.

    Raises:
        ZeroDivisionError: If ``qa_df`` has no rows.
    """
    model = INSTRUCTOR(model_name)
    instructor_for_q = 'Đại diện cho câu hỏi để truy xuất đoạn văn liên quan:'
    instructor_for_r = 'Đại diện cho đoạn văn để truy xuất:'
    # INSTRUCTOR takes [instruction, text] pairs.
    texts_with_instructions = [[instructor_for_r, doc] for doc in documents]
    customized_embeddings = model.encode(texts_with_instructions,show_progress_bar=True)
    chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                        persist_directory="chroma"))
    chroma_client.reset()
    collection = chroma_client.get_or_create_collection(name="law_documents")
    collection.upsert(documents=documents,metadatas=metadatas,ids=ids,embeddings=customized_embeddings.tolist())
    chroma_client.persist()

    # Ask for enough results for the largest cut-off regardless of the order
    # in which the cut-offs are listed (topk[-1] assumed ascending order).
    max_k = max(topk)
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        question = row['cauhoi']
        query_embeddings = model.encode([[instructor_for_q,question]],show_progress_bar=False).tolist()
        result = collection.query(query_embeddings=query_embeddings, n_results=max_k,include=["metadatas"])
        # One query was sent, so its results are the first (only) entry.
        retrieved = result['metadatas'][0]
        for k in topk:
            if all(r in retrieved[:k] for r in row['new_answers']):
                acc[k] += 1
    # Turn hit counts into accuracies.
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [None]:
# rr=run_instructor('hkunlp/instructor-base')
# rr['name'] = 'instructor-base'
# run_results.append(rr)
# rr=run_instructor('hkunlp/instructor-large')
# rr['name'] = 'instructor-large'
# run_results.append(rr)
# rr=run_instructor('hkunlp/instructor-xl')
# rr['name'] = 'instructor-xl'
# run_results.append(rr)

# Save results

In [None]:
# One row per evaluated retriever: its 'name' plus one accuracy column per cut-off.
result_df = pd.DataFrame(run_results)
# Derive the accuracy column labels from topk so the table stays correct when
# the cut-offs change (previously the labels for 5/10/20/50 were hard-coded).
acc_columns = {k: 'Top_' + str(k) + '@acc' for k in topk}
result_df = result_df.rename(columns={'name': 'Name', **acc_columns})
# Column order: name first, then the cut-offs in topk order.
result_df = result_df[['Name', *acc_columns.values()]]
result_df = result_df.round(4)
# Row order: variants without word segmentation first, then the '_ws' variants,
# each group in the order it was run. For the four TF-IDF/BM25 runs this matches
# the former hard-coded reindex([0,2,1,3]), but it no longer drops or
# NaN-fills rows when a different number of runs is collected.
is_ws = result_df['Name'].str.endswith('_ws', na=False)
result_df = pd.concat([result_df[~is_ws], result_df[is_ws]])
result_df.to_csv('../document/data/retrieval_result.csv',index=False)
# result_df.to_csv('retrieval_result_all.csv',index=False)
result_df