In [10]:
# !pip install ./law_query --force-reinstall

In [11]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm

In [12]:
# Re-read the previously saved retrieval scores, round the score columns
# (top5, top10, top20, top50, top100) to 4 decimal places and write the
# rounded table back to the very same file, then preview it.
df = pd.read_csv('../document/data/retrieval_result.csv').round(4)
df.to_csv('../document/data/retrieval_result.csv', index=False)
df.head()

Unnamed: 0,name,top5,top10,top20,top50,top100
0,tdidf,0.1,0.1869,0.3205,0.5022,0.661
1,bm25l,0.0099,0.0291,0.0686,0.1939,0.4022
2,bm25plus,0.066,0.1273,0.2381,0.4144,0.6077
3,bm25okapi,0.0653,0.1348,0.2348,0.4243,0.6114
4,tdidf_ws,0.1021,0.1878,0.3207,0.5088,0.6779


# Load data

In [13]:
# Cut-off ranks at which retrieval accuracy is reported.
topk = [5, 10, 20, 50, 100]
# One accuracy dict per evaluated retriever; the evaluation cells append to it.
run_results = list()

In [14]:
# Build one lawquery engine per distinct document number (`so_hieu_van_ban`).
# When the same number appears on several rows, only the first row's path is used.
law_df = pd.read_csv('./documents/data.csv')
engines = {}
for path, so_hieu_van_ban in zip(law_df['path'], law_df['so_hieu_van_ban']):
    if so_hieu_van_ban in engines:
        continue
    tree_file = os.path.join('documents', path, 'tree.json.gz')
    engines[so_hieu_van_ban] = lawquery.Engine(tree_file)
print("Number of engines:",len(engines))

# Parallel lists describing the retrieval corpus: text, metadata and id per node.
documents = []
metadatas = []
ids = []

# Index every node of type 'điều' from every loaded law; the indexed text is
# the node name followed by its content.
for law_id, engine in engines.items():
    for node in engine.query(node_type='điều'):
        documents.append(node.name+'\n'+node.content)
        metadatas.append({'law_id': law_id, 'node_type': node.node_type, 'node_id': node.node_id})
        ids.append(node.id)

# Questions with their expected answers, one JSON record per line.
qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')

# Alternative corpus (disabled): index only the nodes referenced by an answer.
# for index, row in qa_df.iterrows():
#     for answer in row['new_answers']:
#         q = engines[answer['law_id']].query(node_type=answer['node_type'], node_id = answer['node_id'])
#         if len(q)!=0:
#             if q[0].id in ids:
#                 continue
#             documents.append(q[0].content)
#             metadatas.append({'law_id': answer['law_id'], 'node_type': q[0].node_type, 'node_id': q[0].node_id})
#             ids.append(q[0].id)
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 761


# TF-IDF

In [15]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def format_text(text,word_segmentation=False):
    """Clean a string before it is vectorised or tokenised.

    Every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is passed through
    underthesea's ``text_normalize``.

    Args:
        text: Raw document or question text.
        word_segmentation: When true, the normalised string is additionally
            passed through ``word_tokenize(..., format="text")``.

    Returns:
        The cleaned (and optionally word-segmented) string.
    """
    without_punctuation = re.sub(r'\W', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    normalized = text_normalize(single_spaced)
    if not word_segmentation:
        return normalized
    return word_tokenize(normalized, format="text")

In [16]:
def run(word_segmentation):
    """Measure top-k accuracy of TF-IDF retrieval over the global corpus.

    A TF-IDF model is fitted on the cleaned `documents`; each question in
    `qa_df` is cleaned the same way and documents are ranked by cosine
    similarity to it. A question counts as a hit at cut-off k only when every
    entry of its `new_answers` is equal to one of the first k retrieved
    metadata dicts.

    Args:
        word_segmentation: Forwarded to `format_text` for both documents and
            questions.

    Returns:
        Dict mapping each cut-off in `topk` to the fraction of questions that
        were hits at that cut-off (0.0 for every cut-off when `qa_df` is empty).
    """
    # Rank as deep as the largest requested cut-off instead of a fixed 100,
    # so changing `topk` can never evaluate against a truncated ranking.
    max_k = max(topk)
    corpus = [format_text(doc, word_segmentation) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(corpus)
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = format_text(row['cauhoi'], word_segmentation)
        query_vector = tfidf.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        # argsort is ascending; reverse it and keep the best `max_k` documents.
        best_indices = cosine_similarities.argsort()[::-1][:max_k]
        retrieved = [metadatas[j] for j in best_indices]
        # NOTE(review): a question whose `new_answers` is empty is counted as a
        # hit at every cut-off, because all() of nothing is True.
        for k in topk:
            if all(answer in retrieved[:k] for answer in row['new_answers']):
                acc[k] += 1
    for k in topk:
        # Guard against an empty question set instead of dividing by zero.
        acc[k] = acc[k] / to_run if to_run else 0.0
    print(acc)
    return acc

In [17]:
# Evaluate TF-IDF twice, on plain text and on word-segmented text, and record
# each accuracy dict under its own name.
for word_segmentation, run_name in ((False, 'tdidf'), (True, 'tdidf_ws')):
    rr = run(word_segmentation)
    rr['name'] = run_name
    run_results.append(rr)

100%|██████████| 4259/4259 [00:13<00:00, 307.64it/s]


{5: 0.10542380840572904, 10: 0.2002817562808171, 20: 0.343038271894811, 50: 0.522423104015027, 100: 0.6816154026766846}


100%|██████████| 4259/4259 [00:47<00:00, 90.09it/s] 

{5: 0.11763324724113641, 10: 0.20380370979103077, 20: 0.340455505987321, 50: 0.5268842451279643, 100: 0.6980511857243484}





# BM25

In [18]:
from rank_bm25 import BM25Okapi,BM25L,BM25Plus

def run_bm25(word_segmentation,bm25_algo):
    """Measure top-k accuracy of a BM25 ranker over the global corpus.

    The cleaned `documents` are split on whitespace and handed to `bm25_algo`
    to build the index; each question in `qa_df` is cleaned and split the same
    way and scored against every document. A question counts as a hit at
    cut-off k only when every entry of its `new_answers` is equal to one of
    the first k retrieved metadata dicts.

    Args:
        word_segmentation: Forwarded to `format_text` for both documents and
            questions.
        bm25_algo: Callable that takes the tokenised corpus and returns an
            object with a `get_scores(tokenized_query)` method (here one of
            the rank_bm25 classes).

    Returns:
        Dict mapping each cut-off in `topk` to the fraction of questions that
        were hits at that cut-off (0.0 for every cut-off when `qa_df` is empty).
    """
    # Rank as deep as the largest requested cut-off instead of a fixed 100,
    # so changing `topk` can never evaluate against a truncated ranking.
    max_k = max(topk)
    docs = [format_text(doc, word_segmentation) for doc in documents]
    # split() without an argument never yields empty-string tokens, which
    # split(" ") does for leading, trailing or repeated spaces.
    tokenized_corpus = [doc.split() for doc in docs]
    bm25 = bm25_algo(tokenized_corpus)
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = format_text(row['cauhoi'], word_segmentation)
        tokenized_query = query.split()
        doc_scores = bm25.get_scores(tokenized_query)
        # argsort is ascending; reverse it and keep the best `max_k` documents.
        best_indices = doc_scores.argsort()[::-1][:max_k]
        retrieved = [metadatas[j] for j in best_indices]
        # NOTE(review): a question whose `new_answers` is empty is counted as a
        # hit at every cut-off, because all() of nothing is True.
        for k in topk:
            if all(answer in retrieved[:k] for answer in row['new_answers']):
                acc[k] += 1
    for k in topk:
        # Guard against an empty question set instead of dividing by zero.
        acc[k] = acc[k] / to_run if to_run else 0.0
    print(acc)
    return acc

In [19]:
# Evaluate every BM25 variant twice (plain text, then word-segmented text) and
# record each accuracy dict under its own name.
bm25_variants = (
    ("BM25Okapi", BM25Okapi, 'bm25okapi', 'bm25okapi_ws'),
    ("BM25L", BM25L, 'bm25l', 'bm25l_ws'),
    ("BM25Plus", BM25Plus, 'bm25plus', 'bm25plus_ws'),
)
for label, algo, plain_name, segmented_name in bm25_variants:
    print(label)
    for word_segmentation, run_name in ((False, plain_name), (True, segmented_name)):
        rr = run_bm25(word_segmentation, algo)
        rr['name'] = run_name
        run_results.append(rr)
print("Done")

BM25Okapi


100%|██████████| 4259/4259 [01:30<00:00, 47.08it/s]


{5: 0.06785630429678328, 10: 0.1387649683024184, 20: 0.24465837050950928, 50: 0.4371918290678563, 100: 0.6266729279173515}


100%|██████████| 4259/4259 [01:40<00:00, 42.32it/s]


{5: 0.09227518196759803, 10: 0.168818971589575, 20: 0.2866870157313923, 50: 0.46748062925569384, 100: 0.6435783047663771}
BM25L


 13%|█▎        | 536/4259 [00:14<01:39, 37.50it/s]


KeyboardInterrupt: 

# Chroma DB

## With Instructor embeddings

In [22]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from InstructorEmbedding import INSTRUCTOR

In [23]:
def run_instructor(model_name):
    """Measure top-k accuracy of dense retrieval with an INSTRUCTOR model.

    All `documents` are embedded with a document instruction and stored in a
    Chroma collection; each question in `qa_df` is embedded with a question
    instruction and used to query that collection. A question counts as a hit
    at cut-off k only when every entry of its `new_answers` is equal to one of
    the first k returned metadata dicts.

    Args:
        model_name: Model identifier passed to `INSTRUCTOR(...)`.

    Returns:
        Dict mapping each cut-off in `topk` to the fraction of questions that
        were hits at that cut-off (0.0 for every cut-off when `qa_df` is empty).
    """
    model = INSTRUCTOR(model_name)
    # Vietnamese instructions; roughly "Represent the question for retrieving
    # relevant documents:" and "Represent the document for retrieval:".
    instructor_for_q = 'Đại diện cho câu hỏi để truy xuất văn bản liên quan:'
    instructor_for_r = 'Đại diện cho văn bản để truy xuất:'
    texts_with_instructions = [[instructor_for_r, doc] for doc in documents]
    customized_embeddings = model.encode(texts_with_instructions,show_progress_bar=True)
    # NOTE(review): this Settings signature looks like the older Chroma client
    # API; confirm it matches the installed chromadb version.
    chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                        persist_directory="chroma"))
    # Start from an empty store on every call, then load this model's embeddings.
    chroma_client.reset()
    collection = chroma_client.get_or_create_collection(name="law_documents")
    collection.upsert(documents=documents,metadatas=metadatas,ids=ids,embeddings=customized_embeddings.tolist())
    chroma_client.persist()
    # Ask for as many neighbours as the largest cut-off needs instead of a
    # fixed 100, but never more than the collection holds (some Chroma
    # versions may reject larger requests; TODO confirm for the installed one).
    n_results = min(max(topk), collection.count())
    to_run = len(qa_df)
    acc = {k: 0 for k in topk}
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        question = row['cauhoi']
        query_embeddings = model.encode([[instructor_for_q,question]],show_progress_bar=False).tolist()
        result = collection.query(query_embeddings=query_embeddings, n_results=n_results,include=["metadatas"])
        # One query embedding was sent, so its hits are the first (only) entry.
        retrieved = result['metadatas'][0]
        # NOTE(review): a question whose `new_answers` is empty is counted as a
        # hit at every cut-off, because all() of nothing is True.
        for k in topk:
            if all(answer in retrieved[:k] for answer in row['new_answers']):
                acc[k] += 1
    for k in topk:
        # Guard against an empty question set instead of dividing by zero.
        acc[k] = acc[k] / to_run if to_run else 0.0
    print(acc)
    return acc

In [24]:
# Evaluate the three INSTRUCTOR model sizes in turn and record each accuracy
# dict under its own name.
instructor_models = (
    ('hkunlp/instructor-base', 'instructor-base'),
    ('hkunlp/instructor-large', 'instructor-large'),
    ('hkunlp/instructor-xl', 'instructor-xl'),
)
for model_name, run_name in instructor_models:
    rr = run_instructor(model_name)
    rr['name'] = run_name
    run_results.append(rr)

Downloading (…)62736/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)15e6562736/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)e6562736/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)62736/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading (…)6562736/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Collect the per-retriever accuracy dicts into one table with a `name`
# column followed by one `top<k>` column per cut-off, save it and display it.
# Column names are derived from `topk` so the table follows the configured
# cut-offs, and the rename is a single non-inplace call.
topk_columns = {k: 'top'+str(k) for k in topk}
result_df = pd.DataFrame(run_results).rename(columns=topk_columns)
result_df = result_df[['name'] + list(topk_columns.values())]
result_df.to_csv('result.csv',index=False)
result_df