In [3]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm

# Load data

In [41]:
topk = [5,10,20,50,100]
run_results = []

In [43]:
engines = {}
law_df = pd.read_csv('./documents/data.csv')
for i in range(len(law_df)):
    path = law_df['path'][i]
    so_hieu_van_ban = law_df['so_hieu_van_ban'][i]
    if so_hieu_van_ban not in engines:
        engines[so_hieu_van_ban] = lawquery.Engine(
            os.path.join('documents',path, 'tree.json.gz'),
        )
print("Number of engines:",len(engines))

documents = []
metadatas = []
ids = []

# take all
for engine in engines:
    results = engines[engine].query(node_type='điều')
    for result in results:
        documents.append(result.content)
        metadatas.append({'law_id': engine, 'node_type': result.node_type, 'node_id': result.node_id})
        ids.append(result.id)

qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')

# take only use
# for index, row in qa_df.iterrows():
#     for answer in row['new_answers']:
#         q = engines[answer['law_id']].query(node_type=answer['node_type'], node_id = answer['node_id'])
#         if len(q)!=0:
#             if q[0].id in ids:
#                 continue
#             documents.append(q[0].content)
#             metadatas.append({'law_id': answer['law_id'], 'node_type': q[0].node_type, 'node_id': q[0].node_id})
#             ids.append(q[0].id)
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 761


# TDIDF

In [44]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def format_text(text,word_segmentation=False):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text_normalize(text)
    if word_segmentation:
        text = word_tokenize(text, format="text")
    return text

In [74]:
def run(word_segmentation):
    tdidf_docs= [format_text(doc,word_segmentation) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(tdidf_docs)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        query_vector = tfidf.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-100:-1]
        result = { 'metadatas':[]}
        for j in related_docs_indices:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    # print as table
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [75]:
run(False)
run(True)

100%|██████████| 4259/4259 [00:08<00:00, 521.38it/s]


{5: 0.10002347969006808, 10: 0.18689833294200517, 20: 0.3204977694294435, 50: 0.5022305705564687, 100: 0.6609532754167645}


100%|██████████| 4259/4259 [00:25<00:00, 165.98it/s]

{5: 0.10213665179619628, 10: 0.1878375205447288, 20: 0.3207325663301244, 50: 0.5088048837755341, 100: 0.6778586522657901}





{5: 0.10213665179619628,
 10: 0.1878375205447288,
 20: 0.3207325663301244,
 50: 0.5088048837755341,
 100: 0.6778586522657901}

# BM 25

In [82]:
from rank_bm25 import BM25Okapi,BM25L,BM25Plus

def run_bm25(word_segmentation,bm25_algo):
    docs= [format_text(doc,word_segmentation) for doc in documents]
    tokenized_corpus = [doc.split(" ") for doc in docs]
    bm25 = bm25_algo(tokenized_corpus)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        tokenized_query = query.split(" ")
        doc_scores = bm25.get_scores(tokenized_query)
        result = { 'metadatas':[]}
        for j in doc_scores.argsort()[:-100:-1]:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [None]:
print("BM25Okapi")
run_bm25(False,BM25Okapi)
run_bm25(True,BM25Okapi)
print("BM25L")
run_bm25(False,BM25L)
run_bm25(True,BM25L)
print("BM25Plus")
run_bm25(False,BM25Plus)
run_bm25(True,BM25Plus)
print("Done")

# Chroma DB

In [58]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

ef = embedding_functions.InstructorEmbeddingFunction(device="cuda",model_name="hkunlp/instructor-base",
# # instruction='Represent the legislation question for retrieving supporting legislation article:'
# # instruction='Represent the legislation article for retrieval:'
)

chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                    persist_directory="chroma"))
chroma_client.reset()
collection = chroma_client.get_or_create_collection(name="law_documents",embedding_function=ef)

collection.upsert(documents=documents,metadatas=metadatas,ids=ids)
chroma_client.persist()

load INSTRUCTOR_Transformer
max_seq_length  512


True

In [59]:
to_run = len(qa_df)
acc ={}
for k in topk:
    acc[k] = 0
for i in tqdm(range(to_run)):
    row = qa_df.iloc[i]
    question = row['cauhoi']
    result = collection.query(query_texts=question, n_results=100,include=["metadatas"])
    for k in topk:
        if all(r in result['metadatas'][0][:k] for r in row['new_answers']):
            acc[k] += 1
# to tuple (name, top5, top10, top20, top50, top100)
this_run= ["chroma"]
for k in topk:
    this_run.append(acc[k]/to_run)
run_results.append(tuple(this_run))
this_run

  8%|▊         | 342/4259 [00:08<01:32, 42.45it/s]
