In [2]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(        
    chunk_size = 3000,
    chunk_overlap  = 0,
    length_function = len,
)

# Load data

In [4]:
topk = [5,10,20,50]
run_results = []

In [5]:
engines = {}
law_df = pd.read_csv('./documents/data.csv')
for i in range(len(law_df)):
    path = law_df['path'][i]
    so_hieu_van_ban = law_df['so_hieu_van_ban'][i]
    if so_hieu_van_ban not in engines:
        engines[so_hieu_van_ban] = lawquery.Engine(
            os.path.join('documents',path, 'tree.json.gz'),
        )
print("Number of engines:",len(engines))


documents = []
metadatas = []
ids = []

for engine in engines:
    results = engines[engine].query(node_type='điều')
    for result in results:
        for i,chunk in enumerate(text_splitter.split_text(result.content)):
            documents.append(result.name+'\n'+chunk)
            metadatas.append({'law_id': engine, 'node_type': result.node_type, 'node_id': result.node_id,
                              'tenvb':engines[engine].metadata['ten_van_ban']})
            ids.append(result.id+str(i))

qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')

# take only use
# for index, row in qa_df.iterrows():
#     for answer in row['new_answers']:
#         q = engines[answer['law_id']].query(node_type=answer['node_type'], node_id = answer['node_id'])
#         if len(q)!=0:
#             if q[0].id in ids:
#                 continue
#             documents.append(q[0].content)
#             metadatas.append({'law_id': answer['law_id'], 'node_type': q[0].node_type, 'node_id': q[0].node_id})
#             ids.append(q[0].id)
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 875


# TDIDF

In [6]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
def format_text(text,word_segmentation=False,remove_punctuation=False):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = text_normalize(text)
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if word_segmentation:
        text = word_tokenize(text, format="text")
    return text

In [7]:
def run(word_segmentation):
    tdidf_docs= [format_text(doc,word_segmentation) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(tdidf_docs)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        query_vector = tfidf.transform([query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-topk[-1]-1:-1]
        result = { 'metadatas':[]}
        for j in related_docs_indices:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    # print as table
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [8]:
# rr= run(False)
# rr['name'] = 'tdidf'
# run_results.append(rr)
# rr= run(True)
# rr['name'] = 'tdidf_ws'
# run_results.append(rr)

# BM 25

In [9]:
from rank_bm25 import BM25Okapi

def run_bm25(word_segmentation,bm25_algo):
    docs= [format_text(doc,word_segmentation) for doc in documents]
    tokenized_corpus = [doc.split(" ") for doc in docs]
    bm25 = bm25_algo(tokenized_corpus)
    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        query = row['cauhoi']
        query = format_text(query,word_segmentation)
        tokenized_query = query.split(" ")
        doc_scores = bm25.get_scores(tokenized_query)
        result = { 'metadatas':[]}
        for j in doc_scores.argsort()[:-topk[-1]-1:-1]:
            result['metadatas'].append(metadatas[j])
        for k in topk:
            if all(r in result['metadatas'][:k] for r in row['new_answers']):
                acc[k] += 1
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc

In [10]:
# print("BM25Okapi")
# rr=run_bm25(False,BM25Okapi)
# rr['name'] = 'bm25'
# run_results.append(rr)
# rr=run_bm25(True,BM25Okapi)
# rr['name'] = 'bm25_ws'
# run_results.append(rr)

# Chroma DB

## WITH  Instructor Embedding

In [11]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from InstructorEmbedding import INSTRUCTOR

  from tqdm.autonotebook import trange


In [16]:
def metadata_compare(item,list):
    for i in list:
        if i['law_id'] == item['law_id'] and i['node_type'] == item['node_type'] and i['node_id'] == item['node_id']:
            return True
    return False

def run_instructor(model_name):
    model = INSTRUCTOR(model_name)
    instructor_for_q = 'Represent the legal question for retrieving evidence documents:'
    instructor_for_r = 'Represent the legal document for retrieval:'
    texts_with_instructions = []
    for doc in documents:
        texts_with_instructions.append([instructor_for_r,doc])
    chroma_client = chromadb.Client(Settings(chroma_api_impl="rest",
                                        chroma_server_host="localhost",
                                        chroma_server_http_port="8000",
                                        chroma_server_ssl_enabled=False
                                        ))
    chroma_client.reset()
    collection = chroma_client.get_or_create_collection(name="law_documents")
    embeddings = model.encode(texts_with_instructions,show_progress_bar=True).tolist()
    collection.add(
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )

    to_run = len(qa_df)
    acc ={}
    for k in topk:
        acc[k] = 0
    print("Number of questions:",to_run)
    best = []
    for i in tqdm(range(to_run)):
        row = qa_df.iloc[i]
        question = row['cauhoi']
        query_embeddings = model.encode([[instructor_for_q,question]],show_progress_bar=False).tolist()
        result = collection.query(query_embeddings=query_embeddings, n_results=topk[-1],include=["metadatas"])
        for k in topk:
            if all(metadata_compare(r,result['metadatas'][0][:k]) for r in row['new_answers']):
                acc[k] += 1
                if k==5:
                    best.append(row)
        if len(best)>10:
            break
    for k in topk:
        acc[k] /= to_run
    print(acc)
    return acc,best

In [17]:
# rr=run_instructor('hkunlp/instructor-base')
# rr['name'] = 'instructor-base'
# run_results.append(rr)
# rr=run_instructor('hkunlp/instructor-large')
# rr['name'] = 'instructor-large'
# run_results.append(rr)
# rr=run_instructor('hkunlp/instructor-xl')
# rr['name'] = 'instructor-xl'
# run_results.append(rr)


rr,best=run_instructor('C:/Users/ngoph/Desktop/luanvan/model')
# rr['name'] = 'instructor-base finetune'
# run_results.append(rr)

load INSTRUCTOR_Transformer
max_seq_length  512


Batches: 100%|██████████| 28/28 [00:17<00:00,  1.58it/s]


Number of questions: 4205


  0%|          | 21/4205 [00:00<02:20, 29.81it/s]

{5: 0.002615933412604043, 10: 0.0035671819262782403, 20: 0.0035671819262782403, 50: 0.004042806183115339}





In [21]:
df = pd.DataFrame(best)
df.to_json('best.jsonl',orient='records',lines=True)

# save result

In [None]:
result_df = pd.DataFrame(run_results)
# rename col topk
for k in topk:
    result_df.rename(columns={k: 'top'+str(k)}, inplace=True)
result_df.rename(columns={'name': 'Name',
                            'top5': 'Top_5@acc',
                            'top10': 'Top_10@acc',
                            'top20': 'Top_20@acc',
                            'top50': 'Top_50@acc',
                          }, inplace=True)
# reorder
result_df = result_df[['Name','Top_5@acc','Top_10@acc','Top_20@acc','Top_50@acc']]
result_df = result_df.round(4)
# reoder row, ws after no ws
# result_df = result_df.reindex([0,4,6,2,1,5,7,3,8,9,10])
# result_df = result_df.reindex([0,2,1,3])
result_df.to_csv('../document/data/retrieval_result.csv',index=False)
# result_df.to_csv('retrieval_result_all.csv',index=False)
result_df

KeyError: "None of [Index(['Name', 'Top_5@acc', 'Top_10@acc', 'Top_20@acc', 'Top_50@acc'], dtype='object')] are in the [columns]"