In [1]:
import json
import lawquery
import pandas as pd
import os
from tqdm import tqdm

c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas.xwydx2ikjw2nmtwsfyngfuwkqu3lytcz.gfortran-win_amd64.dll
c:\Users\ngoph\.conda\envs\research\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


# Load data

In [2]:
from underthesea import text_normalize,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string

def format_text(text,word_segmentation=False):
    """Clean a piece of text before TF-IDF vectorisation.

    Steps, in order: collapse every whitespace run into a single space,
    delete all ASCII punctuation (``string.punctuation``), trim the ends,
    then run underthesea's ``text_normalize``.

    Args:
        text: Raw input string.
        word_segmentation: When true, additionally pass the result through
            underthesea's ``word_tokenize(..., format="text")``.

    Returns:
        The cleaned (and optionally word-segmented) string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = collapsed.translate(punctuation_table).strip()
    normalized = text_normalize(cleaned)
    if not word_segmentation:
        return normalized
    return word_tokenize(normalized, format="text")

In [3]:
# Build one lawquery engine per distinct `so_hieu_van_ban` value
# (presumably the law's document number -- confirm against data.csv).
# When the same value appears on several rows, only the first row's path is used.
law_df = pd.read_csv('./documents/data.csv')
engines = {}
for row_idx in range(len(law_df)):
    law_id = law_df.loc[row_idx, 'so_hieu_van_ban']
    if law_id in engines:
        continue
    tree_path = os.path.join('documents', law_df.loc[row_idx, 'path'], 'tree.json.gz')
    engines[law_id] = lawquery.Engine(tree_path)
print("Number of engines:",len(engines))

# Parallel lists: documents[k], metadatas[k] and ids[k] all describe the same node.
documents, metadatas, ids = [], [], []

# Take every node of type 'điều' (Vietnamese for "article") from every engine.
for law_id, engine in engines.items():
    for node in engine.query(node_type='điều'):
        # The document text is the node name, a newline, then the node content.
        documents.append(node.name+'\n'+node.content)
        metadatas.append({'law_id': law_id, 'node_type': node.node_type, 'node_id': node.node_id})
        ids.append(node.id)

# Question/answer records, one JSON object per line.
qa_df = pd.read_json('./answers_filtered.jsonl', lines=True, orient='records')
print("Number of documents:",len(documents))

Number of engines: 15
Number of documents: 761


# Create training triplets using TF-IDF


In [4]:
# Normalise and word-segment every document, then fit the TF-IDF vocabulary
# on that corpus. Row k of `tfidf_matrix` corresponds to `documents[k]`.
tdidf_docs = [format_text(raw_doc, word_segmentation=True) for raw_doc in documents]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tdidf_docs)

In [5]:
# Instruction prefixes stored alongside each text in the training triplets:
# one for queries, one for documents (used for both positives and negatives).
instructor_for_q = "Represent the legal question for retrieving evidence documents:"
instructor_for_r = "Represent the legal document for retrieval:"

In [6]:
# Build (query, positive, negative) training triplets.
#
# For every question, the positives are the documents referenced by its
# `new_answers`, and the negatives are the highest-ranked TF-IDF matches that
# are NOT among the positives ("hard negatives"). One triplet is emitted per
# positive, pairing the k-th positive with the k-th negative.

# How many top TF-IDF hits are scanned when looking for negatives.
TOP_K_CANDIDATES = 100

datasets = []
to_run = len(qa_df)
for i in tqdm(range(to_run)):
    row = qa_df.iloc[i]

    # Each answer item must be equal to an entry of `metadatas`;
    # `list.index` raises ValueError when it is not.
    pos = [documents[metadatas.index(item)] for item in row['new_answers']]

    # NOTE(review): the query stored in the triplet below is the normalised,
    # punctuation-stripped, word-segmented text, while pos/neg documents are
    # stored raw -- confirm this asymmetry is intended.
    query = format_text(row['cauhoi'],True)
    query_vector = tfidf.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Indices of the TOP_K_CANDIDATES most similar documents, best first.
    related_docs_indices = cosine_similarities.argsort()[:-TOP_K_CANDIDATES-1:-1]

    # Walk the ranking and keep non-positive documents until there is one
    # negative per positive.
    neg = []
    for j in related_docs_indices:
        if len(neg) >= len(pos):
            break
        candidate = documents[j]
        if candidate not in pos:
            neg.append(candidate)

    # zip() stops at the shorter list, so a question whose candidates yield
    # fewer negatives than positives no longer raises IndexError; the
    # unmatched positives are simply not emitted.
    for pos_doc, neg_doc in zip(pos, neg):
        datasets.append({
            'pos': [instructor_for_r,pos_doc],
            'neg': [instructor_for_r,neg_doc],
            'query': [instructor_for_q,query],
            'task_name': 'tracuuluat',
        })

100%|██████████| 4205/4205 [00:23<00:00, 176.80it/s]


In [7]:
# Sanity check: display how many training triplets are in `datasets`.
len(datasets)

6446

In [8]:
import gzip

# Persist the triplets as gzip-compressed, pretty-printed UTF-8 JSON
# (non-ASCII characters are written as-is rather than \u-escaped).
output_path = '../models/cachedir/medi-data.json.gz'
with gzip.open(output_path, mode='wt', encoding='utf-8') as fout:
    json.dump(datasets, fout, ensure_ascii=False, indent=4)