In [1]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from elasticsearch import Elasticsearch
import pandas as pd
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import HTTPError

import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('../data/vietnamese_rag/documents-with-ids1.json', 'rt') as f_in:
    documents1 = json.load(f_in)

In [3]:
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt') as f_in:
    documents2 = json.load(f_in)

In [4]:
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt') as f_in:
    documents3 = json.load(f_in)

In [5]:
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt') as f_in:
    documents4 = json.load(f_in)

In [6]:
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt') as f_in:
    documents5 = json.load(f_in)

In [7]:
def load_documents(base_path, num_files):
    documents = []
    for i in range(1, num_files + 1):
        file_path = f'{base_path}/documents-with-ids{i}.json'
        with open(file_path, 'rt') as f_in:
            documents.extend(json.load(f_in))
    return documents

In [8]:
base_path = '../data/vietnamese_rag'
num_files = 5
documents = load_documents(base_path, num_files)

In [9]:
df_ground_truth = pd.read_csv('../data/vietnamese_rag/ground_truth_data/ground_truth_data.csv')

ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[0]

{'question': "Minh Tú đã gặp khó khăn gì trong thử thách đi catwalk tại Asia's Next Top Model mùa 5?",
 'Group': 'General',
 'document': '75fafd29'}

In [10]:
doc_idx = {d['id']: d for d in documents}

In [11]:
doc_idx['75fafd29']['answer']

'Minh Tú đã vượt qua sự sợ hãi để hoàn thành tốt phần thử thách đi catwalk khi bị treo lơ lửng trên một tòa nhà cao tầng và đạt vị trí thứ 2 trong đêm chung kết của chương trình.'

In [12]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")



In [16]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "group": {"type": "keyword"},
            "context": {"type": "text"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "id": {"type": "keyword"},
            "context_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_context_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "vietnamese-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vietnamese-questions'})

In [17]:
def load_vectors(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

def process_documents(documents, index_name, es_client):
    full_documents = []
    for i in range(1, 6):
        if i == 1:
            data = documents1.copy()
        elif i == 2:
            data = documents2.copy()
        elif i == 3:
            data = documents3.copy()
        elif i == 4:
            data = documents4.copy()
        elif i == 5:
            data = documents5.copy()
        document_qta_vector_list = load_vectors(f'../data/vietnamese_rag/question_context_answer_vector_pickle/question_context_answer_vector{i}.pkl')

        for j in range(len(data)):
            data[j]['question_context_answer_vector'] = document_qta_vector_list[j]['question_context_answer_vector']
        full_documents.extend(data)
    for doc in tqdm(full_documents):
        es_client.index(index=index_name, document=doc)

In [18]:
process_documents(documents, index_name, es_client)

100%|███████████████████████████████████████████| 6089/6089 [02:17<00:00, 44.17it/s]


In [19]:
def elastic_search_knn(field, vector, group):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "group": group
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["group", "context", "question", "answer", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs
def question_context_answer_vector_knn(q):
    question = q['question']
    group = q['Group']

    v_q = model.encode(question)

    return elastic_search_knn('question_context_answer_vector', v_q, group)

In [None]:
question_context_answer_vector_knn(dict(
    question='Lái xe đạp bằng chân bị phạt như nào ?',
    Group='Legal'
))

In [46]:
def build_prompt(query, search_results):
    prompt_template = """
You're an assistant working in customer service. Your job is to provide answers to users' questions. Answer the QUESTION based on the CONTEXT from the documents database.
Use only the facts from the CONTEXT when answering the QUESTION. Provide answer in Vietnamese , in normal text form, not using any markdown form, no need to rewrite the question and make sure that is an answer, not listing questions. Also make sure that the answer provides most information from the CONTEXT as possible .

QUESTION: {question}

CONTEXT: 
{context}
""".strip()
    context = ""
    
    for doc in search_results:
        context = context + f"group: {doc['group']}\nquestion: {doc['question']}\nanswer: {doc['answer']}\ncontext: {doc['context'][:1000]}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [53]:
client =  Groq(api_key = os.environ['GROQ_API_KEY'])
def llm(prompt, model = 'mixtral-8x7b-32768'):
    retries = 5
    for i in range(retries):
        try:
            response = client.chat.completions.create(
                model= 'llama3-8b-8192',
                messages=[{"role": "user", "content": prompt}]
            )
            json_response = response.choices[0].message.content
            return json_response
        except HTTPError as e:
            if e.response.status_code == 429:  # Rate limit error
                retry_after = float(e.response.json()['error']['message'].split('in ')[-1].split('s')[0])
                time.sleep(retry_after)
            else:
                raise
        except Exception as e:
            if i < retries - 1:
                time.sleep(2 ** i)  # Exponential backoff
            else:
                raise

In [22]:
# previously: rag(query: str) -> str
def rag(query: dict, model='mixtral-8x7b-32768') -> str:
    search_results = question_context_answer_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model=model)
    return answer

In [51]:
ground_truth[997]

{'question': "Lễ phát động 'Tết trồng cây đời đời nhớ ơn Bác Hồ' được tổ chức tại đâu vào ngày 19-5?",
 'Group': 'General',
 'document': '279bc1b1'}

In [230]:
# rag(ground_truth[10], "llama-3.1-8b-instant")

In [231]:
# rag(ground_truth[998], "mixtral-8x7b-32768")

In [52]:
rag(ground_truth[997], "llama3-8b-8192")

"Lễ phát động 'Tết trồng cây đời đời nhớ ơn Bác Hồ' được tổ chức tại Công viên Công an TP và Trung tâm Huấn luyện và Bồi dưỡng nghiệp vụ Công an TP vào ngày 19-5."

In [39]:
# rag(ground_truth[997], "llama3-8b-8192")

"Lễ phát động 'Tết trồng cây đời đời nhớ ơn Bác Hồ' được tổ chức tại  nguyên Công viên Công an TP và Trung tâm Huấn luyện và Bồi dưỡng nghiệp vụ Công an TP.  \n"

In [None]:
doc_idx['279bc1b1']['question']

In [None]:
doc_idx['279bc1b1']['answer']

In [None]:

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

In [None]:
v_llm.dot(v_orig)

In [23]:
def normalize_vector(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [24]:
def compute_similarity(record, model):
    v_llm = normalize_vector(model.encode(record['answer_llm']))
    v_orig = normalize_vector(model.encode(record['answer_orig']))
    return v_llm.dot(v_orig)

In [None]:
record = {}
record['answer_orig'] = "Công an TP Cần Thơ đã tổ chức lễ phát động 'Tết trồng cây đời đời nhớ ơn Bác Hồ' vào ngày 19-5, trong khuôn viên Công viên Công an TP và Trung tâm Huấn luyện và Bồi dưỡng nghiệp vụ Công an TP."
record['answer_llm'] = "Công an Thành phố Cần Thơ đã tổ chức lễ phát động 'Tết trồng cây đời đời nhớ ơn Bác Hồ' vào ngày 19-5, trong khuôn viên Công viên Công an TP và Trung tâm Huấn luyện và Bồi dưỡng nghiệp vụ Công an TP."

In [185]:
compute_similarity(record, model)

np.float32(0.9659091)

In [25]:
pool = ThreadPoolExecutor(max_workers=6)
def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

def process_record(rec):
    model = 'mixtral-8x7b-32768'
    answer_llm = rag(rec, model = model)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['answer']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'group': rec['Group'],
    }

In [None]:
documents_current = ground_truth[:1217]
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(20)

100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00, 10.70s/it]

In [242]:
documents_current = ground_truth[:1217]
chunk_size = 15
start_chunk = 5 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

100%|█████████████████████████| 15/15 [01:15<00:00,  5.03s/it]


Chunk 5 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer6.pkl


100%|█████████████████████████| 15/15 [01:46<00:00,  7.08s/it]


Chunk 6 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer7.pkl


100%|█████████████████████████| 15/15 [01:40<00:00,  6.68s/it]


Chunk 7 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer8.pkl


100%|█████████████████████████| 15/15 [01:33<00:00,  6.26s/it]


Chunk 8 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer9.pkl


100%|█████████████████████████| 15/15 [01:41<00:00,  6.79s/it]


Chunk 9 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer10.pkl


100%|█████████████████████████| 15/15 [01:15<00:00,  5.03s/it]


Chunk 10 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer11.pkl


100%|█████████████████████████| 15/15 [02:17<00:00,  9.18s/it]


Chunk 11 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer12.pkl


100%|█████████████████████████| 15/15 [01:09<00:00,  4.65s/it]


Chunk 12 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer13.pkl


100%|█████████████████████████| 15/15 [02:03<00:00,  8.24s/it]


Chunk 13 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer14.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:39<00:00,  6.62s/it]


Chunk 14 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer15.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.87s/it]


Chunk 15 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer16.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:48<00:00,  7.21s/it]


Chunk 16 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer17.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:22<00:00,  5.50s/it]


Chunk 17 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer18.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:51<00:00,  7.42s/it]


Chunk 18 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer19.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.72s/it]


Chunk 19 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer20.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:50<00:00,  7.37s/it]


Chunk 20 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer21.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:26<00:00,  5.74s/it]


Chunk 21 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer22.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:49<00:00,  7.29s/it]


Chunk 22 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer23.pkl


100%|█████████████████████████| 15/15 [01:37<00:00,  6.52s/it]


Chunk 23 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer24.pkl


100%|█████████████████████████| 15/15 [01:49<00:00,  7.28s/it]


Chunk 24 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer25.pkl


100%|█████████████████████████| 15/15 [01:35<00:00,  6.34s/it]


Chunk 25 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer26.pkl


100%|█████████████████████████| 15/15 [01:42<00:00,  6.85s/it]


Chunk 26 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer27.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:45<00:00,  7.01s/it]


Chunk 27 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer28.pkl


100%|█████████████████████████| 15/15 [01:31<00:00,  6.07s/it]


Chunk 28 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer29.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:53<00:00,  7.57s/it]


Chunk 29 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer30.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.51s/it]


Chunk 30 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer31.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:53<00:00,  7.53s/it]


Chunk 31 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer32.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.56s/it]


Chunk 32 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer33.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.59s/it]


Chunk 33 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer34.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00,  7.17s/it]


Chunk 34 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer35.pkl


100%|█████████████████████████| 15/15 [01:49<00:00,  7.29s/it]


Chunk 35 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer36.pkl


100%|█████████████████████████| 15/15 [01:41<00:00,  6.78s/it]


Chunk 36 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer37.pkl


100%|█████████████████████████| 15/15 [01:39<00:00,  6.64s/it]


Chunk 37 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer38.pkl


100%|█████████████████████████| 15/15 [01:34<00:00,  6.31s/it]


Chunk 38 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer39.pkl


100%|█████████████████████████| 15/15 [01:53<00:00,  7.58s/it]


Chunk 39 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer40.pkl


100%|█████████████████████████| 15/15 [01:44<00:00,  6.94s/it]


Chunk 40 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer41.pkl


100%|█████████████████████████| 15/15 [01:39<00:00,  6.61s/it]


Chunk 41 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer42.pkl


100%|█████████████████████████| 15/15 [01:51<00:00,  7.43s/it]


Chunk 42 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer43.pkl


100%|█████████████████████████| 15/15 [01:47<00:00,  7.16s/it]


Chunk 43 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer44.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:49<00:00,  7.32s/it]


Chunk 44 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer45.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:51<00:00,  7.41s/it]


Chunk 45 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer46.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:49<00:00,  7.30s/it]


Chunk 46 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer47.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:32<00:00,  6.17s/it]


Chunk 47 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer48.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.41s/it]


Chunk 48 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer49.pkl


100%|█████████████████████████| 15/15 [02:07<00:00,  8.48s/it]


Chunk 49 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer50.pkl


100%|█████████████████████████| 15/15 [01:24<00:00,  5.66s/it]


Chunk 50 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer51.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:55<00:00,  7.70s/it]


Chunk 51 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer52.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:40<00:00,  6.70s/it]


Chunk 52 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer53.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:12<00:00,  4.86s/it]


Chunk 53 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer54.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:04<00:00,  8.28s/it]


Chunk 54 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer55.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:44<00:00,  6.96s/it]


Chunk 55 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer56.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.55s/it]


Chunk 56 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer57.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:20<00:00,  5.38s/it]


Chunk 57 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer58.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:03<00:00,  8.25s/it]


Chunk 58 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer59.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.51s/it]


Chunk 59 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer60.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:48<00:00,  7.20s/it]


Chunk 60 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer61.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.59s/it]


Chunk 61 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer62.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:39<00:00,  6.64s/it]


Chunk 62 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer63.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:43<00:00,  6.93s/it]


Chunk 63 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer64.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:26<00:00,  5.77s/it]


Chunk 64 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer65.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.53s/it]


Chunk 65 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer66.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:48<00:00,  7.26s/it]


Chunk 66 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer67.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:54<00:00,  7.61s/it]


Chunk 67 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer68.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.40s/it]


Chunk 68 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer69.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:40<00:00,  6.68s/it]


Chunk 69 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer70.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:39<00:00,  6.66s/it]


Chunk 70 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer71.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:43<00:00,  6.91s/it]


Chunk 71 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer72.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.38s/it]


Chunk 72 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer73.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.35s/it]


Chunk 73 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer74.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:11<00:00,  4.76s/it]


Chunk 74 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer75.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:49<00:00,  7.29s/it]


Chunk 75 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer76.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:40<00:00,  6.69s/it]


Chunk 76 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer77.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:40<00:00,  6.72s/it]


Chunk 77 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer78.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:32<00:00,  6.15s/it]


Chunk 78 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer79.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00,  7.19s/it]


Chunk 79 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer80.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00,  7.13s/it]


Chunk 80 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer81.pkl


In [26]:
documents_current = ground_truth[:1217]

In [26]:
results = []
# Use map_progress to process documents
processed_results = map_progress(pool, documents_current[1200:], process_record)

# Store the results
results.extend(processed_results)
with open('../data/vietnamese_rag/llm_answer/llm_answer82.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|██████████████████| 17/17 [01:36<00:00,  5.70s/it]


In [None]:
documents_current = ground_truth[1217:1217*2]
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 83}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(20)

In [None]:
documents_current = ground_truth[1217:1217*2]
chunk_size = 15
start_chunk = 42 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 83}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(25)

## currently !

In [54]:
documents_current = ground_truth[1217:1217*2]
chunk_size = 15
start_chunk = 70 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 83}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(25)

100%|█| 15/15 [00:03<00:00,  4.


Chunk 70 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer153.pkl


100%|██████████████████| 15/15 [00:50<00:00,  3.40s/it]


Chunk 71 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer154.pkl


100%|██████████████████| 15/15 [00:27<00:00,  1.85s/it]


Chunk 72 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer155.pkl


100%|██████████████████| 15/15 [00:54<00:00,  3.61s/it]


Chunk 73 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer156.pkl


100%|██████████████████| 15/15 [00:42<00:00,  2.81s/it]


Chunk 74 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer157.pkl


100%|██████████████████| 15/15 [00:32<00:00,  2.19s/it]


Chunk 75 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer158.pkl


100%|██████████████████| 15/15 [00:44<00:00,  2.98s/it]


Chunk 76 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer159.pkl


100%|██████████████████| 15/15 [00:22<00:00,  1.49s/it]


Chunk 77 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer160.pkl


100%|██████████████████| 15/15 [00:56<00:00,  3.75s/it]


Chunk 78 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer161.pkl


100%|██████████████████| 15/15 [00:25<00:00,  1.67s/it]


Chunk 79 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer162.pkl


100%|██████████████████| 15/15 [00:52<00:00,  3.49s/it]


Chunk 80 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer163.pkl


In [28]:
results = []
# Use map_progress to process documents
processed_results = map_progress(pool, documents_current[1200:], process_record)

# Store the results
results.extend(processed_results)
with open('../data/vietnamese_rag/llm_answer/llm_answer164.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████████████████████████| 17/17 [01:41<00:00,  5.96s/it]
 40%|▍| 6/15 [00:02<00:03,  2.3[02:26<00:00, 10.81s/it]5/15 [02:14<00:00, 12.53s/it]

In [None]:
documents_current = ground_truth[1217*2:1217*3]
chunk_size = 15
start_chunk = 1 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 165}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(23)

100%|██████████████████| 15/15 [00:32<00:00,  2.17s/it]


Chunk 1 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer166.pkl


100%|██████████████████| 15/15 [00:44<00:00,  2.97s/it]


Chunk 2 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer167.pkl


100%|██████████████████| 15/15 [00:33<00:00,  2.20s/it]


Chunk 3 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer168.pkl


100%|██████████████████| 15/15 [00:50<00:00,  3.35s/it]


Chunk 4 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer169.pkl


100%|██████████████████| 15/15 [04:52<00:00, 19.51s/it]


Chunk 5 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer170.pkl


100%|██████████████████| 15/15 [06:10<00:00, 24.71s/it]


Chunk 6 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer171.pkl


100%|██████████████████| 15/15 [00:03<00:00,  4.40it/s]


Chunk 7 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer172.pkl


100%|██████████████████| 15/15 [02:26<00:00,  9.80s/it]


Chunk 8 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer173.pkl


100%|█| 15/15 [00:02<00:00,  5.


Chunk 9 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer174.pkl


100%|█| 15/15 [00:50<00:00,  3.


Chunk 10 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer175.pkl


100%|█| 15/15 [00:30<00:00,  2.


Chunk 11 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer176.pkl


  0%|   | 0/15 [00:00<?, ?it/s]

In [None]:
results = []
# Use map_progress to process documents
processed_results = map_progress(pool, documents_current[1200:], process_record)

# Store the results
results.extend(processed_results)
with open('../data/vietnamese_rag/llm_answer/llm_answer246.pkl', 'wb') as file:
    pickle.dump(results, file)

In [None]:
documents_current = ground_truth[1217*3:1217*4]
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 247}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(25)

In [None]:
results = []
# Use map_progress to process documents
processed_results = map_progress(pool, documents_current[1200:], process_record)

# Store the results
results.extend(processed_results)
with open('../data/vietnamese_rag/llm_answer/llm_answer328.pkl', 'wb') as file:
    pickle.dump(results, file)

In [None]:
documents_current = ground_truth[1217*4:]
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer{i + 329}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(25)

In [None]:
results = []
# Use map_progress to process documents
processed_results = map_progress(pool, documents_current[1215:], process_record)

# Store the results
results.extend(processed_results)
with open('../data/vietnamese_rag/llm_answer/llm_answer410.pkl', 'wb') as file:
    pickle.dump(results, file)