In [1]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from requests.exceptions import HTTPError
import time
# from groq.exceptions import RateLimitError
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client =  Groq(api_key = os.environ['GROQ_API_KEY'])

In [3]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")



In [3]:
with open('../data/vietnamese_rag/documents-with-ids1.json', 'rt') as f_in:
    documents1 = json.load(f_in)

In [4]:
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt') as f_in:
    documents2 = json.load(f_in)

In [5]:
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt') as f_in:
    documents3 = json.load(f_in)

In [6]:
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt') as f_in:
    documents4 = json.load(f_in)

In [7]:
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt') as f_in:
    documents5 = json.load(f_in)

In [8]:
documents = documents1.copy()

In [9]:
documents.extend(documents2)

In [10]:
documents.extend(documents3)

In [11]:
documents.extend(documents4)

In [12]:
documents.extend(documents5)

In [13]:
len(documents)

6089

In [30]:
len(documents1)

1217

In [14]:
documents1[0].keys()

dict_keys(['group', 'context', 'question', 'answer', 'id'])

In [13]:
prompt_template = """
You emulate my assistant who works with me in a Q and A project .
Formulate 5 questions people might ask based on the record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. Make sure the questions should be in Vietnamese and the output can be parsed into json format.

The record:

question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [14]:
def generate_questions(doc):
    # Create a new dictionary excluding the first key-value pair
    doc_items = list(doc.items())[1:]  # Skip the first item
    doc_filtered = dict(doc_items)

    prompt = prompt_template.format(**doc_filtered)

    retries = 5
    for i in range(retries):
        try:
            response = client.chat.completions.create(
                model='Gemma2-9b-It',
                messages=[{"role": "user", "content": prompt}]
            )
            json_response = response.choices[0].message.content
            return json_response
        except HTTPError as e:
            if e.response.status_code == 429:  # Rate limit error
                retry_after = float(e.response.json()['error']['message'].split('in ')[-1].split('s')[0])
                time.sleep(retry_after)
            else:
                raise
        except Exception as e:
            if i < retries - 1:
                time.sleep(2 ** i)  # Exponential backoff
            else:
                raise

In [15]:
def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

In [16]:

# Initialize ThreadPoolExecutor
pool = ThreadPoolExecutor(max_workers=6)

# Process documents in parallel
results = {}

In [17]:
def process_document(doc):
    doc_id = doc['id']
    if doc_id in results:
        return None

    questions = generate_questions(doc)
    return (doc_id, questions)

In [18]:
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[:30], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions


100%|██████████████████████████| 30/30 [00:04<00:00,  7.45it/s]
 20%|█████▌                      | 1/5 [00:00<00:02,  1.67it/s]

In [None]:

# Print or save the results as needed
print(results)

In [22]:
len(processed_results)

30

In [23]:

# from collections import defaultdict

# hashes = defaultdict(list)

# for doc in documents1:
#     doc_id = doc['id']
#     hashes[doc_id].append(doc)
# # hashes['75fafd29']

In [24]:
# len = 0
# for hash in hashes:
#     print(hash)
#     len += 1
#     if len == 30:
#         break
    

In [25]:
# for result in results:
#     print(result)

In [26]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth1.pkl', 'wb') as file:
    pickle.dump(results, file)

In [27]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth1.pkl', 'rb') as file:
    test = pickle.load(file)

In [29]:
type(test)

dict

In [31]:
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30:30*2], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions


100%|█████████████████████████| 30/30 [00:03<00:00,  9.56it/s]


In [35]:
def get_last_n_items(d, n):
    # Convert dictionary items to a list
    items = list(d.items())
    # Slice the list to get the last n items
    last_n_items = items[-n:]
    # Convert the sliced list back to a dictionary
    return dict(last_n_items)

In [36]:
last_30_items = get_last_n_items(results, 30)
len(last_30_items)

30

In [37]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth2.pkl', 'wb') as file:
    pickle.dump(last_30_items, file)

In [43]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth2.pkl', 'rb') as file:
    test = pickle.load(file)

In [None]:
test

In [45]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*2:30*3], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.34it/s]


In [50]:
len(results)

30

In [51]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth3.pkl', 'wb') as file:
    pickle.dump(results, file)

In [10]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*3:30*4], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions

100%|███████████████████████████████████████████████████| 30/30 [00:38<00:00,  1.29s/it]
 40%|████████████████████▍                              | 12/30 [00:12<00:27,  1.53s/it]

In [11]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth4.pkl', 'wb') as file:
    pickle.dump(results, file)

In [10]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*4:30*5], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth5.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████████████████████████████| 30/30 [00:50<00:00,  1.68s/it]
 17%|████████▋                                           | 5/30 [00:04<00:30,  1.20s/it]

In [11]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*5:30*6], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth6.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:13<00:00,  2.16it/s]


In [10]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*6:30*7], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth7.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████████████████████████████| 30/30 [00:52<00:00,  1.75s/it]
100%|█████████████████████████| 17/17 [00:02<00:00,  7.52it/s]0 [00:03<00:00,  9.59it/s]

In [11]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*7:30*8], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth8.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:49<00:00,  1.66s/it]


In [12]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*8:30*9], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth9.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:44<00:00,  1.49s/it]


In [13]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*9:30*10], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth10.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:03<00:00,  7.54it/s]


In [14]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*10:30*11], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth11.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████████████████████████████| 30/30 [00:11<00:00,  2.53it/s]


In [15]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*11:30*12], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth12.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:37<00:00,  1.25s/it]


In [16]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[30*12:30*13], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth13.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 30/30 [00:23<00:00,  1.26it/s]


In [27]:
# Process documents in chunks of 30
chunk_size = 30
start_chunk = 13  # Starting chunk index
end_chunk = (len(documents1) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    chunk = documents1[chunk_start:chunk_end]
    print(chunk_start, chunk_end)

#     # Use map_progress to process documents
#     processed_results = map_progress(pool, chunk, process_document)

#     # Store the results incrementally
#     for result in processed_results:
#         if result is not None:
#             doc_id, questions = result
#             results[doc_id] = questions

#     # Save the results to a file
#     file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i}.pkl'
#     with open(file_name, 'wb') as file:
#         pickle.dump(results, file)

#     # Print out the results
#     print(f"Chunk {i} processed and saved to {file_name}")
#     print(results)

#     # Wait for 1 minute to reset rate limit
#     time.sleep(120)

40
390 420
420 450
450 480
480 510
510 540
540 570
570 600
600 630
630 660
660 690
690 720
720 750
750 780
780 810
810 840
840 870
870 900
900 930
930 960
960 990
990 1020
1020 1050
1050 1080
1080 1110
1110 1140
1140 1170
1170 1200


In [None]:
# Process documents in chunks of 30
chunk_size = 30
start_chunk = 14  # Starting chunk index
end_chunk = (len(documents1) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    chunk = documents1[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(60)

In [32]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents1[1200:], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth41.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 17/17 [00:02<00:00,  7.63it/s]


In [33]:
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt') as f_in:
    documents2 = json.load(f_in)

In [34]:
len(documents2)

1217

In [39]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents2) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    print(i + 42, chunk_start, chunk_end)
    # chunk = documents1[chunk_start:chunk_end]

40
42 0 30
43 30 60
44 60 90
45 90 120
46 120 150
47 150 180
48 180 210
49 210 240
50 240 270
51 270 300
52 300 330
53 330 360
54 360 390
55 390 420
56 420 450
57 450 480
58 480 510
59 510 540
60 540 570
61 570 600
62 600 630
63 630 660
64 660 690
65 690 720
66 720 750
67 750 780
68 780 810
69 810 840
70 840 870
71 870 900
72 900 930
73 930 960
74 960 990
75 990 1020
76 1020 1050
77 1050 1080
78 1080 1110
79 1110 1140
80 1140 1170
81 1170 1200


In [None]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents2) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    # print(i + 42, chunk_start, chunk_end)
    chunk = documents2[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 42}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(60)

In [41]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents2[1200:], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth82.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 17/17 [00:05<00:00,  3.09it/s]


In [42]:
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt') as f_in:
    documents3 = json.load(f_in)

In [None]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents3) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    # print(i + 42, chunk_start, chunk_end)
    chunk = documents3[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 83}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(60)

In [46]:
# results = {}
# # Use map_progress to process documents
# processed_results = map_progress(pool, documents3[1200:], process_document)

# # Store the results
# for result in processed_results:
#     if result is not None:
#         doc_id, questions = result
#         results[doc_id] = questions
# with open('../data/vietnamese_rag/ground_truth_data/ground_truth84.pkl', 'wb') as file:
#     pickle.dump(results, file)

In [47]:
# with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt') as f_in:
#     documents4 = json.load(f_in)

In [48]:
# chunk_size = 30
# start_chunk = 0 # Starting chunk index
# end_chunk = (len(documents4) // chunk_size)  # Ending chunk index
# print(end_chunk)
# for i in range(start_chunk, end_chunk):
#     results = {}
#     chunk_start = i * chunk_size
#     chunk_end = chunk_start + chunk_size
#     # print(i + 42, chunk_start, chunk_end)
#     chunk = documents4[chunk_start:chunk_end]

#     # Use map_progress to process documents
#     processed_results = map_progress(pool, chunk, process_document)

#     # Store the results incrementally
#     for result in processed_results:
#         if result is not None:
#             doc_id, questions = result
#             results[doc_id] = questions

#     # Save the results to a file
#     file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 85}.pkl'
#     with open(file_name, 'wb') as file:
#         pickle.dump(results, file)

#     # Print out the results
#     print(f"Chunk {i} processed and saved to {file_name}")
#     print(results)

#     # Wait for 1 minute to reset rate limit
#     time.sleep(60)

In [49]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents3[1200:], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth123.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████████████████████████████| 17/17 [00:02<00:00,  6.71it/s]


In [50]:
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt') as f_in:
    documents4 = json.load(f_in)

In [None]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents4) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    # print(i + 42, chunk_start, chunk_end)
    chunk = documents4[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 124}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(60)

In [52]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents4) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    print(i + 124, chunk_start, chunk_end)

40
124 0 30
125 30 60
126 60 90
127 90 120
128 120 150
129 150 180
130 180 210
131 210 240
132 240 270
133 270 300
134 300 330
135 330 360
136 360 390
137 390 420
138 420 450
139 450 480
140 480 510
141 510 540
142 540 570
143 570 600
144 600 630
145 630 660
146 660 690
147 690 720
148 720 750
149 750 780
150 780 810
151 810 840
152 840 870
153 870 900
154 900 930
155 930 960
156 960 990
157 990 1020
158 1020 1050
159 1050 1080
160 1080 1110
161 1110 1140
162 1140 1170
163 1170 1200


In [53]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents4[1200:], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth164.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|█████████████████████████| 17/17 [00:02<00:00,  6.55it/s]


In [8]:
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt') as f_in:
    documents5 = json.load(f_in)

In [9]:
len(documents5)

1221

In [10]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents5) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    print(i + 165, chunk_start, chunk_end)

40
165 0 30
166 30 60
167 60 90
168 90 120
169 120 150
170 150 180
171 180 210
172 210 240
173 240 270
174 270 300
175 300 330
176 330 360
177 360 390
178 390 420
179 420 450
180 450 480
181 480 510
182 510 540
183 540 570
184 570 600
185 600 630
186 630 660
187 660 690
188 690 720
189 720 750
190 750 780
191 780 810
192 810 840
193 840 870
194 870 900
195 900 930
196 930 960
197 960 990
198 990 1020
199 1020 1050
200 1050 1080
201 1080 1110
202 1110 1140
203 1140 1170
204 1170 1200


In [None]:
chunk_size = 30
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents5) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    # print(i + 42, chunk_start, chunk_end)
    chunk = documents5[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 165}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

In [11]:
chunk_size = 30
start_chunk = 39 # Starting chunk index
end_chunk = (len(documents5) // chunk_size)  # Ending chunk index
print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    # print(i + 165, chunk_start, chunk_end)
    chunk = documents5[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_data/ground_truth{i + 165}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

40


100%|███████████████████████████| 30/30 [00:03<00:00,  7.88it/s]


Chunk 39 processed and saved to ../data/vietnamese_rag/ground_truth_data/ground_truth204.pkl


100%|███████████████████████████| 21/21 [00:23<00:00,  2.95s/it]

In [12]:
results = {}
# Use map_progress to process documents
processed_results = map_progress(pool, documents5[1200:], process_document)

# Store the results
for result in processed_results:
    if result is not None:
        doc_id, questions = result
        results[doc_id] = questions
with open('../data/vietnamese_rag/ground_truth_data/ground_truth205.pkl', 'wb') as file:
    pickle.dump(results, file)

100%|███████████████████████████| 21/21 [00:23<00:00,  1.14s/it]


In [96]:
parsed_results = {}
for doc_id, json_questions in results.items():
    try:
        parsed_questions = json.loads(json_questions)
        parsed_results[doc_id] = parsed_questions
        # print(parsed_questions)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for document {doc_id}: {e}")
        # json_questions = json_questions.replace("\n", "\\n")
        # print(json_questions)

Error decoding JSON for document 64016d4f: Invalid control character at: line 5 column 49 (char 303)
Error decoding JSON for document df658334: Invalid control character at: line 1 column 338 (char 337)


In [97]:

# def clean_json_string(json_string):
#     # Remove extra closing brackets if they exist
#     json_string = re.sub(r'\s*\]\s*\]$', ']', json_string)

#     # Escape internal double quotes within the strings
#     json_string = re.sub(r'(?<!\\)"', r'\\"', json_string)

#     # Unescape properly formatted double quotes around the whole JSON
#     json_string = re.sub(r'\\\\"', r'"', json_string)
    
#     return json_string

# parsed_results = {}

# for doc_id, json_questions in results.items():
#     try:
#         parsed_questions = json.loads(json_questions)
#         parsed_results[doc_id] = parsed_questions
#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON for document {doc_id}: {e}")
#         cleaned_json_questions = clean_json_string(json_questions)
#         print(cleaned_json_questions)
#         try:
#             parsed_questions = json.loads(cleaned_json_questions)
#             parsed_results[doc_id] = parsed_questions
#         except json.JSONDecodeError as e:
#             print(f"Error decoding cleaned JSON for document {doc_id}: {e}")
#             print(cleaned_json_questions)

# # # Print the parsed results
# # for doc_id, questions in parsed_results.items():
# #     print(f"Document ID: {doc_id}")
# #     print(f"Questions: {questions}")

In [98]:
# import re
# # Function to clean up JSON strings
# def clean_json_string(json_string):
#     # Remove non-relevant characters and control characters
#     json_string = re.sub(r'[\n\r\t]', '', json_string)  # Remove newlines, carriage returns, and tabs
#     json_string = re.sub(r'\\', '', json_string)  # Remove backslashes
#     json_string = re.sub(r'“|”', '"', json_string)  # Replace fancy quotes with standard quotes
#     json_string = re.sub(r'^\[|\]$', '', json_string)  # Remove leading and trailing square brackets
#     json_string = re.sub(r'^\{|\}$', '', json_string)  # Remove leading and trailing curly braces
#     return json_string
# parsed_results = {}
# for doc_id, json_questions in results.items():
#     try:
        
#         parsed_questions = json.loads(json_questions)
#         parsed_results[doc_id] = parsed_questions
#         print(json_questions)
#         # print(parsed_questions)
#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON for document {doc_id}: {e}")
#         # print(json_questions)
#         # json_questions = clean_json_string(json_questions)
#         # print(json_questions)
#         parsed_questions = json.loads(json_questions)
#         parsed_results[doc_id] = parsed_questions
#         # print(json_questions)

In [100]:
len(parsed_results)

19

In [19]:

doc_index = {d['id']: d for d in documents}

In [102]:
# doc_index

In [103]:
# Function to clean and normalize JSON data
def clean_and_normalize_data(data):
    if isinstance(data, dict) and 'questions' in data:
        return data['questions']
    elif isinstance(data, list):
        return data
    else:
        return []

# Process the cleaned data
final_results = []

for doc_id, questions in parsed_results.items():
    group = doc_index.get(doc_id, {}).get('group', 'Unknown')
    cleaned_questions = clean_and_normalize_data(questions)
    for q in cleaned_questions:
        final_results.append((q, group, doc_id))

# Print the final results
for result in final_results:
    # print(result)
    pass

In [104]:
import pandas as pd

In [105]:
df = pd.DataFrame(final_results, columns=['question', 'Group', 'document'])

In [106]:
df

Unnamed: 0,question,Group,document
0,Trong trường hợp môi trường có nhiều mối đe dọ...,Expert,0ebe745c
1,Tại sao PWO cần phải ưu tiên giải quyết mối đe...,Expert,0ebe745c
2,Biện pháp nào được đề xuất để đối phó với tên ...,Expert,0ebe745c
3,Việc phát hiện ngư lôi ở cự ly 4.000 yds tạo ...,Expert,0ebe745c
4,Những gì PWO nên làm để đối phó với ngư lôi đa...,Expert,0ebe745c
...,...,...,...
80,Để hợp tác hiệu quả với nhà tổ chức sự kiện và...,Expert,10833546
81,"Trong quá trình cộng tác, bạn cần thảo luận về...",Expert,10833546
82,"Kế hoạch trước khi tham gia sự kiện, bạn nên l...",Expert,10833546
83,Bạn có nên phối hợp với các nhiếp ảnh gia và n...,Expert,10833546


In [110]:
with open('../data/vietnamese_rag/ground_truth_data/ground_truth204.pkl', 'rb') as file:
    results = pickle.load(file)

In [115]:
parsed_results = {}
for doc_id, json_questions in results.items():
    try:
        parsed_questions = json.loads(json_questions)
        parsed_results[doc_id] = parsed_questions
        # print(parsed_questions)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for document {doc_id}: {e}")
        # json_questions = json_questions.replace("\n", "\\n")
        # print(json_questions)

Error decoding JSON for document 1a090ecc: Expecting ',' delimiter: line 3 column 22 (char 106)
Error decoding JSON for document 95a7e364: Expecting ',' delimiter: line 1 column 299 (char 298)
Error decoding JSON for document 3ee1a8e5: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 913280cb: Expecting value: line 1 column 1 (char 0)


In [116]:
# Function to clean and normalize JSON data
def clean_and_normalize_data(data):
    if isinstance(data, dict) and 'questions' in data:
        return data['questions']
    elif isinstance(data, list):
        return data
    else:
        return []

# Process the cleaned data
final_results = []

for doc_id, questions in parsed_results.items():
    group = doc_index.get(doc_id, {}).get('group', 'Unknown')
    cleaned_questions = clean_and_normalize_data(questions)
    for q in cleaned_questions:
        final_results.append((q, group, doc_id))

# Print the final results
for result in final_results:
    # print(result)
    pass

In [117]:
df1 = pd.DataFrame(final_results, columns=['question', 'Group', 'document'])

In [118]:
df1

Unnamed: 0,question,Group,document
0,kích thước trung bình của một đàn kiến ​​Form...,Expert,1cfe0283
1,Formica rufa thich nghi với những điều kiện mô...,Expert,1cfe0283
2,Tại sao không có công thức tính toán kích thướ...,Expert,1cfe0283
3,Các yếu tố nào có thể ảnh hưởng đến kích thước...,Expert,1cfe0283
4,"Ngoài số lượng ong chúa, yếu tố nào khác có th...",Expert,1cfe0283
...,...,...,...
105,Theo Quy định quốc tế về ngăn ngừa va chạm trê...,Expert,ed2f5cc9
106,"Trong tình huống vượt bến hẹp, tàu nào phải nh...",Expert,ed2f5cc9
107,Khi tàu bạn ở góc 30 độ trên mũi tàu bên trái ...,Expert,ed2f5cc9
108,Ngành nào sẽ phát ban hành hành động phù hợp k...,Expert,ed2f5cc9


In [125]:
df1['document'].value_counts()

document
1cfe0283    5
97c28872    5
b4f78202    5
cbe124b0    5
c1e10600    5
448bbc4b    5
2b8110f8    5
a1ec240e    5
8ce8ccc0    5
3a454b93    5
690899f9    5
5b72e815    5
172b4332    5
bad5b7b8    5
b331b5fb    5
29cb7829    5
3e66f963    5
d44f62a9    5
a06dbd01    5
d86f5b4e    5
d513d21f    5
ed2f5cc9    5
Name: count, dtype: int64

In [120]:
len(df1)

110

In [122]:
len(parsed_results)

26

In [42]:
def process_ground_truth_file(file_path, doc_index):
    with open(file_path, 'rb') as file:
        results = pickle.load(file)
    
    falsed_json_doc_id = []
    parsed_results = {}

    for doc_id, json_questions in results.items():
        try:
            parsed_questions = json.loads(json_questions)
            parsed_results[doc_id] = parsed_questions
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for document {doc_id}: {e}")
            falsed_json_doc_id.append(doc_id)
            cleaned_json_questions = clean_and_normalize_json_string(json_questions)
            try:
                parsed_questions = json.loads(cleaned_json_questions)
                parsed_results[doc_id] = parsed_questions
            except json.JSONDecodeError as e:
                print(f"Error decoding cleaned JSON for document {doc_id}: {e}")

    final_results = []

    for doc_id, questions in parsed_results.items():
        group = doc_index.get(doc_id, {}).get('group', 'Unknown')
        cleaned_questions = clean_and_normalize_data(questions)
        for q in cleaned_questions:
            final_results.append((q, group, doc_id))

    return final_results, falsed_json_doc_id

def process_all_ground_truth_files(directory, doc_index):
    all_final_results = []
    all_failed_docs = []

    for i in range(1, 205):
        file_path = os.path.join(directory, f'ground_truth{i}.pkl')
        if os.path.exists(file_path):
            final_results, failed_docs = process_ground_truth_file(file_path, doc_index)
            all_final_results.extend(final_results)
            all_failed_docs.extend(failed_docs)
        else:
            print(f"File {file_path} does not exist.")

    df = pd.DataFrame(all_final_results, columns=['question', 'Group', 'document'])
    return df, all_failed_docs

In [43]:
directory = '../data/vietnamese_rag/ground_truth_data'
df, failed_docs = process_all_ground_truth_files(directory, doc_index)

Error decoding JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 2209b484: Invalid control character at: line 5 column 21 (char 312)
Error decoding JSON for document 16f38e34: Expecting ',' delimiter: line 2 column 1 (char 372)
Error decoding cleaned JSON for document 16f38e34: Expecting ',' delimiter: line 1 column 372 (char 371)
Error decoding JSON for document ed30bad1: Expecting ',' delimiter: line 1 column 62 (char 61)
Error decoding cleaned JSON for document ed30bad1: Expecting ',' delimiter: line 1 column 62 (char 61)
Error decoding JSON for document 4cd67d7a: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 4cd67d7a: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 6434830c: Expecting value: line 7 column 1 (char 466)
Error decoding cleaned JSON for document 6434830c: Expecti

In [44]:
df

Unnamed: 0,question,Group,document
0,Minh Tú đã gặp khó khăn gì trong thử thách đi ...,General,75fafd29
1,Điểm đến nào là thử thách khó khăn nhất đối vớ...,General,75fafd29
2,Vị trí của Minh Tú trong đêm chung kết Asia's ...,General,75fafd29
3,Minh Tú đã thể hiện kỹ năng gì khi thực hiện t...,General,75fafd29
4,Những thử thách nào đã giúp Minh Tú gặt hái th...,General,75fafd29
...,...,...,...
27825,Theo Quy định quốc tế về ngăn ngừa va chạm trê...,Expert,ed2f5cc9
27826,"Trong tình huống vượt bến hẹp, tàu nào phải nh...",Expert,ed2f5cc9
27827,Khi tàu bạn ở góc 30 độ trên mũi tàu bên trái ...,Expert,ed2f5cc9
27828,Ngành nào sẽ phát ban hành hành động phù hợp k...,Expert,ed2f5cc9


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27830 entries, 0 to 27829
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  27830 non-null  object
 1   Group     27830 non-null  object
 2   document  27830 non-null  object
dtypes: object(3)
memory usage: 652.4+ KB


In [45]:
len(failed_docs)

290

In [151]:
documents6 = documents.copy()

In [152]:
len(documents6)

6089

In [153]:
documents7 = []
for doc in documents6:
    if doc['id'] in failed_docs:
        documents7.append(doc)
        

In [155]:
len(documents7)

290

In [161]:
with open('../data/vietnamese_rag/ground_truth_failed_data/documents_failed.json', 'wt') as file:
    json.dump(documents7 , file, indent=2)

In [33]:
with open('../data/vietnamese_rag/ground_truth_failed_data/documents_failed.json', 'rt') as f_in:
    documents7 = json.load(f_in)

In [21]:
chunk_size = 29
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents7) // chunk_size)  # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents7[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

100%|██████████████████████| 29/29 [00:03<00:00,  8.34it/s]


Chunk 0 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed1.pkl


100%|██████████████████████| 29/29 [00:31<00:00,  1.08s/it]


Chunk 1 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed2.pkl


100%|██████████████████████| 29/29 [00:41<00:00,  1.44s/it]


Chunk 2 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed3.pkl


100%|██████████████████████| 29/29 [00:28<00:00,  1.01it/s]


Chunk 3 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed4.pkl


100%|██████████████████████| 29/29 [00:40<00:00,  1.39s/it]


Chunk 4 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed5.pkl


100%|██████████████████████| 29/29 [00:35<00:00,  1.22s/it]


Chunk 5 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed6.pkl


100%|██████████████████████| 29/29 [00:34<00:00,  1.18s/it]


Chunk 6 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed7.pkl


100%|██████████████████████| 29/29 [00:35<00:00,  1.22s/it]


Chunk 7 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed8.pkl


100%|██████████████████████| 29/29 [00:31<00:00,  1.08s/it]


Chunk 8 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed9.pkl


100%|██████████████████████| 29/29 [00:34<00:00,  1.18s/it]


Chunk 9 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data/ground_truth_failed10.pkl


 80%|██████████████████▍    | 8/10 [00:01<00:00,  6.71it/s]

In [46]:
import re
import pandas as pd
def clean_and_normalize_json_string(json_string):
    # Remove unwanted characters and control characters
    json_string = re.sub(r'[\n\r\t]', '', json_string)  # Remove newlines, carriage returns, and tabs
    json_string = re.sub(r'\\', '', json_string)  # Remove backslashes
    json_string = re.sub(r'“|”', '"', json_string)  # Replace fancy quotes with standard quotes
    return json_string
def clean_and_normalize_data(data):
    if isinstance(data, dict) and 'questions' in data:
        return data['questions']
    elif isinstance(data, list):
        return data
    else:
        return []
def process_ground_truth_file(file_path, doc_index):
    with open(file_path, 'rb') as file:
        results = pickle.load(file)
    
    falsed_json_doc_id = []
    parsed_results = {}

    for doc_id, json_questions in results.items():
        try:
            parsed_questions = json.loads(json_questions)
            parsed_results[doc_id] = parsed_questions
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for document {doc_id}: {e}")
            falsed_json_doc_id.append(doc_id)
            cleaned_json_questions = clean_and_normalize_json_string(json_questions)
            try:
                parsed_questions = json.loads(cleaned_json_questions)
                parsed_results[doc_id] = parsed_questions
            except json.JSONDecodeError as e:
                print(f"Error decoding cleaned JSON for document {doc_id}: {e}")

    final_results = []

    for doc_id, questions in parsed_results.items():
        group = doc_index.get(doc_id, {}).get('group', 'Unknown')
        cleaned_questions = clean_and_normalize_data(questions)
        for q in cleaned_questions:
            final_results.append((q, group, doc_id))

    return final_results, falsed_json_doc_id

def process_all_ground_truth_files(directory, doc_index):
    all_final_results = []
    all_failed_docs = []

    for i in range(1, 205):
        file_path = os.path.join(directory, f'ground_truth_failed{i}.pkl')
        if os.path.exists(file_path):
            final_results, failed_docs = process_ground_truth_file(file_path, doc_index)
            all_final_results.extend(final_results)
            all_failed_docs.extend(failed_docs)
        else:
            print(f"File {file_path} does not exist.")

    df = pd.DataFrame(all_final_results, columns=['question', 'Group', 'document'])
    return df, all_failed_docs

In [47]:
directory = '../data/vietnamese_rag/ground_truth_failed_data'
df1, failed_docs = process_all_ground_truth_files(directory, doc_index)

Error decoding JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 2209b484: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 2209b484: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document e57a25cb: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document e57a25cb: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document e1ed1ea4: Expecting ',' delimiter: line 1 column 15 (char 14)
Error decoding cleaned JSON for document e1ed1ea4: Expecting ',' delimiter: line 1 column 15 (char 14)
Error decoding JSON for document 231b9d80: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 231b9d80: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 2aa81fa8: Expecting value: line 1 column 1 (char 0)
Error

In [48]:
len(failed_docs)

36

In [36]:
df1

Unnamed: 0,question,Group,document


In [44]:
documents8 = documents7.copy()

In [45]:
documents9 = []
for doc in documents8:
    if doc['id'] in failed_docs:
        documents9.append(doc)
        

In [46]:
with open('../data/vietnamese_rag/ground_truth_failed_data2/documents_failed.json', 'wt') as file:
    json.dump(documents9 , file, indent=2)

In [38]:
with open('../data/vietnamese_rag/ground_truth_failed_data2/documents_failed.json', 'rt') as f_in:
    documents10 = json.load(f_in)

In [48]:
chunk_size = 36
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents10) // chunk_size)  # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents10[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_failed_data2/ground_truth_failed{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

In [49]:
import re
import pandas as pd
def clean_and_normalize_json_string(json_string):
    # Remove unwanted characters and control characters
    json_string = re.sub(r'[\n\r\t]', '', json_string)  # Remove newlines, carriage returns, and tabs
    json_string = re.sub(r'\\', '', json_string)  # Remove backslashes
    json_string = re.sub(r'“|”', '"', json_string)  # Replace fancy quotes with standard quotes
    return json_string
def clean_and_normalize_data(data):
    if isinstance(data, dict) and 'questions' in data:
        return data['questions']
    elif isinstance(data, list):
        return data
    else:
        return []
def process_ground_truth_file(file_path, doc_index):
    with open(file_path, 'rb') as file:
        results = pickle.load(file)
    
    falsed_json_doc_id = []
    parsed_results = {}

    for doc_id, json_questions in results.items():
        try:
            parsed_questions = json.loads(json_questions)
            parsed_results[doc_id] = parsed_questions
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for document {doc_id}: {e}")
            falsed_json_doc_id.append(doc_id)
            cleaned_json_questions = clean_and_normalize_json_string(json_questions)
            try:
                parsed_questions = json.loads(cleaned_json_questions)
                parsed_results[doc_id] = parsed_questions
            except json.JSONDecodeError as e:
                print(f"Error decoding cleaned JSON for document {doc_id}: {e}")

    final_results = []

    for doc_id, questions in parsed_results.items():
        group = doc_index.get(doc_id, {}).get('group', 'Unknown')
        cleaned_questions = clean_and_normalize_data(questions)
        for q in cleaned_questions:
            final_results.append((q, group, doc_id))

    return final_results, falsed_json_doc_id

def process_all_ground_truth_files(directory, doc_index):
    all_final_results = []
    all_failed_docs = []

    for i in range(1, 205):
        file_path = os.path.join(directory, f'ground_truth_failed{i}.pkl')
        if os.path.exists(file_path):
            final_results, failed_docs = process_ground_truth_file(file_path, doc_index)
            all_final_results.extend(final_results)
            all_failed_docs.extend(failed_docs)
        else:
            print(f"File {file_path} does not exist.")

    df = pd.DataFrame(all_final_results, columns=['question', 'Group', 'document'])
    return df, all_failed_docs

In [50]:
directory = '../data/vietnamese_rag/ground_truth_failed_data2'
df2, failed_docs = process_all_ground_truth_files(directory, doc_index)

Error decoding JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document efda5ff9: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document e1ed1ea4: Expecting ',' delimiter: line 5 column 8 (char 310)
Error decoding cleaned JSON for document e1ed1ea4: Expecting ',' delimiter: line 1 column 307 (char 306)
Error decoding JSON for document 8687b41f: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 8687b41f: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document b400adb9: Expecting ',' delimiter: line 3 column 31 (char 148)
Error decoding cleaned JSON for document b400adb9: Expecting ',' delimiter: line 1 column 147 (char 146)
Error decoding JSON for document d6c6ecb3: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document d6c6ecb3: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document dcc0b949: Expecting ',' delimite

In [51]:
len(failed_docs)

10

In [20]:
# documents11 = []
# for doc in documents10:
#     if doc['id'] in failed_docs:
#         documents11.append(doc)
# with open('../data/vietnamese_rag/ground_truth_failed_data3/documents_failed.json', 'wt') as file:
#     json.dump(documents11 , file, indent=2)
with open('../data/vietnamese_rag/ground_truth_failed_data3/documents_failed.json', 'rt') as f_in:
    documents12 = json.load(f_in)
chunk_size = 10
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents12) // chunk_size)  # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents12[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_failed_data3/ground_truth_failed{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

100%|██████████████████████████| 10/10 [00:01<00:00,  7.33it/s]


Chunk 0 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data3/ground_truth_failed1.pkl


In [52]:
directory = '../data/vietnamese_rag/ground_truth_failed_data3'
df3, failed_docs = process_all_ground_truth_files(directory, doc_index)

Error decoding JSON for document e1ed1ea4: Invalid control character at: line 1 column 325 (char 324)
Error decoding cleaned JSON for document e1ed1ea4: Unterminated string starting at: line 1 column 70 (char 69)
Error decoding JSON for document 0a639f21: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 0a639f21: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 56f1d0c8: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 56f1d0c8: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for document 4b955266: Expecting ',' delimiter: line 1 column 27 (char 26)
Error decoding cleaned JSON for document 4b955266: Expecting ',' delimiter: line 1 column 27 (char 26)
Error decoding JSON for document 3424690a: Expecting value: line 1 column 1 (char 0)
Error decoding cleaned JSON for document 3424690a: Expecting value: line 1 column 1 (char 0)
File ../data/vietnamese_rag/ground_truth_failed_data3/g

In [53]:
len(failed_docs)

5

In [25]:
documents13 = []
for doc in documents12:
    if doc['id'] in failed_docs:
        documents13.append(doc)
with open('../data/vietnamese_rag/ground_truth_failed_data4/documents_failed.json', 'wt') as file:
    json.dump(documents13 , file, indent=2)
with open('../data/vietnamese_rag/ground_truth_failed_data4/documents_failed.json', 'rt') as f_in:
    documents14 = json.load(f_in)
chunk_size = 5
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents14) // chunk_size)  # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = {}
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents14[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_document)

    # Store the results incrementally
    for result in processed_results:
        if result is not None:
            doc_id, questions = result
            results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed{i + 1}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    time.sleep(30)

100%|████████████████████████████| 5/5 [00:00<00:00,  7.32it/s]


Chunk 0 processed and saved to ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed1.pkl


In [54]:
directory = '../data/vietnamese_rag/ground_truth_failed_data4'
df4, failed_docs = process_all_ground_truth_files(directory, doc_index)

File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed2.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed3.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed4.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed5.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed6.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed7.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed8.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed9.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed10.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_data4/ground_truth_failed11.pkl does not exist.
File ../data/vietnamese_rag/ground_truth_failed_

In [55]:
len(failed_docs)

0

In [56]:
df4

Unnamed: 0,question,Group,document
0,Phim Thái Bản 'Vì sao đưa anh tới' sẽ có những...,General,e1ed1ea4
1,Có những thay đổi nào trong nội dung phim để p...,General,e1ed1ea4
2,Văn hóa và lịch sử của Thái Lan được thể hiện ...,General,e1ed1ea4
3,Tại sao phim ' Vì sao đưa anh tới' bản Thái lạ...,General,e1ed1ea4
4,Sự khác biệt về văn hóa giữa Hàn Quốc và Thái ...,General,e1ed1ea4
5,"Để tránh hiệu ứng lô hay lứa trong thí nghiệm,...",Expert,0a639f21
6,Mỗi đợt nhân giống có bao nhiêu con chuột có t...,Expert,0a639f21
7,Có bao nhiêu nhóm thí nghiệm trong thí nghiệm ...,Expert,0a639f21
8,Vì sao việc phân ngẫu nhiên chuột quan trọng t...,Expert,0a639f21
9,Khi nào sẽ đạt được tổng số 20 con chuột trong...,Expert,0a639f21


In [57]:
# Assuming df, df1, df2, df3, and df4 are your DataFrames
ground_truth_data = pd.concat([df, df1, df2, df3, df4], axis=0, ignore_index=True)


In [58]:
ground_truth_data

Unnamed: 0,question,Group,document
0,Minh Tú đã gặp khó khăn gì trong thử thách đi ...,General,75fafd29
1,Điểm đến nào là thử thách khó khăn nhất đối vớ...,General,75fafd29
2,Vị trí của Minh Tú trong đêm chung kết Asia's ...,General,75fafd29
3,Minh Tú đã thể hiện kỹ năng gì khi thực hiện t...,General,75fafd29
4,Những thử thách nào đã giúp Minh Tú gặt hái th...,General,75fafd29
...,...,...,...
29210,Những khó khăn nào mà vận động viên thường gặp...,Expert,3424690a
29211,Tâm lý học thể thao có những kỹ thuật nào cụ t...,Expert,3424690a
29212,Tại sao không có một kỹ thuật duy nhất được xe...,Expert,3424690a
29213,Thiết lập mục tiêu là một kỹ thuật gì trong tâ...,Expert,3424690a


In [60]:
ground_truth_data.to_csv('../data/vietnamese_rag/ground_truth_data/ground_truth_data.csv', index = False)