In [3]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from elasticsearch import Elasticsearch
import pandas as pd
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import HTTPError

import time

In [4]:
# Part 1 of the corpus. Kept as its own module-level list because
# process_documents() reads `documents1` by name.
# The encoding is explicit: the file holds Vietnamese text, and without it
# open() would decode with the platform's locale encoding.
with open('../data/vietnamese_rag/documents-with-ids1.json', 'rt', encoding='utf-8') as f_in:
    documents1 = json.load(f_in)

In [5]:
# Part 2 of the corpus. Kept as its own module-level list because
# process_documents() reads `documents2` by name.
# The encoding is explicit: the file holds Vietnamese text, and without it
# open() would decode with the platform's locale encoding.
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt', encoding='utf-8') as f_in:
    documents2 = json.load(f_in)

In [6]:
# Part 3 of the corpus. Kept as its own module-level list because
# process_documents() reads `documents3` by name.
# The encoding is explicit: the file holds Vietnamese text, and without it
# open() would decode with the platform's locale encoding.
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt', encoding='utf-8') as f_in:
    documents3 = json.load(f_in)

In [7]:
# Part 4 of the corpus. Kept as its own module-level list because
# process_documents() reads `documents4` by name.
# The encoding is explicit: the file holds Vietnamese text, and without it
# open() would decode with the platform's locale encoding.
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt', encoding='utf-8') as f_in:
    documents4 = json.load(f_in)

In [8]:
# Part 5 of the corpus. Kept as its own module-level list because
# process_documents() reads `documents5` by name.
# The encoding is explicit: the file holds Vietnamese text, and without it
# open() would decode with the platform's locale encoding.
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt', encoding='utf-8') as f_in:
    documents5 = json.load(f_in)

In [9]:
def load_documents(base_path, num_files):
    """Load and concatenate the numbered document part files.

    Reads ``{base_path}/documents-with-ids{i}.json`` for ``i`` from 1 to
    ``num_files`` (inclusive) and extends one list with the contents of each
    file, in file order.

    Args:
        base_path: Directory containing the part files.
        num_files: Number of part files to read; 0 yields an empty list.

    Returns:
        A single list with the items of every part file.

    Raises:
        FileNotFoundError: If one of the expected part files is missing.
        json.JSONDecodeError: If a part file is not valid JSON.
    """
    documents = []
    for i in range(1, num_files + 1):
        file_path = f'{base_path}/documents-with-ids{i}.json'
        # Explicit UTF-8: without it open() uses the platform's locale
        # encoding, which breaks on non-ASCII (Vietnamese) text when that
        # locale is not UTF-8.
        with open(file_path, 'rt', encoding='utf-8') as f_in:
            documents.extend(json.load(f_in))
    return documents
# Location and number of the `documents-with-ids{N}.json` part files.
base_path = '../data/vietnamese_rag'
num_files = 5
# All parts concatenated into one list, in file order.
documents = load_documents(base_path, num_files)
# Evaluation set; later cells read the `question`, `Group` and `document` fields of each row.
df_ground_truth = pd.read_csv('../data/vietnamese_rag/ground_truth_data/ground_truth_data.csv')

# One dict per CSV row.
ground_truth = df_ground_truth.to_dict(orient='records')
# Lookup from document id to the full document record (used by process_record).
doc_idx = {d['id']: d for d in documents}
# Sentence-embedding model used to encode questions for the kNN search.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")




In [10]:
# Client for the Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: a single shard without replicas, the document's text and
# keyword fields, and four 384-dimensional dense-vector fields indexed for
# cosine-similarity kNN search.
# NOTE(review): 384 presumably matches the embedding size of the
# sentence-transformer loaded as `model` — confirm.
# NOTE(review): in this notebook process_documents() only attaches
# `question_context_answer_vector`; the other vector fields are filled only if
# the source JSON already contains them — confirm.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "group": {"type": "keyword"},
            "context": {"type": "text"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "id": {"type": "keyword"},
            "context_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_context_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "vietnamese-questions"

# Recreate the index from scratch on every run: drop it if present
# (`ignore_unavailable=True` is passed so a missing index is not an error),
# then create it with the settings above. Any previously indexed data is lost.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vietnamese-questions'})

In [11]:
def load_vectors(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed before returning.
    Unpickling can execute arbitrary code, so only use this on files you
    produced yourself.
    """
    with open(file_path, mode='rb') as pickled:
        payload = pickle.load(pickled)
    return payload

def process_documents(documents, index_name, es_client):
    """Attach the precomputed vectors to every document and index them.

    For each of the five document parts (module-level lists ``documents1`` ..
    ``documents5``) the matching pickle file
    ``question_context_answer_vector{N}.pkl`` is loaded, and the vector at
    position ``j`` is paired with the document at position ``j``. The
    enriched documents are then sent to Elasticsearch one by one.

    Each enriched document is a new dict, so the module-level document lists
    are left untouched (a shallow ``list.copy()`` would still share, and
    therefore mutate, the original dicts).

    Args:
        documents: Unused. Kept for backward compatibility with existing
            callers; the parts are read from ``documents1`` .. ``documents5``.
        index_name: Name of the Elasticsearch index to write to.
        es_client: Client object exposing ``index(index=..., document=...)``.

    Raises:
        ValueError: If a vector file holds fewer entries than its document
            part, which would leave documents without a vector.
    """
    vector_field = 'question_context_answer_vector'
    # Module-level lists, one per source file, in file order.
    document_parts = (documents1, documents2, documents3, documents4, documents5)

    full_documents = []
    for part_no, part in enumerate(document_parts, start=1):
        # NOTE(review): pickle files are trusted input here; do not point this
        # at files from an untrusted source.
        vectors = load_vectors(f'../data/vietnamese_rag/question_context_answer_vector_pickle/question_context_answer_vector{part_no}.pkl')
        if len(vectors) < len(part):
            raise ValueError(
                f'vector file {part_no} has {len(vectors)} entries '
                f'but documents{part_no} has {len(part)} documents'
            )
        # Documents and vectors are aligned by position; surplus vectors are ignored.
        for doc, vector_record in zip(part, vectors):
            full_documents.append({**doc, vector_field: vector_record[vector_field]})

    for doc in tqdm(full_documents):
        es_client.index(index=index_name, document=doc)
# Index every document, together with its precomputed vector, into `index_name`.
process_documents(documents, index_name, es_client)

100%|█████████| 6089/6089 [02:20<00:00, 43.46it/s]


In [12]:
def elastic_search_knn(field, vector, group):
    """Run a filtered kNN search and return the source of each hit.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query vector.
        group: Only documents whose ``group`` equals this value are considered.

    Returns:
        A list with the ``_source`` of each hit (fields ``group``,
        ``context``, ``question``, ``answer`` and ``id``), in the order
        Elasticsearch returned them. The query asks for ``k = 5`` neighbours.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": {"term": {"group": group}},
        },
        "_source": ["group", "context", "question", "answer", "id"],
    }

    # Uses the module-level client and index name.
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]
def question_context_answer_vector_knn(q):
    """Retrieve the nearest documents for one ground-truth record.

    The record's ``question`` is embedded with the module-level ``model`` and
    searched against the ``question_context_answer_vector`` field, restricted
    to the record's ``Group``.

    Args:
        q: Mapping with at least the keys ``question`` and ``Group``.

    Returns:
        Whatever ``elastic_search_knn`` returns for that query.
    """
    question_text, group_name = q['question'], q['Group']
    query_embedding = model.encode(question_text)
    return elastic_search_knn('question_context_answer_vector', query_embedding, group_name)
def build_prompt(query, search_results):
    """Build the LLM prompt for a question and its retrieved documents.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts with the keys ``group``,
            ``question``, ``answer`` and ``context``.

    Returns:
        The instruction text, the question, and one block per retrieved
        document, with surrounding whitespace stripped. Each document's
        ``context`` is cut to its first 1000 characters.
    """
    # Same text as a stripped triple-quoted template, spelled out line by line.
    prompt_template = (
        "You're an assistant working in customer service. Your job is to provide answers to users' questions. Answer the QUESTION based on the CONTEXT from the documents database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION. Provide answer in Vietnamese , in normal text form, not using any markdown form, no need to rewrite the question and make sure that is an answer, not listing questions. Also make sure that the answer provides most information from the CONTEXT as possible .\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    context = "".join(
        f"group: {doc['group']}\nquestion: {doc['question']}\nanswer: {doc['answer']}\ncontext: {doc['context'][:1000]}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()
# Groq API client used by llm(); the key is read from the GROQ_API_KEY6
# environment variable, so this line raises KeyError when it is not set.
client =  Groq(api_key = os.environ['GROQ_API_KEY6'])
def _retry_delay_from_429(error, fallback):
    """Return the wait (seconds) announced in a 429 error body, or ``fallback``.

    The delay is parsed from the text after the last ``'in '`` up to the next
    ``'s'`` in ``error.response.json()['error']['message']``. Any body that
    does not have that shape yields ``fallback`` instead of raising.
    """
    try:
        message = error.response.json()['error']['message']
        return float(message.split('in ')[-1].split('s')[0])
    except (AttributeError, KeyError, TypeError, ValueError):
        return fallback


def llm(prompt, model = 'mixtral-8x7b-32768'):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` and is attempted up
    to 5 times.

    Changes from the previous version:
      * ``model`` is now actually sent to the API. It used to be ignored in
        favour of a hard-coded ``'llama3-8b-8192'``, so existing callers that
        pass ``'mixtral-8x7b-32768'`` now really get that model.
      * When every attempt fails, the last error is raised. Previously, five
        consecutive 429 responses made the function return ``None``.
      * An unparseable 429 body or a missing ``response`` no longer raises a
        secondary error from inside the handler.

    Args:
        prompt: Text of the user message.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message.

    Raises:
        HTTPError: Immediately for any status other than 429, or after the
            last attempt for 429.
        Exception: Any other error, re-raised after the last attempt.

    NOTE(review): the Groq SDK presumably raises its own exception types
    rather than ``requests``' ``HTTPError``; if so, rate limits take the
    generic exponential-backoff path below — confirm.
    """
    retries = 5
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        except HTTPError as e:
            http_response = getattr(e, 'response', None)
            if http_response is None or http_response.status_code != 429:
                raise
            # Rate limited: wait as long as the server asks, else back off.
            last_error = e
            delay = _retry_delay_from_429(e, fallback=2 ** attempt)
        except Exception as e:
            last_error = e
            delay = 2 ** attempt  # Exponential backoff

        # No point sleeping after the final attempt.
        if attempt < retries - 1:
            time.sleep(delay)

    raise last_error
# previously: rag(query: str) -> str
def rag(query: dict, model='mixtral-8x7b-32768') -> str:
    """Answer one ground-truth record with retrieval-augmented generation.

    Retrieves documents for the record, builds a prompt from the record's
    ``question`` and those documents, and returns what ``llm`` produces for
    that prompt.

    Args:
        query: Record with at least ``question``, plus whatever
            ``question_context_answer_vector_knn`` reads from it.
        model: Model name handed to ``llm``.
    """
    retrieved = question_context_answer_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved), model=model)
# Shared worker pool handed to map_progress(); at most 6 tasks run at once.
pool = ThreadPoolExecutor(max_workers=6)
def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool``, showing progress.

    All tasks are submitted up front; the progress bar advances whenever a
    task finishes, in whatever order that happens.

    Args:
        pool: Executor providing ``submit``.
        seq: Sized sequence of inputs.
        f: Callable taking one element.

    Returns:
        The results as a list, in the order of ``seq``. An exception raised
        by ``f`` propagates when that element's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_tick)
            pending.append(task)

        # Collect in submission order so results line up with the inputs.
        return [task.result() for task in pending]

def process_record(rec):
    """Generate an answer for one ground-truth record and pair it with the reference.

    The answer is produced first via ``rag`` (passing the model name
    ``'mixtral-8x7b-32768'``); only then is the reference answer looked up in
    ``doc_idx`` through the record's ``document`` id.

    Args:
        rec: Record with the keys ``question``, ``Group`` and ``document``.

    Returns:
        Dict with ``answer_llm``, ``answer_orig``, ``document``, ``question``
        and ``group``.

    Raises:
        KeyError: If the record's document id is not in ``doc_idx``.
    """
    llm_model = 'mixtral-8x7b-32768'
    generated = rag(rec, model = llm_model)

    source_id = rec['document']
    reference = doc_idx[source_id]['answer']

    return dict(
        answer_llm=generated,
        answer_orig=reference,
        document=source_id,
        question=rec['question'],
        group=rec['Group'],
    )


In [None]:
# documents_current = ground_truth[-1215*5:-1215*4]

In [17]:
# Work on one 1215-record slice of the ground truth: the 4th such slice counting from the end.
documents_current = ground_truth[-1215*4:-1215*3]

In [16]:
# Dry run: print the output file number and record range of every chunk
# without calling the LLM.
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# Number of the first output file for this slice. The offsets noted in this
# notebook (163, 244, 325) grow by 81 = 1215 // 15 chunks per slice.
file_offset = 244
for i in range(start_chunk, end_chunk):
    chunk_start = i * chunk_size
    # The last chunk runs to the end of the slice, so a remainder of any size
    # is included (the previous `+ 1` only covered a remainder of one record).
    chunk_end = len(documents_current) if i == end_chunk - 1 else chunk_start + chunk_size
    print(i + file_offset, chunk_start, chunk_end)

244 0 15
245 15 30
246 30 45
247 45 60
248 60 75
249 75 90
250 90 105
251 105 120
252 120 135
253 135 150
254 150 165
255 165 180
256 180 195
257 195 210
258 210 225
259 225 240
260 240 255
261 255 270
262 270 285
263 285 300
264 300 315
265 315 330
266 330 345
267 345 360
268 360 375
269 375 390
270 390 405
271 405 420
272 420 435
273 435 450
274 450 465
275 465 480
276 480 495
277 495 510
278 510 525
279 525 540
280 540 555
281 555 570
282 570 585
283 585 600
284 600 615
285 615 630
286 630 645
287 645 660
288 660 675
289 675 690
290 690 705
291 705 720
292 720 735
293 735 750
294 750 765
295 765 780
296 780 795
297 795 810
298 810 825
299 825 840
300 840 855
301 855 870
302 870 885
303 885 900
304 900 915
305 915 930
306 930 945
307 945 960
308 960 975
309 975 990
310 990 1005
311 1005 1020
312 1020 1035
313 1035 1050
314 1050 1065
315 1065 1080
316 1080 1095
317 1095 1110
318 1110 1125
319 1125 1140
320 1140 1155
321 1155 1170
322 1170 1185
323 1185 1200
324 1200 1216


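## File-number offsets for consecutive 1215-record slices: 163, 244, 325

 ## Each slice adds +81 to the offset (1215 records / 15 per chunk = 81 chunks)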

In [19]:
# Generate answers for `documents_current` in chunks of 15 records and write
# each chunk to its own pickle file.
chunk_size = 15
start_chunk = 44 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# Number of the first output file for this slice; file numbers continue across
# slices (81 chunks per 1215-record slice).
file_offset = 244
for i in range(start_chunk, end_chunk):
    chunk_start = i * chunk_size
    # The last chunk runs to the end of the slice, so a remainder of any size
    # is included (the previous `+ 1` only covered a remainder of one record).
    chunk_end = len(documents_current) if i == end_chunk - 1 else chunk_start + chunk_size
    chunk = documents_current[chunk_start:chunk_end]

    # Run retrieval + generation for the whole chunk on the shared thread pool.
    results = map_progress(pool, chunk, process_record)

    # One file per chunk, so a later run can restart at `start_chunk` instead
    # of redoing finished chunks.
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer_last{i + file_offset}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    print(f"Chunk {i} processed and saved to {file_name}")

100%|███████████████████████████████████████████████████| 15/15 [00:28<00:00,  1.87s/it]


Chunk 44 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last288.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:22<00:00,  5.48s/it]


Chunk 45 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last289.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.44s/it]


Chunk 46 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last290.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:24<00:00,  5.65s/it]


Chunk 47 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last291.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:01<00:00,  8.11s/it]


Chunk 48 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last292.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:19<00:00,  5.27s/it]


Chunk 49 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last293.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.06s/it]


Chunk 50 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last294.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:45<00:00,  7.02s/it]


Chunk 51 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last295.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:31<00:00,  6.09s/it]


Chunk 52 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last296.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:16<00:00,  5.08s/it]


Chunk 53 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last297.pkl


100%|███████████████████████████████████████████████████| 15/15 [00:56<00:00,  3.79s/it]


Chunk 54 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last298.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:21<00:00,  5.44s/it]


Chunk 55 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last299.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:02<00:00,  8.20s/it]


Chunk 56 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last300.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.04s/it]


Chunk 57 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last301.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:03<00:00,  4.22s/it]


Chunk 58 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last302.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:40<00:00,  6.69s/it]


Chunk 59 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last303.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:32<00:00,  6.17s/it]


Chunk 60 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last304.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:14<00:00,  4.99s/it]


Chunk 61 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last305.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.23s/it]


Chunk 62 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last306.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.93s/it]


Chunk 63 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last307.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.39s/it]


Chunk 64 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last308.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.25s/it]


Chunk 65 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last309.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.73s/it]


Chunk 66 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last310.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.71s/it]


Chunk 67 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last311.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.47s/it]


Chunk 68 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last312.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:04<00:00,  4.28s/it]


Chunk 69 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last313.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:48<00:00,  7.20s/it]


Chunk 70 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last314.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:27<00:00,  5.82s/it]


Chunk 71 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last315.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:45<00:00,  7.02s/it]


Chunk 72 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last316.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:14<00:00,  4.99s/it]


Chunk 73 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last317.pkl


100%|███████████████████████████████████████████████████| 15/15 [00:57<00:00,  3.85s/it]


Chunk 74 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last318.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:58<00:00,  7.93s/it]


Chunk 75 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last319.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:07<00:00,  4.53s/it]


Chunk 76 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last320.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.56s/it]


Chunk 77 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last321.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:44<00:00,  6.94s/it]


Chunk 78 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last322.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.69s/it]


Chunk 79 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last323.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:14<00:00,  5.00s/it]

Chunk 80 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last324.pkl





In [20]:
# Switch to another 1215-record slice of the ground truth: the 8th such slice counting from the end.
documents_current = ground_truth[-1215*8:-1215*7]

In [None]:
# Generate answers for `documents_current` in chunks of 15 records and write
# each chunk to its own pickle file.
chunk_size = 15
start_chunk = 36 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# Number of the first output file for this slice; file numbers continue across
# slices (81 chunks per 1215-record slice).
file_offset = 568
for i in range(start_chunk, end_chunk):
    chunk_start = i * chunk_size
    # The last chunk runs to the end of the slice, so a remainder of any size
    # is included (the previous `+ 1` only covered a remainder of one record).
    chunk_end = len(documents_current) if i == end_chunk - 1 else chunk_start + chunk_size
    chunk = documents_current[chunk_start:chunk_end]

    # Run retrieval + generation for the whole chunk on the shared thread pool.
    results = map_progress(pool, chunk, process_record)

    # One file per chunk, so a later run can restart at `start_chunk` instead
    # of redoing finished chunks.
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer_last{i + file_offset}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    print(f"Chunk {i} processed and saved to {file_name}")

100%|███████████████████████████████████████████████████| 15/15 [01:41<00:00,  6.77s/it]


Chunk 36 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last604.pkl


100%|███████████████████████████████████████████████████| 15/15 [00:58<00:00,  3.89s/it]


Chunk 37 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last605.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:18<00:00,  9.26s/it]


Chunk 38 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last606.pkl


  0%|                     | 0/15 [00:00<?, ?it/s]