In [1]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from elasticsearch import Elasticsearch
import pandas as pd
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import HTTPError

import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Shard 1 of the document corpus. Decode as UTF-8 explicitly so the Vietnamese
# text does not depend on the platform's default encoding.
with open('../data/vietnamese_rag/documents-with-ids1.json', 'rt', encoding='utf-8') as f_in:
    documents1 = json.load(f_in)

In [4]:
# Shard 2 of the document corpus. Decode as UTF-8 explicitly so the Vietnamese
# text does not depend on the platform's default encoding.
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt', encoding='utf-8') as f_in:
    documents2 = json.load(f_in)

In [5]:
# Shard 3 of the document corpus. Decode as UTF-8 explicitly so the Vietnamese
# text does not depend on the platform's default encoding.
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt', encoding='utf-8') as f_in:
    documents3 = json.load(f_in)

In [6]:
# Shard 4 of the document corpus. Decode as UTF-8 explicitly so the Vietnamese
# text does not depend on the platform's default encoding.
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt', encoding='utf-8') as f_in:
    documents4 = json.load(f_in)

In [7]:
# Shard 5 of the document corpus. Decode as UTF-8 explicitly so the Vietnamese
# text does not depend on the platform's default encoding.
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt', encoding='utf-8') as f_in:
    documents5 = json.load(f_in)

In [8]:
def load_documents(base_path, num_files, encoding='utf-8'):
    """Load and concatenate the numbered document shards found under ``base_path``.

    Reads ``documents-with-ids1.json`` .. ``documents-with-ids{num_files}.json``
    in ascending order; each file must contain a JSON list.

    Args:
        base_path: Directory that holds the shard files.
        num_files: Number of shards to read (shards are numbered from 1).
        encoding: Text encoding used to decode the files. Defaults to UTF-8 so
            that non-ASCII (Vietnamese) text is decoded the same way on every
            platform instead of relying on the locale's default codec.

    Returns:
        A single list with the items of every shard, in shard order.

    Raises:
        OSError: If a shard file cannot be opened.
        json.JSONDecodeError: If a shard is not valid JSON.
    """
    documents = []
    for i in range(1, num_files + 1):
        file_path = f'{base_path}/documents-with-ids{i}.json'
        with open(file_path, 'rt', encoding=encoding) as f_in:
            documents.extend(json.load(f_in))
    return documents
# Location of the corpus shards and how many of them to read.
base_path = '../data/vietnamese_rag'
num_files = 5
documents = load_documents(base_path, num_files)

# Ground-truth evaluation set, converted to one dict per CSV row.
df_ground_truth = pd.read_csv('../data/vietnamese_rag/ground_truth_data/ground_truth_data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

# Lookup table from document id to the full document record.
doc_idx = dict((d['id'], d) for d in documents)

# Sentence encoder used to embed incoming questions for the kNN search.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")




In [9]:
# Client for the Elasticsearch node running on localhost.
es_client = Elasticsearch('http://localhost:9200')

index_name = "vietnamese-questions"

# Single-shard, replica-free index. Besides the keyword/text metadata fields it
# declares four indexed dense-vector fields, all 384-dimensional with cosine
# similarity; the shared mapping is generated once per field name.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "group": {"type": "keyword"},
            "context": {"type": "text"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "context_vector",
                    "question_vector",
                    "answer_vector",
                    "question_context_answer_vector",
                )
            },
        }
    },
}

# Drop any existing copy of the index (no error if it is absent), then create it anew.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vietnamese-questions'})

In [10]:
def load_vectors(file_path):
    """Unpickle and return the object stored in ``file_path``.

    Note: unpickling executes code embedded in the file, so only point this at
    pickle files you produced yourself.
    """
    with open(file_path, mode='rb') as pickled:
        payload = pickle.load(pickled)
    return payload

def process_documents(documents, index_name, es_client, num_files=5):
    """Attach the precomputed combined vectors to ``documents`` and index them.

    The vectors are read with ``load_vectors`` from
    ``question_context_answer_vector{1..num_files}.pkl``; every entry of those
    pickled lists must expose a ``'question_context_answer_vector'`` key. The
    vectors of all shards are concatenated in shard order and paired
    positionally with ``documents``, so ``documents`` has to be in the same
    order (this is the order ``load_documents`` produces).

    Args:
        documents: Document dicts to index, in shard order. They are not
            modified; each one is copied before the vector is added.
        index_name: Name of the target Elasticsearch index.
        es_client: Elasticsearch client used for indexing.
        num_files: Number of vector shard files to read (numbered from 1).

    Raises:
        ValueError: If the number of vectors differs from the number of
            documents, which would make the positional pairing meaningless.
    """
    vector_dir = '../data/vietnamese_rag/question_context_answer_vector_pickle'

    # Collect one vector per document across all shard files.
    vectors = []
    for i in range(1, num_files + 1):
        shard_vectors = load_vectors(f'{vector_dir}/question_context_answer_vector{i}.pkl')
        vectors.extend(entry['question_context_answer_vector'] for entry in shard_vectors)

    # Fail before anything is written rather than index misaligned data.
    if len(vectors) != len(documents):
        raise ValueError(
            f'Expected one vector per document, got {len(vectors)} vectors '
            f'for {len(documents)} documents'
        )

    for doc, vector in tqdm(zip(documents, vectors), total=len(documents)):
        # Index a copy so the caller's document dicts stay free of the vector field.
        enriched = {**doc, 'question_context_answer_vector': vector}
        es_client.index(index=index_name, document=enriched)
# Populate the freshly created index with every document and its combined vector.
process_documents(documents=documents, index_name=index_name, es_client=es_client)

100%|█████████| 6089/6089 [02:18<00:00, 43.86it/s]


In [11]:
def elastic_search_knn(field, vector, group):
    """Run a group-filtered kNN search and return the matching documents.

    Uses the notebook-level ``es_client`` and ``index_name``.

    Args:
        field: Name of the vector field to search.
        vector: Query vector to compare against ``field``.
        group: Value the ``group`` field of a hit must match (term filter).

    Returns:
        A list with the ``_source`` of each hit (at most 5, since ``k`` is 5),
        restricted to the ``group``, ``context``, ``question``, ``answer`` and
        ``id`` fields.
    """
    response = es_client.search(
        index=index_name,
        body={
            "knn": {
                "field": field,
                "query_vector": vector,
                "k": 5,
                "num_candidates": 10000,
                "filter": {"term": {"group": group}},
            },
            "_source": ["group", "context", "question", "answer", "id"],
        },
    )
    return [hit['_source'] for hit in response['hits']['hits']]
def question_context_answer_vector_knn(q):
    """Retrieve documents for a ground-truth record via the combined vector field.

    Args:
        q: Mapping with a ``'question'`` text and a ``'Group'`` value.

    Returns:
        The result of ``elastic_search_knn`` on the
        ``question_context_answer_vector`` field, using the encoded question as
        the query vector and ``q['Group']`` as the group filter.
    """
    text, group_filter = q['question'], q['Group']
    query_vector = model.encode(text)
    return elastic_search_knn('question_context_answer_vector', query_vector, group_filter)
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question text.
        search_results: Iterable of dicts with ``group``, ``question``,
            ``answer`` and ``context`` keys; each ``context`` is cut to its
            first 1000 characters.

    Returns:
        The instruction text followed by the question and one block per
        retrieved document, with surrounding whitespace stripped.
    """
    # Spelled out line by line so the exact whitespace of the template is visible.
    prompt_template = (
        "You're an assistant working in customer service. Your job is to provide answers to users' questions. Answer the QUESTION based on the CONTEXT from the documents database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION. Provide answer in Vietnamese , in normal text form, not using any markdown form, no need to rewrite the question and make sure that is an answer, not listing questions. Also make sure that the answer provides most information from the CONTEXT as possible .\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    context = "".join(
        f"group: {doc['group']}\nquestion: {doc['question']}\nanswer: {doc['answer']}\ncontext: {doc['context'][:1000]}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()
# Groq API client used by llm(); the key is read from the GROQ_API_KEY5
# environment variable, so this line raises KeyError if it is not set.
client =  Groq(api_key = os.environ['GROQ_API_KEY5'])
def _retry_after_seconds(error, fallback):
    """Return the wait time quoted in a rate-limit error message, or ``fallback``.

    The delay is taken from the text after the last ``'in '`` up to the next
    ``'s'`` of ``error.response.json()['error']['message']``. Any failure to
    read or parse that message yields ``fallback`` instead of raising from
    inside the caller's exception handler.
    """
    try:
        message = error.response.json()['error']['message']
        return float(message.split('in ')[-1].split('s')[0])
    except (AttributeError, KeyError, TypeError, ValueError):
        return fallback


def llm(prompt, model='mixtral-8x7b-32768'):
    """Send ``prompt`` as a single user message and return the reply text.

    Makes up to 5 attempts. A 429 ``HTTPError`` waits for the delay quoted in
    the error message (falling back to exponential backoff when it cannot be
    parsed); any other exception waits ``2 ** attempt`` seconds.

    Args:
        prompt: Full prompt text.
        model: NOTE(review): currently ignored -- the request below always uses
            'llama3-8b-8192'. Callers pass 'mixtral-8x7b-32768', so honouring
            this argument would change which model the notebook evaluates;
            decide which model is intended before wiring it through.

    Returns:
        The content of the first choice of the chat completion.

    Raises:
        HTTPError: Immediately for non-429 HTTP errors, or for a 429 on the
            final attempt.
        Exception: The last error once every attempt has failed. The function
            never falls through and returns ``None``.
    """
    retries = 5
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = client.chat.completions.create(
                model='llama3-8b-8192',
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        except HTTPError as e:
            # NOTE(review): the Groq SDK appears to raise its own exception
            # types rather than requests' HTTPError, so this branch may never
            # fire -- confirm and catch the SDK's rate-limit error if so.
            rate_limited = e.response is not None and e.response.status_code == 429
            if not rate_limited or is_last_attempt:
                raise
            time.sleep(_retry_after_seconds(e, fallback=2 ** attempt))
        except Exception:
            if is_last_attempt:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff
# previously: rag(query: str) -> str
def rag(query: dict, model='mixtral-8x7b-32768') -> str:
    """Answer a ground-truth record: retrieve, build the prompt, ask the LLM.

    Args:
        query: Record with ``'question'`` and ``'Group'`` keys.
        model: Model name forwarded to ``llm``.

    Returns:
        The answer text returned by ``llm``.
    """
    retrieved = question_context_answer_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved), model=model)
# Shared pool of 6 worker threads; the chunk-processing cells pass it to map_progress.
pool = ThreadPoolExecutor(max_workers=6)
def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a progress bar.

    Args:
        pool: Executor exposing ``submit``.
        seq: Sized sequence of inputs.
        f: Callable applied to each element.

    Returns:
        The results in the same order as ``seq``. An exception raised by ``f``
        propagates when that element's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            # Advance the bar by one whenever a task completes.
            progress.update()

        futures = []
        for el in seq:
            submitted = pool.submit(f, el)
            submitted.add_done_callback(_tick)
            futures.append(submitted)

        # Collect in submission order so the output lines up with the input.
        results = [pending.result() for pending in futures]

    return results

def process_record(rec):
    """Generate an LLM answer for one ground-truth record and pair it with the original.

    Args:
        rec: Record with ``'question'``, ``'Group'`` and ``'document'`` keys;
            ``'document'`` is the id of the source document in ``doc_idx``.

    Returns:
        Dict with the generated answer, the source document's answer, the
        document id, the question and the group.
    """
    answer_llm = rag(rec, model='mixtral-8x7b-32768')
    doc_id = rec['document']

    return {
        'answer_llm': answer_llm,
        'answer_orig': doc_idx[doc_id]['answer'],
        'document': doc_id,
        'question': rec['question'],
        'group': rec['Group'],
    }


In [12]:
# Evaluation window: the slice of up to 1215 ground-truth records that starts
# 3 * 1215 records before the end of the list and stops 2 * 1215 before it.
documents_current = ground_truth[-1215*3:-1215*2]

In [13]:
# Dry run: list the 1-based chunk number and slice bounds of every chunk.
chunk_size = 15
start_chunk = 0  # First chunk index to list
end_chunk = len(documents_current) // chunk_size  # One past the last chunk index
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = chunk_size * i
    # The final chunk's end is pushed one position further so that a single
    # leftover record is still covered; slicing past the end is harmless.
    chunk_end = chunk_start + chunk_size + (1 if i == end_chunk - 1 else 0)
    print(i + 1, chunk_start, chunk_end)

1 0 15
2 15 30
3 30 45
4 45 60
5 60 75
6 75 90
7 90 105
8 105 120
9 120 135
10 135 150
11 150 165
12 165 180
13 180 195
14 195 210
15 210 225
16 225 240
17 240 255
18 255 270
19 270 285
20 285 300
21 300 315
22 315 330
23 330 345
24 345 360
25 360 375
26 375 390
27 390 405
28 405 420
29 420 435
30 435 450
31 450 465
32 465 480
33 480 495
34 495 510
35 510 525
36 525 540
37 540 555
38 555 570
39 570 585
40 585 600
41 600 615
42 615 630
43 630 645
44 645 660
45 660 675
46 675 690
47 690 705
48 705 720
49 720 735
50 735 750
51 750 765
52 765 780
53 780 795
54 795 810
55 810 825
56 825 840
57 840 855
58 855 870
59 870 885
60 885 900
61 900 915
62 915 930
63 930 945
64 945 960
65 960 975
66 975 990
67 990 1005
68 1005 1020
69 1020 1035
70 1035 1050
71 1050 1065
72 1065 1080
73 1080 1095
74 1095 1110
75 1110 1125
76 1125 1140
77 1140 1155
78 1155 1170
79 1170 1185
80 1185 1200
81 1200 1216


In [16]:
# Generate LLM answers for the current window chunk by chunk, writing one
# pickle per chunk so an interrupted run can resume from `start_chunk`.
chunk_size = 15
start_chunk = 61  # Index of the first chunk to process in this run
end_chunk = len(documents_current) // chunk_size  # One past the last chunk index
for i in range(start_chunk, end_chunk):
    chunk_start = i * chunk_size
    if i == end_chunk - 1:
        # The final chunk takes everything that is left, so no record is
        # dropped when the window size is not a multiple of chunk_size.
        chunk_end = len(documents_current)
    else:
        chunk_end = chunk_start + chunk_size
    chunk = documents_current[chunk_start:chunk_end]

    # Fan the chunk out over the shared thread pool; results keep input order.
    processed_results = map_progress(pool, chunk, process_record)
    results = list(processed_results)

    # NOTE(review): the +163 offset presumably continues the file numbering of
    # previously processed windows -- confirm before changing the window.
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer_last{i + 163}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    print(f"Chunk {i} processed and saved to {file_name}")

100%|███████████████████████████████████████████████████| 15/15 [00:20<00:00,  1.36s/it]


Chunk 61 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last224.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:26<00:00,  5.79s/it]


Chunk 62 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last225.pkl


100%|████████████| 15/15 [01:22<00:00,  5.47s/it]


Chunk 63 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last226.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:03<00:00,  4.20s/it]


Chunk 64 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last227.pkl


100%|███████████████████████████████████████████████████| 15/15 [02:00<00:00,  8.03s/it]


Chunk 65 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last228.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.53s/it]


Chunk 66 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last229.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.06s/it]


Chunk 67 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last230.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.92s/it]


Chunk 68 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last231.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.06s/it]


Chunk 69 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last232.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:20<00:00,  5.37s/it]


Chunk 70 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last233.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:34<00:00,  6.27s/it]


Chunk 71 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last234.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:41<00:00,  6.75s/it]


Chunk 72 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last235.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.02s/it]


Chunk 73 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last236.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:24<00:00,  5.64s/it]


Chunk 74 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last237.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.46s/it]


Chunk 75 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last238.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.72s/it]


Chunk 76 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last239.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.38s/it]


Chunk 77 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last240.pkl


100%|████████████| 15/15 [01:40<00:00,  6.69s/it]


Chunk 78 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last241.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:27<00:00,  5.83s/it]


Chunk 79 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last242.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.59s/it]

Chunk 80 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last243.pkl





In [17]:
# Evaluation window: the slice of up to 1215 ground-truth records that starts
# 7 * 1215 records before the end of the list and stops 6 * 1215 before it.
documents_current = ground_truth[-1215*7:-1215*6]

In [None]:
# Generate LLM answers for the current window chunk by chunk, writing one
# pickle per chunk so an interrupted run can resume from `start_chunk`.
chunk_size = 15
start_chunk = 8  # Index of the first chunk to process in this run
end_chunk = len(documents_current) // chunk_size  # One past the last chunk index
for i in range(start_chunk, end_chunk):
    chunk_start = i * chunk_size
    if i == end_chunk - 1:
        # The final chunk takes everything that is left, so no record is
        # dropped when the window size is not a multiple of chunk_size.
        chunk_end = len(documents_current)
    else:
        chunk_end = chunk_start + chunk_size
    chunk = documents_current[chunk_start:chunk_end]

    # Fan the chunk out over the shared thread pool; results keep input order.
    processed_results = map_progress(pool, chunk, process_record)
    results = list(processed_results)

    # NOTE(review): the +487 offset presumably continues the file numbering of
    # previously processed windows -- confirm before changing the window.
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer_last{i + 487}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    print(f"Chunk {i} processed and saved to {file_name}")

100%|███████████████████████████████████████████████████| 15/15 [00:45<00:00,  3.03s/it]


Chunk 8 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last495.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.47s/it]


Chunk 9 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last496.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:29<00:00,  5.98s/it]


Chunk 10 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last497.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:16<00:00,  5.08s/it]


Chunk 11 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last498.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.91s/it]


Chunk 12 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last499.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:35<00:00,  6.37s/it]


Chunk 13 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last500.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:08<00:00,  4.55s/it]


Chunk 14 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last501.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:48<00:00,  7.27s/it]


Chunk 15 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last502.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00,  7.17s/it]


Chunk 16 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last503.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:04<00:00,  4.33s/it]


Chunk 17 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last504.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:44<00:00,  6.98s/it]


Chunk 18 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last505.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.46s/it]


Chunk 19 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last506.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:15<00:00,  5.00s/it]


Chunk 20 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last507.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:49<00:00,  7.31s/it]


Chunk 21 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last508.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:11<00:00,  4.76s/it]


Chunk 22 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last509.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:46<00:00,  7.13s/it]


Chunk 23 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last510.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:29<00:00,  5.97s/it]


Chunk 24 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last511.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.92s/it]


Chunk 25 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last512.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.43s/it]


Chunk 26 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last513.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:14<00:00,  4.99s/it]


Chunk 27 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last514.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:43<00:00,  6.92s/it]


Chunk 28 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last515.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:14<00:00,  4.99s/it]


Chunk 29 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last516.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.02s/it]


Chunk 30 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last517.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:07<00:00,  4.52s/it]


Chunk 31 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last518.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:28<00:00,  5.91s/it]


Chunk 32 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last519.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:43<00:00,  6.88s/it]


Chunk 33 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last520.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.22s/it]


Chunk 34 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last521.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:05<00:00,  4.37s/it]


Chunk 35 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last522.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:47<00:00,  7.17s/it]


Chunk 36 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last523.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:38<00:00,  6.58s/it]


Chunk 37 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last524.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:27<00:00,  5.81s/it]


Chunk 38 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last525.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:25<00:00,  5.72s/it]


Chunk 39 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last526.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:37<00:00,  6.47s/it]


Chunk 40 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last527.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:30<00:00,  6.03s/it]


Chunk 41 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last528.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:17<00:00,  5.15s/it]


Chunk 42 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last529.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:17<00:00,  5.16s/it]


Chunk 43 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last530.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:34<00:00,  6.30s/it]


Chunk 44 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last531.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:29<00:00,  5.94s/it]


Chunk 45 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last532.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:36<00:00,  6.42s/it]


Chunk 46 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last533.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:08<00:00,  4.57s/it]


Chunk 47 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last534.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:58<00:00,  7.87s/it]


Chunk 48 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last535.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:23<00:00,  5.58s/it]


Chunk 49 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last536.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:22<00:00,  5.52s/it]


Chunk 50 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last537.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.21s/it]


Chunk 51 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last538.pkl


100%|███████████████████████████████████████████████████| 15/15 [01:46<00:00,  7.12s/it]


Chunk 52 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last539.pkl


  0%|                     | 0/15 [00:00<?, ?it/s]