In [1]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from elasticsearch import Elasticsearch
import pandas as pd
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import HTTPError

import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('../data/vietnamese_rag/documents-with-ids1.json', 'rt') as f_in:
    documents1 = json.load(f_in)

In [4]:
with open('../data/vietnamese_rag/documents-with-ids2.json', 'rt') as f_in:
    documents2 = json.load(f_in)

In [5]:
with open('../data/vietnamese_rag/documents-with-ids3.json', 'rt') as f_in:
    documents3 = json.load(f_in)

In [6]:
with open('../data/vietnamese_rag/documents-with-ids4.json', 'rt') as f_in:
    documents4 = json.load(f_in)

In [7]:
with open('../data/vietnamese_rag/documents-with-ids5.json', 'rt') as f_in:
    documents5 = json.load(f_in)

In [8]:
def load_documents(base_path, num_files):
    documents = []
    for i in range(1, num_files + 1):
        file_path = f'{base_path}/documents-with-ids{i}.json'
        with open(file_path, 'rt') as f_in:
            documents.extend(json.load(f_in))
    return documents
base_path = '../data/vietnamese_rag'
num_files = 5
documents = load_documents(base_path, num_files)
df_ground_truth = pd.read_csv('../data/vietnamese_rag/ground_truth_data/ground_truth_data.csv')

ground_truth = df_ground_truth.to_dict(orient='records')
doc_idx = {d['id']: d for d in documents}
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")




In [9]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "group": {"type": "keyword"},
            "context": {"type": "text"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "id": {"type": "keyword"},
            "context_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_context_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "vietnamese-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vietnamese-questions'})

In [10]:
def load_vectors(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

def process_documents(documents, index_name, es_client):
    full_documents = []
    for i in range(1, 6):
        if i == 1:
            data = documents1.copy()
        elif i == 2:
            data = documents2.copy()
        elif i == 3:
            data = documents3.copy()
        elif i == 4:
            data = documents4.copy()
        elif i == 5:
            data = documents5.copy()
        document_qta_vector_list = load_vectors(f'../data/vietnamese_rag/question_context_answer_vector_pickle/question_context_answer_vector{i}.pkl')

        for j in range(len(data)):
            data[j]['question_context_answer_vector'] = document_qta_vector_list[j]['question_context_answer_vector']
        full_documents.extend(data)
    for doc in tqdm(full_documents):
        es_client.index(index=index_name, document=doc)
process_documents(documents, index_name, es_client)

100%|███████████████████████████████████████████████| 6089/6089 [02:24<00:00, 42.03it/s]


In [11]:
def elastic_search_knn(field, vector, group):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "group": group
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["group", "context", "question", "answer", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs
def question_context_answer_vector_knn(q):
    question = q['question']
    group = q['Group']

    v_q = model.encode(question)

    return elastic_search_knn('question_context_answer_vector', v_q, group)
def build_prompt(query, search_results):
    prompt_template = """
You're an assistant working in customer service. Your job is to provide answers to users' questions. Answer the QUESTION based on the CONTEXT from the documents database.
Use only the facts from the CONTEXT when answering the QUESTION. Provide answer in Vietnamese , in normal text form, not using any markdown form, no need to rewrite the question and make sure that is an answer, not listing questions. Also make sure that the answer provides most information from the CONTEXT as possible .

QUESTION: {question}

CONTEXT: 
{context}
""".strip()
    context = ""
    
    for doc in search_results:
        context = context + f"group: {doc['group']}\nquestion: {doc['question']}\nanswer: {doc['answer']}\ncontext: {doc['context'][:1000]}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt
client =  Groq(api_key = os.environ['GROQ_API_KEY4'])
def llm(prompt, model = 'mixtral-8x7b-32768'):
    retries = 5
    for i in range(retries):
        try:
            response = client.chat.completions.create(
                model= 'llama3-8b-8192',
                messages=[{"role": "user", "content": prompt}]
            )
            json_response = response.choices[0].message.content
            return json_response
        except HTTPError as e:
            if e.response.status_code == 429:  # Rate limit error
                retry_after = float(e.response.json()['error']['message'].split('in ')[-1].split('s')[0])
                time.sleep(retry_after)
            else:
                raise
        except Exception as e:
            if i < retries - 1:
                time.sleep(2 ** i)  # Exponential backoff
            else:
                raise
# previously: rag(query: str) -> str
def rag(query: dict, model='mixtral-8x7b-32768') -> str:
    search_results = question_context_answer_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model=model)
    return answer
pool = ThreadPoolExecutor(max_workers=6)
def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

def process_record(rec):
    model = 'mixtral-8x7b-32768'
    answer_llm = rag(rec, model = model)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['answer']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'group': rec['Group'],
    }


In [17]:
documents_current = ground_truth[-1215*2:-1215]
len(documents_current)

1215

In [18]:
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    print(i + 82, chunk_start, chunk_end)

82 0 15
83 15 30
84 30 45
85 45 60
86 60 75
87 75 90
88 90 105
89 105 120
90 120 135
91 135 150
92 150 165
93 165 180
94 180 195
95 195 210
96 210 225
97 225 240
98 240 255
99 255 270
100 270 285
101 285 300
102 300 315
103 315 330
104 330 345
105 345 360
106 360 375
107 375 390
108 390 405
109 405 420
110 420 435
111 435 450
112 450 465
113 465 480
114 480 495
115 495 510
116 510 525
117 525 540
118 540 555
119 555 570
120 570 585
121 585 600
122 600 615
123 615 630
124 630 645
125 645 660
126 660 675
127 675 690
128 690 705
129 705 720
130 720 735
131 735 750
132 750 765
133 765 780
134 780 795
135 795 810
136 810 825
137 825 840
138 840 855
139 855 870
140 870 885
141 885 900
142 900 915
143 915 930
144 930 945
145 945 960
146 960 975
147 975 990
148 990 1005
149 1005 1020
150 1020 1035
151 1035 1050
152 1050 1065
153 1065 1080
154 1080 1095
155 1095 1110
156 1110 1125
157 1125 1140
158 1140 1155
159 1155 1170
160 1170 1185
161 1185 1200
162 1200 1216


In [None]:
chunk_size = 15
start_chunk = 0 # Starting chunk index
end_chunk = (len(documents_current) // chunk_size)   # Ending chunk index
# print(end_chunk)
for i in range(start_chunk, end_chunk):
    results = []
    chunk_start = i * chunk_size
    chunk_end = chunk_start + chunk_size
    if (i == end_chunk - 1):
        chunk_end = chunk_start + chunk_size + 1
    # print(i + 1, chunk_start, chunk_end)
    chunk = documents_current[chunk_start:chunk_end]

    # Use map_progress to process documents
    processed_results = map_progress(pool, chunk, process_record)
    results.extend(processed_results)
    # # Store the results incrementally
    # for result in processed_results:
    #     if result is not None:
    #         doc_id, questions = result
    #         results[doc_id] = questions

    # Save the results to a file
    file_name = f'../data/vietnamese_rag/llm_answer/llm_answer_last{i + 82}.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(results, file)

    # Print out the results
    print(f"Chunk {i} processed and saved to {file_name}")
    # print(results)

    # Wait for 1 minute to reset rate limit
    # time.sleep(23)

100%|█████████████| 15/15 [00:05<00:00,  2.66it/s]


Chunk 0 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last82.pkl


100%|█████████████| 15/15 [01:41<00:00,  6.74s/it]


Chunk 1 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last83.pkl


100%|█████████████| 15/15 [00:55<00:00,  3.67s/it]


Chunk 2 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last84.pkl


100%|█████████████| 15/15 [00:16<00:00,  1.10s/it]


Chunk 3 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last85.pkl


100%|█████████████| 15/15 [00:04<00:00,  3.60it/s]


Chunk 4 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last86.pkl


100%|█████████████| 15/15 [00:31<00:00,  2.09s/it]


Chunk 5 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last87.pkl


100%|█████████████| 15/15 [00:37<00:00,  2.51s/it]


Chunk 6 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last88.pkl


100%|█████████████| 15/15 [00:37<00:00,  2.53s/it]


Chunk 7 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last89.pkl


100%|█████████████| 15/15 [01:41<00:00,  6.76s/it]


Chunk 8 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last90.pkl


100%|█████████████| 15/15 [01:34<00:00,  6.27s/it]


Chunk 9 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last91.pkl


100%|█████████████| 15/15 [00:50<00:00,  3.35s/it]


Chunk 10 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last92.pkl


100%|█████████████| 15/15 [00:04<00:00,  3.60it/s]


Chunk 11 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last93.pkl


100%|█████████████| 15/15 [00:08<00:00,  1.68it/s]


Chunk 12 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last94.pkl


100%|█████████████| 15/15 [00:30<00:00,  2.01s/it]


Chunk 13 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last95.pkl


100%|█████████████| 15/15 [00:36<00:00,  2.43s/it]


Chunk 14 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last96.pkl


100%|█████████████| 15/15 [01:05<00:00,  4.38s/it]


Chunk 15 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last97.pkl


100%|█████████████| 15/15 [01:25<00:00,  5.68s/it]


Chunk 16 processed and saved to ../data/vietnamese_rag/llm_answer/llm_answer_last98.pkl


  0%|                      | 0/15 [00:00<?, ?it/s]