In [2]:
import json
from openai import OpenAI
from groq import Groq
import os
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
from elasticsearch import Elasticsearch
import pandas as pd
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import HTTPError

import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [6]:
client =  Groq(api_key = os.environ['GROQ_API_KEY1'])
def llm(prompt, model = 'mixtral-8x7b-32768'):
    retries = 5
    for i in range(retries):
        try:
            response = client.chat.completions.create(
                model= 'llama3-8b-8192',
                messages=[{"role": "user", "content": prompt}]
            )
            json_response = response.choices[0].message.content
            return json_response
        except HTTPError as e:
            if e.response.status_code == 429:  # Rate limit error
                retry_after = float(e.response.json()['error']['message'].split('in ')[-1].split('s')[0])
                time.sleep(retry_after)
            else:
                raise
        except Exception as e:
            if i < retries - 1:
                time.sleep(2 ** i)  # Exponential backoff
            else:
                raise

In [35]:
results = []
for i in range(1,83):
    
    with open(f"../data/vietnamese_rag/llm_answer/llm_answer{i}.pkl", 'rb') as file:
        r = pickle.load(file)
    results.extend(r)
df = pd.DataFrame(results)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217 entries, 0 to 1216
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   answer_llm   1217 non-null   object
 1   answer_orig  1217 non-null   object
 2   document     1217 non-null   object
 3   question     1217 non-null   object
 4   group        1217 non-null   object
dtypes: object(5)
memory usage: 47.7+ KB


In [39]:
df.to_csv('../data/vietnamese_rag/llm_answer/llm_answer1.csv', index=False)

In [None]:
for i in range(2, 83):
    evaluations = []
    results = []
    json_evaluations = []
    
    # Load the pickle file
    with open(f"../data/vietnamese_rag/llm_answer/llm_answer{i}.pkl", 'rb') as file:
        r = pickle.load(file)
    results.extend(r)
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    samples = df.to_dict(orient='records')
    
    # Generate evaluations
    for record in tqdm(samples):
        prompt = prompt1_template.format(**record)
        evaluation = llm(prompt)
        evaluations.append(evaluation)
    
    # Parse evaluations
    for j, str_eval in enumerate(evaluations):
        try:
            json_eval = json.loads(str_eval)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            # Attempt to fix the JSON string
            try:
                str_eval = str_eval.rstrip('}') + '}'  # Ensure it ends with a closing brace
                json_eval = json.loads(str_eval)
            except json.JSONDecodeError as e:
                print(f"Failed to fix JSON string: {e}")
                continue  # Skip this evaluation if it cannot be fixed
        json_evaluations.append(json_eval)
    
    # Save evaluations to CSV
    if json_evaluations:
        df_evaluations = pd.DataFrame(json_evaluations)
        df_evaluations.to_csv(f'../data/vietnamese_rag/evaluations_aqa/evaluations-aqa{i}.csv', index=False)

100%|██████████████████| 15/15 [00:07<00:00,  2.01it/s]


JSONDecodeError: Expecting ',' delimiter: line 3 column 417 (char 445)
JSONDecodeError: Expecting ',' delimiter: line 3 column 593 (char 628)
JSONDecodeError: Expecting ',' delimiter: line 3 column 414 (char 442)
JSONDecodeError: Expecting ',' delimiter: line 3 column 418 (char 446)
JSONDecodeError: Expecting ',' delimiter: line 3 column 392 (char 420)
JSONDecodeError: Expecting ',' delimiter: line 3 column 321 (char 347)
JSONDecodeError: Expecting ',' delimiter: line 3 column 417 (char 450)


100%|██████████████████| 15/15 [00:19<00:00,  1.30s/it]


JSONDecodeError: Expecting ',' delimiter: line 3 column 395 (char 423)
JSONDecodeError: Expecting ',' delimiter: line 3 column 413 (char 441)
JSONDecodeError: Expecting ',' delimiter: line 3 column 339 (char 367)
JSONDecodeError: Expecting ',' delimiter: line 3 column 616 (char 642)
JSONDecodeError: Expecting ',' delimiter: line 3 column 321 (char 349)
JSONDecodeError: Expecting ',' delimiter: line 3 column 269 (char 297)
JSONDecodeError: Expecting ',' delimiter: line 3 column 444 (char 476)
JSONDecodeError: Expecting ',' delimiter: line 3 column 454 (char 489)
JSONDecodeError: Expecting ',' delimiter: line 3 column 306 (char 332)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Failed to fix JSON string: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Failed to fix JSON string: Expecting value: line 1 column 1 (char 0)
JSONDecodeError: Expecting ',' delimiter: line 3 column 498 (char 526)


100%|██████████████████| 15/15 [00:39<00:00,  2.64s/it]


JSONDecodeError: Expecting ',' delimiter: line 3 column 306 (char 334)
JSONDecodeError: Expecting ',' delimiter: line 3 column 498 (char 533)
JSONDecodeError: Expecting ',' delimiter: line 3 column 323 (char 358)
JSONDecodeError: Expecting ',' delimiter: line 3 column 411 (char 446)
JSONDecodeError: Expecting ',' delimiter: line 3 column 318 (char 346)
JSONDecodeError: Expecting ',' delimiter: line 3 column 428 (char 456)
JSONDecodeError: Expecting ',' delimiter: line 3 column 364 (char 392)
JSONDecodeError: Expecting ',' delimiter: line 3 column 269 (char 297)
JSONDecodeError: Expecting ',' delimiter: line 3 column 392 (char 427)
JSONDecodeError: Expecting ',' delimiter: line 3 column 365 (char 391)
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Failed to fix JSON string: Expecting value: line 1 column 1 (char 0)


 13%|██▌                | 2/15 [00:05<00:36,  2.84s/it]

In [50]:

    evaluations = []
    results = []
    json_evaluations = []
    with open(f"../data/vietnamese_rag/llm_answer/llm_answer1.pkl", 'rb') as file:
        r = pickle.load(file)
    results.extend(r)
    df = pd.DataFrame(results)
    samples = df.to_dict(orient='records')
    for record in tqdm(samples):
        prompt = prompt1_template.format(**record)
        evaluation = llm(prompt)
        evaluations.append(evaluation)
    

100%|██████████████████| 15/15 [00:06<00:00,  2.30it/s]


In [54]:
for i, str_eval in enumerate(evaluations):
    # print(str_eval)
    try:
        json_eval = json.loads(str_eval)
        
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        str_eval = str_eval + "}"
        json_eval = json.loads(str_eval)
        # print("After fix: ", json_eval)
    json_evaluations.append(json_eval)
df_evaluations = pd.DataFrame(json_evaluations)
df_evaluations.to_csv(f'../data/vietnamese_rag/evaluations_aqa/evaluations-aqa1.csv', index=False)

JSONDecodeError: Expecting ',' delimiter: line 3 column 403 (char 431)
JSONDecodeError: Expecting ',' delimiter: line 3 column 468 (char 496)
JSONDecodeError: Expecting ',' delimiter: line 3 column 583 (char 611)
JSONDecodeError: Expecting ',' delimiter: line 3 column 620 (char 648)
JSONDecodeError: Expecting ',' delimiter: line 3 column 515 (char 550)
JSONDecodeError: Expecting ',' delimiter: line 3 column 512 (char 540)
JSONDecodeError: Expecting ',' delimiter: line 3 column 402 (char 430)


In [56]:
df_evaluations

Unnamed: 0,Relevance,Explanation
0,PARTLY_RELEVANT,The generated answer partially corresponds to ...
1,RELEVANT,The generated answer is partially summarized f...
2,PARTLY_RELEVANT,The generated answer partially corresponds to ...
3,RELEVANT,The generated answer is partially summarized f...
4,RELEVANT,The generated answer is a direct and literal r...
5,RELEVANT,The generated answer is identical to the origi...
6,RELEVANT,The generated answer contains a specific event...
7,RELEVANT,The generated answer is highly similar to the ...
8,RELEVANT,The generated answer is highly relevant to the...
9,PARTLY_RELEVANT,The generated answer partially addresses the q...


In [47]:
import json

evaluations = [
    '{"Relevance": "PARTLY_RELEVANT", "Explanation": "The generated answer partially corresponds to the original answer, as it describes Minh Tú\'s experience in the catwalk challenge on Asia\'s Next Top Model, which is similar to the context in the original answer. However, the generated answer focuses more on the outcome (her photo being considered too sexy) and doesn\'t fully capture the original answer\'s description of Minh Tú overcoming fear to complete the challenge successfully. The relevance is partly relevant because it shares some similar context, but crucial aspects of the original answer are missing."}'
]

json_evaluations = []

for i, str_eval in enumerate(evaluations):
    print(f"Evaluation {i}: {str_eval}")
    try:
        json_eval = json.loads(str_eval)
        json_evaluations.append(json_eval)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        # Optionally, handle the error or log it for further investigation

# If no errors, create DataFrame
if json_evaluations:
    import pandas as pd
    df_evaluations = pd.DataFrame(json_evaluations)
    print(df_evaluations)

Evaluation 0: {"Relevance": "PARTLY_RELEVANT", "Explanation": "The generated answer partially corresponds to the original answer, as it describes Minh Tú's experience in the catwalk challenge on Asia's Next Top Model, which is similar to the context in the original answer. However, the generated answer focuses more on the outcome (her photo being considered too sexy) and doesn't fully capture the original answer's description of Minh Tú overcoming fear to complete the challenge successfully. The relevance is partly relevant because it shares some similar context, but crucial aspects of the original answer are missing."}
         Relevance                                        Explanation
0  PARTLY_RELEVANT  The generated answer partially corresponds to ...


In [48]:
df_evaluations

Unnamed: 0,Relevance,Explanation
0,PARTLY_RELEVANT,The generated answer partially corresponds to ...
