In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
# Environment setup: install the runtime dependencies into the notebook kernel.
# -q keeps pip quiet, -U upgrades any version that is already installed.
# lmql[hf] is LMQL with its HuggingFace extra.
# NOTE(review): bitsandbytes is presumably required by the load_in_4bit=True
# model load later in the notebook - confirm.
# NOTE(review): transformers, peft and accelerate are installed from the tip of
# their GitHub repositories and nothing is version-pinned, so a re-run at a
# later date may pull different code.
!pip install -q -U lmql[hf]
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [34]:
import lmql
import json
import numpy as np
import torch
# import torch
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_use_double_quant=True,
#    bnb_4bit_compute_dtype=torch.bfloat16
# )

# Judge LLM used by the LMQL queries defined in this notebook; the `score`
# query names this module-level `model` in its `from` clause.
# The "local:" prefix selects LMQL's in-process HuggingFace backend (see the
# LMQL model documentation). load_in_4bit / device_map are presumably forwarded
# to the underlying transformers loader - TODO confirm.
model = lmql.model("local:mistralai/Mistral-7B-Instruct-v0.2", load_in_4bit=True, device_map="auto")
# Alternative (smaller) judge model, kept for reference:
# model = lmql.model("local:microsoft/phi-2", device_map="auto", load_in_4bit=True)

In [36]:
# LMQL query: judge every statement in `statements` against one context.
#
# The docstring below is the LMQL program itself (not documentation), so it
# must not be reworded.
#
# Parameters
#   instruction    - task description placed at the top of each prompt
#   target_context - text each statement is judged against
#   statements     - sequence of statements; one `verdict` sub-query is run
#                    per element, in order
# Returns
#   A list with one entry per statement; each entry is whatever the `verdict`
#   query returns (there: [VERDICT, explanation]).
#
# Uses the `argmax` decoder and the module-level `model`.
# The sub-query is referenced by the name `verdict`, so that name must still
# refer to the query function whenever `score` is called.
@lmql.query
async def score(instruction, target_context, statements):
  """
  argmax
    l = []
    for i in range(len(statements)):
      "[VERDICT: verdict(instruction, target_context, statements[i])]"
      l.append(VERDICT)
    return l
  from
    model
  """

# LMQL query: judge a single statement against a context.
#
# The docstring below is the LMQL program itself (not documentation), so it
# must not be reworded.
#
# Prompt layout: the instruction, then "Context: ...", "Statement: ...", a
# generated "Explanation: ..." and finally a generated "Verdict: ..." that is
# constrained to the strings "True" or "False".
#
# Returns
#   [VERDICT, e] where VERDICT is "True" or "False" and e is the generated
#   explanation with surrounding whitespace stripped.
#
# NOTE(review): the three EXPLANATION constraints (token budget < 100, stop
# before "\n", stop before "\n\nVerdict: ") are joined with `or`. If the
# intent was to enforce the length cap AND the stop conditions together,
# `and` may have been meant - confirm against LMQL's constraint semantics.
@lmql.query
async def verdict(instruction, target_context, statement):
  """lmql
    "{instruction}\n"
    "Context: {target_context}\n"
    "Statement: {statement}\n"
    "Explanation: [EXPLANATION]" where len(TOKENS(EXPLANATION)) < 100 or STOPS_BEFORE(EXPLANATION, "\n") or STOPS_BEFORE(EXPLANATION, "\n\nVerdict: ")
    e = EXPLANATION.strip()
    "Verdict: [VERDICT]" where VERDICT in set(["True", "False"])
    return [VERDICT, e]
  """

# Instruction prompts, one per metric. Each is passed as the `instruction`
# argument of the `score` query and is inserted verbatim at the top of the
# prompt, so the wording (including the "provoding" typo) is part of what the
# judge model sees; changing it may change the scores.

# Retrieval-side / answer-side metrics.
faithfulness_instruction = "You are given a pair of context and statement. You want to judge whether the statement can be inferred from the context provided. You should provide explanation for reasoning before provoding verdict. Your explanation should be concise."
# NOTE(review): answer_relevancy_instruction is not referenced by any cell
# visible in this notebook (answer relevancy is computed from embeddings).
answer_relevancy_instruction = "You are given an answer for a question. You want to generate some questions for the given answer to guess the original answer."
context_relevancy_instruction = "You are given a pair of context and statement. You want to judge whether a statement is relevant to the context provided. You should provide explanation for reasoning before provoding verdict. Your explanation should be concise."

# Metrics that compare against the expected (ground truth) output.
context_precision_instruction = "You are given a pair of context and statement. You want to judge whether the statement is relevant to arrive at the context. You should provide explanation for reasoning before provoding verdict. Your explanation should be concise."
# Differs from faithfulness_instruction only in "a statement" vs "the statement".
context_recall_instruction = "You are given a pair of context and statement. You want to judge whether a statement can be inferred from the context provided. You should provide explanation for reasoning before provoding verdict. Your explanation should be concise."

In [17]:
import json
# Load the evaluation dataset (a JSON list of rows) into `data`.
#
# Other dataset locations the author switched between:
#   /kaggle/input/zero-shot-eval/zero_shot_eval_dataset_with_statements.json
#   /content/drive/MyDrive/Colab Notebooks/FYP_Eval/Dataset/zero_shot_eval_dataset_with_statements.json
#   /content/drive/MyDrive/Colab Notebooks/FYP_Eval/Dataset/few_shot_eval_dataset_with_statements.json
with open("/kaggle/input/few-shot-eval/few_shot_eval_dataset_with_statements.json", "r") as f:
    # Keep only the first 30 rows: per the author these are the valid
    # questions, the remaining (invalid) questions are skipped.
    data = json.load(f)[:30]

In [18]:
# calculating faithfulness, requiring statements of actual output and context
verdict_pairs_list = []
for i in range(len(data)):
    print(f"evaluating {i+1}th row")
    row = data[i]
    # we want to know the proportion of output statements that can be inferred from context
    context = " ".join(row["retrieval_context"])
    statements = row["actual_output_statements"]
    verdict_pairs = await score(faithfulness_instruction, context, statements)
    verdict_pairs_list.append(verdict_pairs)

faithfulness_verdicts = []
# for all verdict results, we want to create a dict / json to store and map each verdict to its explanation, statement and context
for i, row in enumerate(data):
    context = " ".join(row["retrieval_context"])
    statements = row["actual_output_statements"]
    verdict_dicts = []
    for j, statement in enumerate(statements):
        verdict, explanation = verdict_pairs_list[i][j]
        verdict_dict = {
            "Context": context,
            "Statement": statement,
            "Verdict": verdict,
            "Explanation": explanation
        }
        verdict_dicts.append(verdict_dict)
    faithfulness_verdicts.append(verdict_dicts)

faithfulness_scores = []
# each verdict_dicts represent a row in original data
for verdict_dicts in faithfulness_verdicts:
    count_True = 0
    count_verdict = 0
    for  verdict_dict in verdict_dicts:
        count_verdict += 1
        if verdict_dict["Verdict"] == "True":
            count_True += 1
    # faithfulness is proportion of relevant statements to context
    f = count_True / count_verdict
    faithfulness_scores.append(f)
print(np.mean(faithfulness_scores))

faithfulness = {
    "Scores": faithfulness_scores,
    "Verdicts": faithfulness_verdicts
}

# evaluation time ~20mins, average score = 0.8022
with open("./faithfulness.json", "w") as f:
  json.dump(faithfulness, f)

evaluating 1th row
evaluating 2th row
evaluating 3th row
evaluating 4th row
evaluating 5th row
evaluating 6th row
evaluating 7th row
evaluating 8th row
evaluating 9th row
evaluating 10th row
evaluating 11th row
evaluating 12th row
evaluating 13th row
evaluating 14th row
evaluating 15th row
evaluating 16th row
evaluating 17th row
evaluating 18th row
evaluating 19th row
evaluating 20th row
evaluating 21th row
evaluating 22th row
evaluating 23th row
evaluating 24th row
evaluating 25th row
evaluating 26th row
evaluating 27th row
evaluating 28th row
evaluating 29th row
evaluating 30th row
0.8627777777777779


In [21]:
# calculating context relevancy, requiring context and question
verdict_pairs_list = []
for i in range(len(data)):
    row = data[i]
    # we want to know proportion of context statements that are relevant to user query
    context = row["input"]
    statements = row["context_statements"]
    print(f"evaluating {i+1}th row with {len(statements)} statements...") # it is quite long waiting time
    verdict_pairs = await score(context_relevancy_instruction, context, statements)
    verdict_pairs_list.append(verdict_pairs)

context_relevancy_verdicts = []
# for all verdict results, we want to create a dict / json to store and map each verdict to its explanation, statement and context
for i, row in enumerate(data):
    context = row["input"]
    statements = row["context_statements"]
    verdict_dicts = []
    for j, statement in enumerate(statements):
        verdict, explanation = verdict_pairs_list[i][j]
        verdict_dict = {
            "Context": context,
            "Statement": statement,
            "Verdict": verdict,
            "Explanation": explanation
        }
        verdict_dicts.append(verdict_dict)
    context_relevancy_verdicts.append(verdict_dicts)

context_relevancy_scores = []
# each verdict_dicts represent a row in original data
for verdict_dicts in context_relevancy_verdicts:
    count_True = 0
    count_verdict = 0
    for  verdict_dict in verdict_dicts:
        count_verdict += 1
        if verdict_dict["Verdict"] == "True":
            count_True += 1
    # context_relevancy is proportion of relevant statements to context
    cr = count_True / count_verdict
    context_relevancy_scores.append(cr)
print(np.mean(context_relevancy_scores))

context_relevancy = {
    "Scores": context_relevancy_scores,
    "Verdicts": context_relevancy_verdicts
}

with open("./context_relevancy.json", "w") as f:
    json.dump(context_relevancy, f)

evaluating 1th row with 6 statements...
evaluating 2th row with 5 statements...
evaluating 3th row with 7 statements...
evaluating 4th row with 7 statements...
evaluating 5th row with 3 statements...
evaluating 6th row with 14 statements...
evaluating 7th row with 14 statements...
evaluating 8th row with 14 statements...
evaluating 9th row with 14 statements...
evaluating 10th row with 13 statements...
evaluating 11th row with 5 statements...
evaluating 12th row with 4 statements...
evaluating 13th row with 5 statements...
evaluating 14th row with 5 statements...
evaluating 15th row with 6 statements...
evaluating 16th row with 4 statements...
evaluating 17th row with 6 statements...
evaluating 18th row with 11 statements...
evaluating 19th row with 5 statements...
evaluating 20th row with 6 statements...
evaluating 21th row with 8 statements...
evaluating 22th row with 3 statements...
evaluating 23th row with 5 statements...
evaluating 24th row with 13 statements...
evaluating 25th ro

In [28]:
from transformers import AutoTokenizer, AutoModel

# Answer relevancy: cosine similarity between the embedding of the real user
# question and the embeddings of the questions generated from the answer.
# Per-row score = mean similarity over that row's generated questions.

# Embedding model and tokenizer, fetched from the HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
embed_model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
embed_model.eval()

# Pull the original question and the generated questions out of each row
generated_questions = [row["generated_questions"] for row in data]
actual_questions = [row["input"] for row in data]

# Row-wise cosine similarity between two equally shaped (n, dim) tensors
cos = torch.nn.CosineSimilarity(dim=1)

answer_relevancy_verdicts = []
answer_relevancy_scores = []
for i, (question, candidates) in enumerate(zip(actual_questions, generated_questions)):
    print(f"evaluating {i+1}th row")
    candidate_inputs = tokenizer(candidates, padding=True, truncation=True, return_tensors='pt', max_length=512)
    question_inputs = tokenizer([question], padding=True, truncation=True, return_tensors='pt', max_length=512)

    # Inference only; take the hidden state at sequence position 0 of the
    # model's first output as the sentence embedding.
    with torch.no_grad():
        candidate_vecs = embed_model(**candidate_inputs)[0][:, 0]
        question_vec = embed_model(**question_inputs)[0][:, 0]

    # L2-normalise both sides
    candidate_vecs = torch.nn.functional.normalize(candidate_vecs, p=2, dim=1)
    question_vec = torch.nn.functional.normalize(question_vec, p=2, dim=1)

    # Tile the single question vector so both operands have the same shape
    question_vecs = question_vec.repeat(candidate_vecs.size(0), 1)
    cosine_similarity = cos(candidate_vecs, question_vecs)

    # One record per generated question. The "Actaul_question" spelling is
    # part of the saved JSON layout and is kept unchanged.
    answer_relevancy_verdicts.append([
        {
            "Actaul_question": question,
            "Generated_questions": candidate,
            "Scores": float(cosine_similarity[j]),
        }
        for j, candidate in enumerate(candidates)
    ])
    answer_relevancy_scores.append(torch.mean(cosine_similarity).item())
print(np.mean(answer_relevancy_scores))

answer_relevancy = {
    "Scores": answer_relevancy_scores,
    "Verdicts": answer_relevancy_verdicts
}

# Persist per-row scores together with the per-question similarities
with open("./answer_relevancy.json", "w") as f:
    json.dump(answer_relevancy, f)

evaluating 1th row
evaluating 2th row
evaluating 3th row
evaluating 4th row
evaluating 5th row
evaluating 6th row
evaluating 7th row
evaluating 8th row
evaluating 9th row
evaluating 10th row
evaluating 11th row
evaluating 12th row
evaluating 13th row
evaluating 14th row
evaluating 15th row
evaluating 16th row
evaluating 17th row
evaluating 18th row
evaluating 19th row
evaluating 20th row
evaluating 21th row
evaluating 22th row
evaluating 23th row
evaluating 24th row
evaluating 25th row
evaluating 26th row
evaluating 27th row
evaluating 28th row
evaluating 29th row
evaluating 30th row
0.815654041369756


In [35]:
# calculating context recall, requiring statements of actual output and context
verdict_pairs_list = []
for i in range(len(data)):
    print(f"evaluating {i+1}th row")
    row = data[i]
    # we want to know the proportion of ground truth statements that can be inferred from context
    context = " ".join(row["retrieval_context"])
    statements = row["actual_output_statements"]
    verdict_pairs = await score(context_recall_instruction, context, statements)
    verdict_pairs_list.append(verdict_pairs)

context_recall_verdicts = []
# for all verdict results, we want to create a dict / json to store and map each verdict to its explanation, statement and context
for i, row in enumerate(data):
    context = " ".join(row["retrieval_context"])
    statements = row["actual_output_statements"]
    verdict_dicts = []
    for j, statement in enumerate(statements):
        verdict, explanation = verdict_pairs_list[i][j]
        verdict_dict = {
            "Context": context,
            "Statement": statement,
            "Verdict": verdict,
            "Explanation": explanation
        }
        verdict_dicts.append(verdict_dict)
    context_recall_verdicts.append(verdict_dicts)

context_recall_scores = []
# each verdict_dicts represent a row in original data
for verdict_dicts in context_recall_verdicts:
    count_True = 0
    count_verdict = 0
    for  verdict_dict in verdict_dicts:
        count_verdict += 1
        if verdict_dict["Verdict"] == "True":
            count_True += 1
    # context_recall is proportion of relevant statements to context
    cr = count_True / count_verdict
    context_recall_scores.append(cr)
print(np.mean(context_recall_scores))

context_recall = {
    "Scores": context_recall_scores,
    "Verdicts": context_recall_verdicts
}

with open("./context_recall.json", "w") as f:
    json.dump(context_recall, f)

evaluating 1th row
evaluating 2th row
evaluating 3th row
evaluating 4th row
evaluating 5th row
evaluating 6th row
evaluating 7th row
evaluating 8th row
evaluating 9th row
evaluating 10th row
evaluating 11th row
evaluating 12th row
evaluating 13th row
evaluating 14th row
evaluating 15th row
evaluating 16th row
evaluating 17th row
evaluating 18th row
evaluating 19th row
evaluating 20th row
evaluating 21th row
evaluating 22th row
evaluating 23th row
evaluating 24th row
evaluating 25th row
evaluating 26th row
evaluating 27th row
evaluating 28th row
evaluating 29th row
evaluating 30th row
0.801111111111111


In [37]:
# calculating context precision, requiring context and ground truth
verdict_pairs_list = []
for i in range(len(data)):
    print(f"evaluating {i+1}th row")
    row = data[i]
    # we want to know whether positions of relevant nodes are higher than non relevant one
    context = row["expected_output"]
    statements = row["retrieval_context"]
    verdict_pairs = await score(context_precision_instruction, context, statements)
    verdict_pairs_list.append(verdict_pairs)

context_precision_verdicts = []
# for all verdict results, we want to create a dict / json to store and map each verdict to its explanation, statement and context
for i, row in enumerate(data):
    context = row["expected_output"]
    statements = row["retrieval_context"]
    verdict_dicts = []
    for j, statement in enumerate(statements):
        verdict, explanation = verdict_pairs_list[i][j]
        verdict_dict = {
            "Context": context,
            "Statement": statement,
            "Verdict": verdict,
            "Explanation": explanation
        }
        verdict_dicts.append(verdict_dict)
    context_precision_verdicts.append(verdict_dicts)

context_precision_scores = []
# each verdict_dicts represent a row in original data
for verdict_dicts in context_precision_verdicts:
    count_True = 0
    count_verdict = 0
    for  verdict_dict in verdict_dicts:
        count_verdict += 1
        if verdict_dict["Verdict"] == "True":
            count_True += 1
    # context_precision is proportion of relevant statements to context
    cp = count_True / count_verdict
    context_precision_scores.append(cp)
print(np.mean(context_precision_scores))

context_precision = {
    "Scores": context_precision_scores,
    "Verdicts": context_precision_verdicts
}

with open("./context_precision.json", "w") as f:
    json.dump(context_precision, f)

evaluating 1th row
evaluating 2th row
evaluating 3th row
evaluating 4th row
evaluating 5th row
evaluating 6th row
evaluating 7th row
evaluating 8th row
evaluating 9th row
evaluating 10th row
evaluating 11th row
evaluating 12th row
evaluating 13th row
evaluating 14th row
evaluating 15th row
evaluating 16th row
evaluating 17th row
evaluating 18th row
evaluating 19th row
evaluating 20th row
evaluating 21th row
evaluating 22th row
evaluating 23th row
evaluating 24th row
evaluating 25th row
evaluating 26th row
evaluating 27th row
evaluating 28th row
evaluating 29th row
evaluating 30th row
0.5333333333333333
