In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install transformers accelerate bitsandbytes
!pip install nltk
!pip install tqdm
!pip install rank-bm25 rouge-score nltk

from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict, Counter
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, BertTokenizer, BertModel, AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score
from rank_bm25 import BM25Okapi
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

import json
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import statistics
import numpy as np
import random
import torch
import re
import seaborn as sns
import transformers
import re
import nltk

nltk.download('punkt_tab')

In [None]:
import os

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# when set, so the secret does not have to be stored in the notebook; "XXX" is
# the original placeholder and is kept as the fallback value.
HF_L = os.environ.get("HF_TOKEN", "XXX")
login(token=HF_L)

# Name of the single model to load in this session. Only the matching branch
# below runs, so only that model's `<name>_model` / `<name>_tokenizer` globals
# exist afterwards.
model = 'llama'

if model == 'llama':
    llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
    # NOTE(review): `use_auth_token` is kept as in the original; newer
    # transformers releases may prefer `token=` -- confirm against the
    # installed version.
    llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name, use_auth_token=HF_L)
    llama_model = AutoModelForCausalLM.from_pretrained(
        llama_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        use_auth_token=HF_L
    )
elif model == 'falcon':
    falcon_model_name = "tiiuae/falcon-7b-instruct"
    falcon_model = AutoModelForCausalLM.from_pretrained(
        falcon_model_name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    falcon_tokenizer = AutoTokenizer.from_pretrained(falcon_model_name)
    # Text-generation pipeline built on the already-loaded model and tokenizer.
    falcon_pipeline = transformers.pipeline(
        "text-generation",
        model=falcon_model,
        tokenizer=falcon_tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
elif model == 'gemma':
    gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")
    gemma_model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-7b-it", 
        device_map="auto", 
        revision="float16")
elif model == 'mistral':
    device = "cuda"
    # Loaded without a device map here; get_mistral_embeddings moves the model
    # to CUDA itself before running it.
    mistral_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
def get_llama_embeddings(prompt, seed=42):
    """Embed ``prompt`` with the global Llama model via masked mean pooling.

    The prompt is tokenized with ``llama_tokenizer`` and moved to CUDA, the
    model is run without gradient tracking, and the last entry of
    ``hidden_states`` is averaged over dimension 1 using the attention mask
    as weights.

    Args:
        prompt: Text passed to the tokenizer.
        seed: Value given to ``torch.manual_seed`` before the forward pass.

    Returns:
        The first row of the pooled result as a float32 NumPy array.

    Side effects:
        Sets ``llama_model.config.output_hidden_states = True`` and reseeds
        the global torch RNG.
    """
    encoded = llama_tokenizer(prompt, return_tensors="pt").to("cuda")
    torch.manual_seed(seed)
    llama_model.config.output_hidden_states = True
    with torch.no_grad():
        last_hidden = llama_model(**encoded).hidden_states[-1]
    # Zero out masked positions, then divide the sum by the number of kept ones.
    mask = encoded["attention_mask"].unsqueeze(-1)
    pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.float().detach().cpu().numpy()[0]
    
def get_falcon_embeddings(prompt, seed=42):
    """Embed ``prompt`` with the global Falcon model via masked mean pooling.

    Args:
        prompt: Text passed to ``falcon_tokenizer``.
        seed: Value given to ``torch.manual_seed`` before the forward pass.

    Returns:
        The first row of the pooled last hidden state as a float32 NumPy array.

    Raises:
        ValueError: If ``falcon_model`` has no ``config`` attribute.

    Side effects:
        Sets ``falcon_model.config.output_hidden_states = True`` and reseeds
        the global torch RNG.
    """
    if not hasattr(falcon_model, "config"):
        raise ValueError("falcon_model is not properly initialized. Please load the model correctly.")
    encoded = falcon_tokenizer(prompt, return_tensors="pt").to("cuda")
    torch.manual_seed(seed)
    falcon_model.config.output_hidden_states = True
    with torch.no_grad():
        last_hidden = falcon_model(**encoded).hidden_states[-1]
    # Attention-mask-weighted average over dimension 1.
    mask = encoded["attention_mask"].unsqueeze(-1)
    pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.float().detach().cpu().numpy()[0]
    
def get_gemma_embeddings(prompt, seed=42):
    """Embed ``prompt`` with the global Gemma model via masked mean pooling.

    The forward pass now receives the tokenizer's attention mask, matching
    the other ``get_*_embeddings`` helpers in this notebook; previously the
    mask was only used for pooling and never given to the model.

    Args:
        prompt: Text passed to ``gemma_tokenizer``.
        seed: Seed applied to the torch RNG (and to all CUDA RNGs when CUDA
            is available) before the forward pass.

    Returns:
        The first row of the pooled last hidden state as a float32 NumPy array.

    Side effects:
        Moves ``gemma_model`` to CUDA when available (otherwise CPU), sets
        ``gemma_model.config.output_hidden_states = True`` and reseeds the
        torch RNGs.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gemma_model.to(device)
    inputs = gemma_tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    gemma_model.config.output_hidden_states = True
    with torch.no_grad():
        # Pass the mask so the model and the pooling below agree on which
        # positions count.
        outputs = gemma_model(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.hidden_states[-1]
    # Masked mean over dimension 1: zero masked positions, divide by kept count.
    pooling_mask = attention_mask.unsqueeze(-1)
    sum_embeddings = (embeddings * pooling_mask).sum(dim=1)
    mask_sum = pooling_mask.sum(dim=1)
    mean_pooled = sum_embeddings / mask_sum
    mean_pooled = mean_pooled.float().detach().cpu().numpy()
    return mean_pooled[0]
    
def get_mistral_embeddings(prompt, seed=42):
    """Embed ``prompt`` with the global Mistral model via masked mean pooling.

    Args:
        prompt: Text passed to ``mistral_tokenizer``.
        seed: Value given to ``torch.manual_seed`` before the forward pass.

    Returns:
        The first row of the pooled last hidden state as a float32 NumPy array.

    Side effects:
        Rebinds the global ``mistral_model`` to the result of moving it to
        CUDA, sets ``mistral_model.config.output_hidden_states = True`` and
        reseeds the global torch RNG.
    """
    global mistral_model
    mistral_model = mistral_model.to("cuda")
    tokenized = mistral_tokenizer(prompt, return_tensors="pt")
    # Move every tokenizer output tensor to the GPU individually.
    encoded = {name: tensor.to("cuda") for name, tensor in tokenized.items()}
    torch.manual_seed(seed)
    mistral_model.config.output_hidden_states = True
    with torch.no_grad():
        last_hidden = mistral_model(**encoded).hidden_states[-1]
    mask = encoded["attention_mask"].unsqueeze(-1)
    pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.float().detach().cpu().numpy()[0]

In [None]:
def get_cosine_similarity(emb1, emb2):
    """Return the cosine similarity between two embeddings.

    One-dimensional inputs are reshaped to a single row before being handed
    to ``cosine_similarity``; inputs of any other rank are passed unchanged.

    Args:
        emb1: First embedding array.
        emb2: Second embedding array.

    Returns:
        Element ``[0, 0]`` of the resulting similarity matrix.
    """
    def _as_rows(vec):
        # A 1-D vector becomes one row; anything else is left alone.
        return vec.reshape(1, -1) if vec.ndim == 1 else vec

    return cosine_similarity(_as_rows(emb1), _as_rows(emb2))[0, 0]

def get_case_alignment(case_embs, case_base):
    """Average problem/solution similarity of one case against a case base.

    For every past case, the cosine similarity of the problem embeddings and
    of the solution embeddings are averaged; those per-case values are then
    averaged over the whole case base.

    Args:
        case_embs: Pair ``(problem_embedding, solution_embedding)``.
        case_base: Iterable of such pairs for the past cases.

    Returns:
        Element ``[0][0]`` of the averaged similarity matrix.
    """
    def _as_rows(vec):
        # A 1-D vector becomes one row; anything else is left alone.
        return vec.reshape(1, -1) if vec.ndim == 1 else vec

    query_problem, query_solution = (_as_rows(vec) for vec in case_embs)
    per_case_scores = []
    for past_problem, past_solution in case_base:
        problem_sim = cosine_similarity(query_problem, _as_rows(past_problem))
        solution_sim = cosine_similarity(query_solution, _as_rows(past_solution))
        per_case_scores.append((problem_sim + solution_sim) / 2.0)
    averaged = sum(per_case_scores) / len(per_case_scores)
    return averaged[0][0]

def get_weighted_case_alignment(case_embs, case_base):
    """Case alignment weighted by problem similarity.

    Each past case contributes the mean of its problem and solution cosine
    similarities, weighted by its problem similarity divided by the sum of
    all problem similarities.

    Args:
        case_embs: Pair ``(problem_embedding, solution_embedding)``.
        case_base: Iterable of such pairs for the past cases.

    Returns:
        The weighted sum of per-case alignment scores, or ``0`` when the
        problem similarities sum to zero.
    """
    def _as_rows(vec):
        # A 1-D vector becomes one row; anything else is left alone.
        return vec.reshape(1, -1) if vec.ndim == 1 else vec

    query_problem, query_solution = (_as_rows(vec) for vec in case_embs)

    scored_cases = []  # (alignment score, problem similarity) per past case
    for past_problem, past_solution in case_base:
        problem_sim = cosine_similarity(query_problem, _as_rows(past_problem))[0][0]
        solution_sim = cosine_similarity(query_solution, _as_rows(past_solution))[0][0]
        scored_cases.append(((problem_sim + solution_sim) / 2.0, problem_sim))

    weight_total = sum(weight for _, weight in scored_cases)
    if weight_total == 0:
        return 0
    return sum(score * (weight / weight_total) for score, weight in scored_cases)

def get_question_alignment(case_embs, case_base):
    """Average problem-embedding similarity of one case against a case base.

    Only the first element of each pair is used; solution embeddings are
    ignored.

    Args:
        case_embs: Pair whose first element is the problem embedding.
        case_base: Iterable of pairs whose first element is a past problem
            embedding.

    Returns:
        Element ``[0][0]`` of the similarity matrix averaged over the case
        base.
    """
    def _as_rows(vec):
        # A 1-D vector becomes one row; anything else is left alone.
        return vec.reshape(1, -1) if vec.ndim == 1 else vec

    query_problem, _ = case_embs
    query_problem = _as_rows(query_problem)
    similarities = [
        cosine_similarity(query_problem, _as_rows(past_problem))
        for past_problem, _ in case_base
    ]
    return (sum(similarities) / len(similarities))[0][0]

def calculate_iaa_scores(cosine_scores_rounded, external_cosine_score_rounded):
    """Bin similarity scores into 0.1-wide buckets and measure agreement.

    Each inner list of ``cosine_scores_rounded`` is treated as one rater's
    scores. Scores are mapped to bucket indices, then:

    * the fraction of matching buckets is computed for every pair of raters
      and averaged;
    * every rater's buckets are compared with the bucket of the external
      score, and the match / mismatch counts are divided by the number of
      raters.

    Args:
        cosine_scores_rounded: Sequence of per-rater score sequences.
        external_cosine_score_rounded: Single reference score.

    Returns:
        Tuple ``(mean_pairwise_agreement, agreement, disagreement)``.
        ``mean_pairwise_agreement`` is NaN when fewer than two raters are
        given, since there are no pairs to compare.
    """
    cosine_scores = np.array(cosine_scores_rounded)
    # Build the edges as k/10 rather than np.arange(0, 1.1, 0.1): the latter
    # yields 0.30000000000000004, 0.6000000000000001 and 0.7000000000000001,
    # which pushed scores of exactly 0.3 / 0.6 / 0.7 into the bucket below.
    bins = np.arange(11) / 10.0

    def bin_scores(scores, bins):
        # digitize returns the 1-based index of the right edge; shift to 0-based.
        return np.digitize(scores, bins) - 1

    binned_scores = [bin_scores(scores, bins) for scores in cosine_scores]

    pairwise_similarities = []
    for i in range(len(binned_scores)):
        for j in range(i + 1, len(binned_scores)):
            similarity = np.mean(np.array(binned_scores[i]) == np.array(binned_scores[j]))
            pairwise_similarities.append(similarity)
    # Same value np.mean([]) would give, without the empty-slice warning.
    mean_iaa = np.mean(pairwise_similarities) if pairwise_similarities else np.nan

    external_binned_score = bin_scores([external_cosine_score_rounded], bins)[0]
    total_agreement = 0
    total_disagreement = 0
    for binned_score in binned_scores:
        total_agreement += np.sum(np.array(binned_score) == external_binned_score)
        total_disagreement += np.sum(np.array(binned_score) != external_binned_score)

    rater_count = len(cosine_scores_rounded)
    return mean_iaa, total_agreement / rater_count, total_disagreement / rater_count

def calculate_bm25(text1, text2):
    """Score ``text2`` against ``text1`` with BM25, relative to the best score.

    Both texts are whitespace-tokenized and indexed as a two-document corpus;
    ``text1`` is then used as the query. The score of ``text2`` is divided by
    the larger of the two document scores.

    Args:
        text1: Reference text; also used as the query.
        text2: Candidate text.

    Returns:
        ``score(text2) / max(scores)``, or ``0.0`` when either text has no
        tokens or when the maximum score is zero (the ratio would otherwise
        be 0/0).
    """
    query = text1.split()
    candidate = text2.split()
    # With no tokens on one side there is nothing to match; return early
    # instead of building an index over an empty document.
    if not query or not candidate:
        return 0.0
    bm25 = BM25Okapi([query, candidate])
    scores = bm25.get_scores(query)
    max_score = max(scores)
    # Guard the normalization: a zero maximum previously produced NaN.
    if max_score == 0:
        return 0.0
    return scores[1] / max_score

def calculate_rouge(text1, text2):
    """Return the ROUGE-L F-measure between two texts.

    A ``RougeScorer`` restricted to ``'rougeL'`` with stemming enabled is
    created on each call; ``text1`` and ``text2`` are passed to ``score`` in
    that order.

    Args:
        text1: First text given to ``score``.
        text2: Second text given to ``score``.

    Returns:
        The ``fmeasure`` field of the ``'rougeL'`` result.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(text1, text2)['rougeL']
    return rouge_l.fmeasure

def calculate_bleu(text1, text2):
    """Return the smoothed sentence-level BLEU of ``text2`` against ``text1``.

    Both texts are tokenized with ``word_tokenize``; ``text1`` is the single
    reference and ``text2`` the candidate. ``SmoothingFunction().method1`` is
    used for smoothing.

    Args:
        text1: Reference text.
        text2: Candidate text.

    Returns:
        The BLEU score clamped to the range ``[0.0, 1.0]``.
    """
    raw_score = sentence_bleu(
        [word_tokenize(text1)],
        word_tokenize(text2),
        smoothing_function=SmoothingFunction().method1,
    )
    # Clamp into [0, 1].
    return max(0.0, min(1.0, raw_score))

def calculate_bm25_rouge_bleu(gold_standard, text1, text2, text3):
    """Mean BM25, ROUGE-L and BLEU of three texts against a gold standard.

    Each metric is computed between ``gold_standard`` and each of the three
    texts, and the three values are averaged. The previous expression
    ``a + b + c / 3`` divided only the last term because of operator
    precedence, so the result was not a mean.

    Args:
        gold_standard: Reference text passed as the first argument to each
            metric function.
        text1: First candidate text.
        text2: Second candidate text.
        text3: Third candidate text.

    Returns:
        Tuple ``(bm25_mean, rouge_mean, bleu_mean)``.
    """
    candidates = (text1, text2, text3)

    def _mean_over_candidates(metric):
        # Sum first, then divide, so all three terms share the denominator.
        return sum(metric(gold_standard, candidate) for candidate in candidates) / len(candidates)

    bm25_score = _mean_over_candidates(calculate_bm25)
    rouge_score = _mean_over_candidates(calculate_rouge)
    bleu_score = _mean_over_candidates(calculate_bleu)
    return bm25_score, rouge_score, bleu_score


In [None]:
# Model names iterated by the processing loop below; that loop skips every
# entry that does not equal the `model` variable.
models = ['llama', 'falcon', 'gemma', 'mistral']

In [None]:
for mod in models:
    if mod != model:
        continue;
    print(mod)
    dataset = load_dataset("Ramitha/alqa-results-40-" + mod)
    df = pd.DataFrame(dataset['rawcases'])
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
        if mod == 'llama':
            question_emb = json.dumps(get_llama_embeddings(df.at[index, 'question']).tolist())
            snippet_emb = json.dumps(get_llama_embeddings(df.at[index, 'snippet']).tolist())
            answer_emb = json.dumps(get_llama_embeddings(df.at[index, 'answer']).tolist())
            answerGenerated_emb = json.dumps(get_llama_embeddings(df.at[index, 'answerGenerated']).tolist())
            question_answerGenerated_falcon_emb = json.dumps(get_llama_embeddings(df.at[index, 'question_answerGenerated_falcon']).tolist())
            question_answerGenerated_gemma_emb = json.dumps(get_llama_embeddings(df.at[index, 'question_answerGenerated_gemma']).tolist())
            question_answerGenerated_mistral_emb = json.dumps(get_llama_embeddings(df.at[index, 'question_answerGenerated_mistral']).tolist())
            reverse_answer_answerGenerated_gemma_emb = json.dumps(get_llama_embeddings(df.at[index, 'reverse_answer_answerGenerated_gemma']).tolist())
            reverse_answer_answerGenerated_falcon_emb = json.dumps(get_llama_embeddings(df.at[index, 'reverse_answer_answerGenerated_falcon']).tolist())
            reverse_answer_answerGenerated_mistral_emb = json.dumps(get_llama_embeddings(df.at[index, 'reverse_answer_answerGenerated_mistral']).tolist())
            judge_answer_answerGenerated_gemma_emb = json.dumps(get_llama_embeddings(df.at[index, 'judge_answer_answerGenerated_gemma']).tolist())
            judge_answer_answerGenerated_falcon_emb = json.dumps(get_llama_embeddings(df.at[index, 'judge_answer_answerGenerated_falcon']).tolist())
            judge_answer_answerGenerated_mistral_emb = json.dumps(get_llama_embeddings(df.at[index, 'judge_answer_answerGenerated_mistral']).tolist())            
            df.at[index, 'question_emb'] = question_emb
            df.at[index, 'snippet_emb'] = snippet_emb
            df.at[index, 'answer_emb'] = answer_emb
            df.at[index, 'answerGenerated_emb'] = answerGenerated_emb
            df.at[index, 'question_answerGenerated_falcon_emb'] = question_answerGenerated_falcon_emb
            df.at[index, 'question_answerGenerated_gemma_emb'] = question_answerGenerated_gemma_emb
            df.at[index, 'question_answerGenerated_mistral_emb'] = question_answerGenerated_mistral_emb
            df.at[index, 'reverse_answer_answerGenerated_gemma_emb'] = reverse_answer_answerGenerated_gemma_emb
            df.at[index, 'reverse_answer_answerGenerated_falcon_emb'] = reverse_answer_answerGenerated_falcon_emb
            df.at[index, 'reverse_answer_answerGenerated_mistral_emb'] = reverse_answer_answerGenerated_mistral_emb
            df.at[index, 'judge_answer_answerGenerated_gemma_emb'] = judge_answer_answerGenerated_gemma_emb
            df.at[index, 'judge_answer_answerGenerated_falcon_emb'] = judge_answer_answerGenerated_falcon_emb
            df.at[index, 'judge_answer_answerGenerated_mistral_emb'] = judge_answer_answerGenerated_mistral_emb            
            df.at[index, 'gold_standard_cos'] = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(answerGenerated_emb)))    
            # Cosine
            df.at[index, 'question_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_falcon_emb)))
            df.at[index, 'question_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_gemma_emb)))
            df.at[index, 'question_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_mistral_emb)))
            df.at[index, 'judge_answer_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_falcon_emb)))
            df.at[index, 'judge_answer_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_gemma_emb)))
            df.at[index, 'judge_answer_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_mistral_emb)))
            df.at[index, 'ILRSim']  = np.mean([
                df.at[index, 'question_answerGenerated_falcon_cos'],
                df.at[index, 'question_answerGenerated_gemma_cos'],
                df.at[index, 'question_answerGenerated_mistral_cos']
            ])                        
            df.at[index, 'answer_judge_cos_mean']  = np.mean([
                df.at[index, 'judge_answer_answerGenerated_falcon_cos'],
                df.at[index, 'judge_answer_answerGenerated_gemma_cos'],
                df.at[index, 'judge_answer_answerGenerated_mistral_cos']
            ])
            df.at[index, 'question_snippet_similarity'] = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(snippet_emb)))
            # Iaa
            df.at[index, 'iaa_fleiss_kappa'], df.at[index, 'iaa_fleiss_kappa_agreement'], df.at[index, 'iaa_fleiss_kappa_disagreement'] = calculate_iaa_scores([
                [round(df.at[index, 'question_answerGenerated_falcon_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_gemma_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_mistral_cos'], 2)]
            ], round(df.at[index, 'gold_standard_cos'], 2))
            # Reconstruction error
            df.at[index, 'question_reconstruction_falcon_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_falcon_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_gemma_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_gemma_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_mistral_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_mistral_emb))
            ) ** 2)
            df.at[index, 'ILRError']  = np.mean([
                df.at[index, 'question_reconstruction_falcon_error'],
                df.at[index, 'question_reconstruction_gemma_error'],
                df.at[index, 'question_reconstruction_mistral_error']
            ])
            # Case alignment
            case_ = [np.array(json.loads(question_emb)), np.array(json.loads(answerGenerated_emb))]
            case_base = [
                [np.array(json.loads(question_answerGenerated_falcon_emb)), np.array(json.loads(reverse_answer_answerGenerated_falcon_emb))],
                [np.array(json.loads(question_answerGenerated_gemma_emb)), np.array(json.loads(reverse_answer_answerGenerated_gemma_emb))],
                [np.array(json.loads(question_answerGenerated_mistral_emb)), np.array(json.loads(reverse_answer_answerGenerated_mistral_emb))]
            ]
            df.at[index, 'ILRAlign'] = get_case_alignment(case_, case_base)
            df.at[index, 'WILRAlign'] = get_weighted_case_alignment(case_, case_base)
            df.at[index, 'question_alignment'] = get_question_alignment(case_, case_base)
            df.at[index, 'question_bm25_score_mean'], df.at[index, 'question_rouge_score_mean'], df.at[index, 'question_bleu_score_mean'] = calculate_bm25_rouge_bleu(
                df.at[index, 'question'], df.at[index, 'question_answerGenerated_gemma'] , df.at[index, 'question_answerGenerated_falcon'], df.at[index, 'question_answerGenerated_mistral'])
        if mod == 'falcon':
            question_emb = json.dumps(get_falcon_embeddings(df.at[index, 'question']).tolist())
            snippet_emb = json.dumps(get_falcon_embeddings(df.at[index, 'snippet']).tolist())
            answer_emb = json.dumps(get_falcon_embeddings(df.at[index, 'answer']).tolist())
            answerGenerated_emb = json.dumps(get_falcon_embeddings(df.at[index, 'answerGenerated']).tolist())
            question_answerGenerated_llama_emb = json.dumps(get_falcon_embeddings(df.at[index, 'question_answerGenerated_llama']).tolist())
            question_answerGenerated_gemma_emb = json.dumps(get_falcon_embeddings(df.at[index, 'question_answerGenerated_gemma']).tolist())
            question_answerGenerated_mistral_emb = json.dumps(get_falcon_embeddings(df.at[index, 'question_answerGenerated_mistral']).tolist())
            reverse_answer_answerGenerated_gemma_emb = json.dumps(get_falcon_embeddings(df.at[index, 'reverse_answer_answerGenerated_gemma']).tolist())
            reverse_answer_answerGenerated_llama_emb = json.dumps(get_falcon_embeddings(df.at[index, 'reverse_answer_answerGenerated_llama']).tolist())
            reverse_answer_answerGenerated_mistral_emb = json.dumps(get_falcon_embeddings(df.at[index, 'reverse_answer_answerGenerated_mistral']).tolist())
            judge_answer_answerGenerated_gemma_emb = json.dumps(get_falcon_embeddings(df.at[index, 'judge_answer_answerGenerated_gemma']).tolist())
            judge_answer_answerGenerated_llama_emb = json.dumps(get_falcon_embeddings(df.at[index, 'judge_answer_answerGenerated_llama']).tolist())
            judge_answer_answerGenerated_mistral_emb = json.dumps(get_falcon_embeddings(df.at[index, 'judge_answer_answerGenerated_mistral']).tolist())            
            df.at[index, 'question_emb'] = question_emb
            df.at[index, 'snippet_emb'] = snippet_emb
            df.at[index, 'answer_emb'] = answer_emb
            df.at[index, 'answerGenerated_emb'] = answerGenerated_emb
            df.at[index, 'question_answerGenerated_llama_emb'] = question_answerGenerated_llama_emb
            df.at[index, 'question_answerGenerated_gemma_emb'] = question_answerGenerated_gemma_emb
            df.at[index, 'question_answerGenerated_mistral_emb'] = question_answerGenerated_mistral_emb
            df.at[index, 'reverse_answer_answerGenerated_gemma_emb'] = reverse_answer_answerGenerated_gemma_emb
            df.at[index, 'reverse_answer_answerGenerated_llama_emb'] = reverse_answer_answerGenerated_llama_emb
            df.at[index, 'reverse_answer_answerGenerated_mistral_emb'] = reverse_answer_answerGenerated_mistral_emb
            df.at[index, 'judge_answer_answerGenerated_gemma_emb'] = judge_answer_answerGenerated_gemma_emb
            df.at[index, 'judge_answer_answerGenerated_llama_emb'] = judge_answer_answerGenerated_llama_emb
            df.at[index, 'judge_answer_answerGenerated_mistral_emb'] = judge_answer_answerGenerated_mistral_emb            
            df.at[index, 'gold_standard_cos'] = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(answerGenerated_emb)))
            # Cosine
            df.at[index, 'question_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_llama_emb)))
            df.at[index, 'question_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_gemma_emb)))
            df.at[index, 'question_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_mistral_emb)))
            df.at[index, 'judge_answer_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_llama_emb)))
            df.at[index, 'judge_answer_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_gemma_emb)))
            df.at[index, 'judge_answer_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_mistral_emb)))
            df.at[index, 'ILRSim']  = np.mean([
                df.at[index, 'question_answerGenerated_llama_cos'],
                df.at[index, 'question_answerGenerated_gemma_cos'],
                df.at[index, 'question_answerGenerated_mistral_cos']
            ])
            df.at[index, 'answer_judge_cos_mean']  = np.mean([
                df.at[index, 'judge_answer_answerGenerated_llama_cos'],
                df.at[index, 'judge_answer_answerGenerated_gemma_cos'],
                df.at[index, 'judge_answer_answerGenerated_mistral_cos']
            ])
            df.at[index, 'question_snippet_similarity'] = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(snippet_emb)))
            # Iaa
            df.at[index, 'iaa_fleiss_kappa'], df.at[index, 'iaa_fleiss_kappa_agreement'], df.at[index, 'iaa_fleiss_kappa_disagreement'] = calculate_iaa_scores([
                [round(df.at[index, 'question_answerGenerated_llama_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_gemma_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_mistral_cos'], 2)]
            ], round(df.at[index, 'gold_standard_cos'], 2))
            # Reconstruction error
            df.at[index, 'question_reconstruction_llama_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_llama_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_gemma_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_gemma_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_mistral_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_mistral_emb))
            ) ** 2)
            df.at[index, 'ILRError']  = np.mean([
                df.at[index, 'question_reconstruction_llama_error'],
                df.at[index, 'question_reconstruction_gemma_error'],
                df.at[index, 'question_reconstruction_mistral_error']
            ])
            # Case alignment
            case_ = [np.array(json.loads(question_emb)), np.array(json.loads(answerGenerated_emb))]
            case_base = [
                [np.array(json.loads(question_answerGenerated_llama_emb)), np.array(json.loads(reverse_answer_answerGenerated_llama_emb))],
                [np.array(json.loads(question_answerGenerated_gemma_emb)), np.array(json.loads(reverse_answer_answerGenerated_gemma_emb))],
                [np.array(json.loads(question_answerGenerated_mistral_emb)), np.array(json.loads(reverse_answer_answerGenerated_mistral_emb))]
            ]
            df.at[index, 'ILRAlign'] = get_case_alignment(case_, case_base)
            df.at[index, 'WILRAlign'] = get_weighted_case_alignment(case_, case_base)
            df.at[index, 'question_alignment'] = get_question_alignment(case_, case_base)
            df.at[index, 'question_bm25_score_mean'], df.at[index, 'question_rouge_score_mean'], df.at[index, 'question_bleu_score_mean'] = calculate_bm25_rouge_bleu(
                df.at[index, 'question'], df.at[index, 'question_answerGenerated_gemma'] , df.at[index, 'question_answerGenerated_mistral'], df.at[index, 'question_answerGenerated_llama'])
        if mod == 'gemma':
            question_emb = json.dumps(get_gemma_embeddings(df.at[index, 'question']).tolist())
            snippet_emb = json.dumps(get_gemma_embeddings(df.at[index, 'snippet']).tolist())
            answer_emb = json.dumps(get_gemma_embeddings(df.at[index, 'answer']).tolist())
            answerGenerated_emb = json.dumps(get_gemma_embeddings(df.at[index, 'answerGenerated']).tolist())
            question_answerGenerated_falcon_emb = json.dumps(get_gemma_embeddings(df.at[index, 'question_answerGenerated_falcon']).tolist())
            question_answerGenerated_llama_emb = json.dumps(get_gemma_embeddings(df.at[index, 'question_answerGenerated_llama']).tolist())
            question_answerGenerated_mistral_emb = json.dumps(get_gemma_embeddings(df.at[index, 'question_answerGenerated_mistral']).tolist())
            reverse_answer_answerGenerated_falcon_emb = json.dumps(get_gemma_embeddings(df.at[index, 'reverse_answer_answerGenerated_falcon']).tolist())
            reverse_answer_answerGenerated_llama_emb = json.dumps(get_gemma_embeddings(df.at[index, 'reverse_answer_answerGenerated_llama']).tolist())
            reverse_answer_answerGenerated_mistral_emb = json.dumps(get_gemma_embeddings(df.at[index, 'reverse_answer_answerGenerated_mistral']).tolist())
            judge_answer_answerGenerated_falcon_emb = json.dumps(get_gemma_embeddings(df.at[index, 'judge_answer_answerGenerated_falcon']).tolist())
            judge_answer_answerGenerated_llama_emb = json.dumps(get_gemma_embeddings(df.at[index, 'judge_answer_answerGenerated_llama']).tolist())
            judge_answer_answerGenerated_mistral_emb = json.dumps(get_gemma_embeddings(df.at[index, 'judge_answer_answerGenerated_mistral']).tolist())            
            df.at[index, 'question_emb'] = question_emb
            df.at[index, 'snippet_emb'] = snippet_emb
            df.at[index, 'answer_emb'] = answer_emb
            df.at[index, 'answerGenerated_emb'] = answerGenerated_emb
            df.at[index, 'question_answerGenerated_falcon_emb'] = question_answerGenerated_falcon_emb
            df.at[index, 'question_answerGenerated_llama_emb'] = question_answerGenerated_llama_emb
            df.at[index, 'question_answerGenerated_mistral_emb'] = question_answerGenerated_mistral_emb
            df.at[index, 'reverse_answer_answerGenerated_llama_emb'] = reverse_answer_answerGenerated_llama_emb
            df.at[index, 'reverse_answer_answerGenerated_falcon_emb'] = reverse_answer_answerGenerated_falcon_emb
            df.at[index, 'reverse_answer_answerGenerated_mistral_emb'] = reverse_answer_answerGenerated_mistral_emb
            df.at[index, 'judge_answer_answerGenerated_llama_emb'] = judge_answer_answerGenerated_llama_emb
            df.at[index, 'judge_answer_answerGenerated_falcon_emb'] = judge_answer_answerGenerated_falcon_emb
            df.at[index, 'judge_answer_answerGenerated_mistral_emb'] = judge_answer_answerGenerated_mistral_emb
            df.at[index, 'gold_standard_cos'] = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(answerGenerated_emb)))    
            # Cosine
            df.at[index, 'question_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_falcon_emb)))
            df.at[index, 'question_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_llama_emb)))
            df.at[index, 'question_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_mistral_emb)))
            df.at[index, 'judge_answer_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_falcon_emb)))
            df.at[index, 'judge_answer_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_llama_emb)))
            df.at[index, 'judge_answer_answerGenerated_mistral_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_mistral_emb)))
            df.at[index, 'ILRSim']  = np.mean([
                df.at[index, 'question_answerGenerated_falcon_cos'],
                df.at[index, 'question_answerGenerated_llama_cos'],
                df.at[index, 'question_answerGenerated_mistral_cos']
            ])
            df.at[index, 'answer_judge_cos_mean']  = np.mean([
                df.at[index, 'judge_answer_answerGenerated_falcon_cos'],
                df.at[index, 'judge_answer_answerGenerated_llama_cos'],
                df.at[index, 'judge_answer_answerGenerated_mistral_cos']
            ])
            df.at[index, 'question_snippet_similarity'] = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(snippet_emb)))
            # Iaa
            df.at[index, 'iaa_fleiss_kappa'], df.at[index, 'iaa_fleiss_kappa_agreement'], df.at[index, 'iaa_fleiss_kappa_disagreement'] = calculate_iaa_scores([
                [round(df.at[index, 'question_answerGenerated_falcon_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_llama_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_mistral_cos'], 2)]
            ], round(df.at[index, 'gold_standard_cos'], 2))
            # Reconstruction error
            df.at[index, 'question_reconstruction_falcon_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_falcon_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_llama_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_llama_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_mistral_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_mistral_emb))
            ) ** 2)
            df.at[index, 'ILRError']  = np.mean([
                df.at[index, 'question_reconstruction_falcon_error'],
                df.at[index, 'question_reconstruction_llama_error'],
                df.at[index, 'question_reconstruction_mistral_error']
            ])
            # Case alignment
            case_ = [np.array(json.loads(question_emb)), np.array(json.loads(answerGenerated_emb))]
            case_base = [
                [np.array(json.loads(question_answerGenerated_falcon_emb)), np.array(json.loads(reverse_answer_answerGenerated_falcon_emb))],
                [np.array(json.loads(question_answerGenerated_llama_emb)), np.array(json.loads(reverse_answer_answerGenerated_llama_emb))],
                [np.array(json.loads(question_answerGenerated_mistral_emb)), np.array(json.loads(reverse_answer_answerGenerated_mistral_emb))]
            ]
            df.at[index, 'ILRAlign'] = get_case_alignment(case_, case_base)
            df.at[index, 'WILRAlign'] = get_weighted_case_alignment(case_, case_base)
            df.at[index, 'question_alignment'] = get_question_alignment(case_, case_base)
            df.at[index, 'question_bm25_score_mean'], df.at[index, 'question_rouge_score_mean'], df.at[index, 'question_bleu_score_mean'] = calculate_bm25_rouge_bleu(
                df.at[index, 'question'], df.at[index, 'question_answerGenerated_llama'] , df.at[index, 'question_answerGenerated_falcon'], df.at[index, 'question_answerGenerated_mistral'])
        # Branch for mod == 'mistral': every text field of the current row is
        # embedded with get_mistral_embeddings, and the reconstructed / reverse /
        # judge columns that are read come from the three models llama, gemma and
        # falcon. All derived metrics are written back into the same row of `df`.
        if mod == 'mistral':
            # Each embedding is converted with .tolist() and serialised to a JSON
            # string so that it can be stored in a single DataFrame cell; it is
            # decoded again with json.loads wherever a numeric array is needed.
            question_emb = json.dumps(get_mistral_embeddings(df.at[index, 'question']).tolist())
            snippet_emb = json.dumps(get_mistral_embeddings(df.at[index, 'snippet']).tolist())
            answer_emb = json.dumps(get_mistral_embeddings(df.at[index, 'answer']).tolist())
            answerGenerated_emb = json.dumps(get_mistral_embeddings(df.at[index, 'answerGenerated']).tolist())
            question_answerGenerated_llama_emb = json.dumps(get_mistral_embeddings(df.at[index, 'question_answerGenerated_llama']).tolist())
            question_answerGenerated_gemma_emb = json.dumps(get_mistral_embeddings(df.at[index, 'question_answerGenerated_gemma']).tolist())
            question_answerGenerated_falcon_emb = json.dumps(get_mistral_embeddings(df.at[index, 'question_answerGenerated_falcon']).tolist())
            reverse_answer_answerGenerated_gemma_emb = json.dumps(get_mistral_embeddings(df.at[index, 'reverse_answer_answerGenerated_gemma']).tolist())
            reverse_answer_answerGenerated_falcon_emb = json.dumps(get_mistral_embeddings(df.at[index, 'reverse_answer_answerGenerated_falcon']).tolist())
            reverse_answer_answerGenerated_llama_emb = json.dumps(get_mistral_embeddings(df.at[index, 'reverse_answer_answerGenerated_llama']).tolist())
            judge_answer_answerGenerated_gemma_emb = json.dumps(get_mistral_embeddings(df.at[index, 'judge_answer_answerGenerated_gemma']).tolist())
            judge_answer_answerGenerated_falcon_emb = json.dumps(get_mistral_embeddings(df.at[index, 'judge_answer_answerGenerated_falcon']).tolist())
            judge_answer_answerGenerated_llama_emb = json.dumps(get_mistral_embeddings(df.at[index, 'judge_answer_answerGenerated_llama']).tolist())            
            # Similarity between the 'answer' embedding and the 'answerGenerated'
            # embedding; reused below as the reference value for the IAA call.
            df.at[index, 'gold_standard_cos'] = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(answerGenerated_emb)))            
            # Persist the JSON-encoded embeddings in the row.
            df.at[index, 'question_emb'] = question_emb
            df.at[index, 'snippet_emb'] = snippet_emb
            df.at[index, 'answer_emb'] = answer_emb
            df.at[index, 'answerGenerated_emb'] = answerGenerated_emb
            df.at[index, 'question_answerGenerated_falcon_emb'] = question_answerGenerated_falcon_emb
            df.at[index, 'question_answerGenerated_gemma_emb'] = question_answerGenerated_gemma_emb
            df.at[index, 'question_answerGenerated_llama_emb'] = question_answerGenerated_llama_emb
            df.at[index, 'reverse_answer_answerGenerated_gemma_emb'] = reverse_answer_answerGenerated_gemma_emb
            df.at[index, 'reverse_answer_answerGenerated_falcon_emb'] = reverse_answer_answerGenerated_falcon_emb
            df.at[index, 'reverse_answer_answerGenerated_llama_emb'] = reverse_answer_answerGenerated_llama_emb 
            df.at[index, 'judge_answer_answerGenerated_gemma_emb'] = judge_answer_answerGenerated_gemma_emb
            df.at[index, 'judge_answer_answerGenerated_falcon_emb'] = judge_answer_answerGenerated_falcon_emb
            df.at[index, 'judge_answer_answerGenerated_llama_emb'] = judge_answer_answerGenerated_llama_emb            
            # Cosine similarities.
            # question_* columns compare the question embedding with each
            # model's 'question_answerGenerated_*' embedding; judge_* columns
            # compare the 'answer' embedding with each model's judge-answer embedding.
            df.at[index, 'question_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_llama_emb)))
            df.at[index, 'question_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_gemma_emb)))
            df.at[index, 'question_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(question_answerGenerated_falcon_emb)))
            df.at[index, 'judge_answer_answerGenerated_falcon_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_falcon_emb)))
            df.at[index, 'judge_answer_answerGenerated_gemma_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_gemma_emb)))
            df.at[index, 'judge_answer_answerGenerated_llama_cos']  = get_cosine_similarity(np.array(json.loads(answer_emb)), np.array(json.loads(judge_answer_answerGenerated_llama_emb)))
            # ILRSim: arithmetic mean of the three question-level cosines above.
            df.at[index, 'ILRSim']  = np.mean([
                df.at[index, 'question_answerGenerated_llama_cos'],
                df.at[index, 'question_answerGenerated_gemma_cos'],
                df.at[index, 'question_answerGenerated_falcon_cos']
            ])            
            # Arithmetic mean of the three judge-answer cosines above.
            df.at[index, 'answer_judge_cos_mean']  = np.mean([
                df.at[index, 'judge_answer_answerGenerated_llama_cos'],
                df.at[index, 'judge_answer_answerGenerated_gemma_cos'],
                df.at[index, 'judge_answer_answerGenerated_falcon_cos']
            ])
            df.at[index, 'question_snippet_similarity'] = get_cosine_similarity(np.array(json.loads(question_emb)), np.array(json.loads(snippet_emb)))
            # IAA: calculate_iaa_scores receives one single-element list per
            # model (the question cosine rounded to 2 decimals) plus the rounded
            # gold_standard_cos, and its three return values fill the three
            # iaa_fleiss_kappa* columns.
            # NOTE(review): the helper is defined elsewhere; its exact
            # semantics (how "agreement" is derived) are not visible here.
            df.at[index, 'iaa_fleiss_kappa'], df.at[index, 'iaa_fleiss_kappa_agreement'], df.at[index, 'iaa_fleiss_kappa_disagreement'] = calculate_iaa_scores([
                [round(df.at[index, 'question_answerGenerated_llama_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_gemma_cos'], 2)],
                [round(df.at[index, 'question_answerGenerated_falcon_cos'], 2)]
            ], round(df.at[index, 'gold_standard_cos'], 2))
            # Reconstruction error: mean of the squared element-wise difference
            # between the question embedding and each model's
            # 'question_answerGenerated_*' embedding.
            df.at[index, 'question_reconstruction_llama_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_llama_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_gemma_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_gemma_emb))
            ) ** 2)
            df.at[index, 'question_reconstruction_falcon_error'] = np.mean((
                np.array(json.loads(question_emb)) - np.array(json.loads(question_answerGenerated_falcon_emb))
            ) ** 2)
            # ILRError: arithmetic mean of the three reconstruction errors.
            df.at[index, 'ILRError']  = np.mean([
                df.at[index, 'question_reconstruction_llama_error'],
                df.at[index, 'question_reconstruction_gemma_error'],
                df.at[index, 'question_reconstruction_falcon_error']
            ])
            # Case alignment: `case_` pairs the question embedding with the
            # 'answerGenerated' embedding; `case_base` holds one
            # [question_answerGenerated_*, reverse_answer_answerGenerated_*]
            # embedding pair per model (llama, gemma, falcon). Both are passed
            # unchanged to the three alignment helpers defined elsewhere.
            case_ = [np.array(json.loads(question_emb)), np.array(json.loads(answerGenerated_emb))]
            case_base = [
                [np.array(json.loads(question_answerGenerated_llama_emb)), np.array(json.loads(reverse_answer_answerGenerated_llama_emb))],
                [np.array(json.loads(question_answerGenerated_gemma_emb)), np.array(json.loads(reverse_answer_answerGenerated_gemma_emb))],
                [np.array(json.loads(question_answerGenerated_falcon_emb)), np.array(json.loads(reverse_answer_answerGenerated_falcon_emb))]
            ]
            df.at[index, 'ILRAlign'] = get_case_alignment(case_, case_base)
            df.at[index, 'WILRAlign'] = get_weighted_case_alignment(case_, case_base)
            df.at[index, 'question_alignment'] = get_question_alignment(case_, case_base)
            # Lexical metrics are computed on the raw text columns, not on embeddings.
            # NOTE(review): the model texts are passed as (gemma, falcon, llama),
            # unlike the llama, gemma, falcon order used everywhere else in this
            # branch; confirm calculate_bm25_rouge_bleu is order-insensitive.
            df.at[index, 'question_bm25_score_mean'], df.at[index, 'question_rouge_score_mean'], df.at[index, 'question_bleu_score_mean'] = calculate_bm25_rouge_bleu(
                df.at[index, 'question'], df.at[index, 'question_answerGenerated_gemma'] , df.at[index, 'question_answerGenerated_falcon'], df.at[index, 'question_answerGenerated_llama'])   

    # Wrap the populated DataFrame in a DatasetDict with a single split
    # named 'rawcases'.
    hf_dataset = DatasetDict({
        'rawcases': Dataset.from_pandas(df)
    })
    # Upload to the Hugging Face Hub; the repository name embeds `mod`, so each
    # value of `mod` is pushed to its own repository. The notebook calls
    # login(token=...) in an earlier cell.
    # NOTE(review): presumably that login supplies the credentials used here; confirm.
    hf_dataset.push_to_hub("Ramitha/alqa-results-40-" + mod + "-self")