In [None]:
# Required Libraries
import os
import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Raw counts needed for micro-averaged Precision / Recall / F1 at cut-off K
def get_micro_scores_at_K(actual, predicted, k):
    """Return the counts behind precision/recall at cut-off ``k`` for one query.

    Parameters
    ----------
    actual : iterable of hashable
        Identifiers of the relevant documents (duplicates are collapsed).
    predicted : sequence of hashable
        Ranked identifiers; only the first ``k`` entries are considered.
    k : int
        Cut-off rank.

    Returns
    -------
    tuple of (int, int, int)
        ``(hits, n_relevant, n_retrieved)`` where ``hits`` is the number of
        distinct relevant identifiers found in the top ``k``, ``n_relevant``
        is the number of distinct relevant identifiers, and ``n_retrieved``
        is always ``k`` itself, even when ``predicted`` has fewer entries.
    """
    top_k = frozenset(predicted[:k])
    relevant = frozenset(actual)

    hits = sum(1 for doc in relevant if doc in top_k)

    return hits, len(relevant), k

# Function to calculate F1, Precision, and Recall vs K
def get_f1_vs_K(gold_labels, similarity_df, max_k=20, excluded_query_ids=(1864396, 1508893)):
    """Compute micro-averaged precision, recall and F1 for K = 1..max_k.

    Parameters
    ----------
    gold_labels : pandas.DataFrame
        Must contain a ``query_case_id`` column as its first column. Every
        other column header is a candidate id (convertible with ``int``) and
        each cell is a relevance label; labels ``1`` and ``-2`` are counted
        as relevant.
    similarity_df : pandas.DataFrame
        One row per query. The query id is read from the ``query_case_id``
        column, or from ``Unnamed: 0`` when that column is absent. All cells
        after the first column are similarity scores (convertible with
        ``float``).
        NOTE(review): scores are paired *positionally* with the candidate
        columns of ``gold_labels``; this assumes both frames list the
        candidates in the same order -- confirm against the file producer.
    max_k : int, default 20
        Largest cut-off to evaluate.
    excluded_query_ids : iterable of int, default (1864396, 1508893)
        Query ids that are skipped entirely.

    Returns
    -------
    dict
        ``{"recall_vs_K": [...], "precision_vs_K": [...], "f1_vs_K": [...]}``,
        each list holding one value per K in increasing order of K. A metric
        whose denominator is zero (e.g. no query has any relevant candidate)
        is reported as 0.0 rather than NaN.

    Raises
    ------
    IndexError
        If a query id of ``similarity_df`` has no row in ``gold_labels``.
    """
    # Resolve the id column before it is first used, so the 'Unnamed: 0'
    # fallback actually takes effect for frames without 'query_case_id'.
    column_name = 'query_case_id' if 'query_case_id' in similarity_df.columns else 'Unnamed: 0'
    excluded = tuple(excluded_query_ids)

    # Everything below is independent of the query and of K: compute it once.
    gold_columns = list(gold_labels.columns)[1:]
    candidate_labels = np.asarray(gold_columns)
    candidate_docs = [int(i) for i in gold_columns]
    candidate_doc_set = set(candidate_docs)
    gold_query_ids = gold_labels["query_case_id"].values
    sim_query_ids = similarity_df[column_name].values

    # The ranking of a query does not depend on K, so build it once per query
    # and only vary the cut-off afterwards.
    ranked_per_query = []
    for raw_query_id in tqdm(sim_query_ids):
        if raw_query_id in excluded:  # Exclude specific cases
            continue

        gold = gold_labels[gold_query_ids == raw_query_id].values[0][1:]
        actual = [str(i) for i in candidate_labels[np.logical_or(gold == 1, gold == -2)]]

        similarity_scores = similarity_df[sim_query_ids == raw_query_id].values[0][1:]
        query_case_id = int(raw_query_id)  # Ensure it's a Python int

        # sorted() is stable, also with reverse=True, so candidates with equal
        # scores keep their original column order.
        sorted_candidates = [
            doc
            for _, doc in sorted(
                zip(similarity_scores, candidate_docs),
                key=lambda pair: float(pair[0]),
                reverse=True,
            )
        ]

        # A query must never be counted as its own retrieved candidate.
        if query_case_id not in candidate_doc_set:
            print(f"⚠️ query_case_id {query_case_id} is missing from candidate_docs")
        elif query_case_id in sorted_candidates:
            sorted_candidates.remove(query_case_id)
        else:
            print(f"⚠️ query_case_id {query_case_id} is missing from sorted_candidates after sorting")

        ranked_per_query.append((actual, [str(i) for i in sorted_candidates]))

    precision_vs_K = []
    recall_vs_K = []
    f1_vs_K = []

    for k in range(1, max_k + 1):
        correct_total = 0
        relevant_total = 0
        retrieved_total = 0

        for actual, ranked in ranked_per_query:
            correctly_retrieved, relevant_cases, retrieved_cases = get_micro_scores_at_K(actual, ranked, k)
            correct_total += correctly_retrieved
            relevant_total += relevant_cases
            retrieved_total += retrieved_cases

        # Micro-averaging: pool the counts over all queries, then divide.
        # Guard the denominators so an empty evaluation yields 0.0, not NaN.
        recall_score = correct_total / relevant_total if relevant_total else 0.0
        precision_score = correct_total / retrieved_total if retrieved_total else 0.0

        if recall_score == 0 or precision_score == 0:
            f1_score = 0
        else:
            f1_score = (2 * precision_score * recall_score) / (precision_score + recall_score)

        recall_vs_K.append(recall_score)
        precision_vs_K.append(precision_score)
        f1_vs_K.append(f1_score)

    return {"recall_vs_K": recall_vs_K, "precision_vs_K": precision_vs_K, "f1_vs_K": f1_vs_K}

# Function to process labels from JSON file
def obtain_sim_df_from_labels(labels):
    """Turn the parsed label JSON into a query x candidate relevance table.

    Parameters
    ----------
    labels : dict
        Must provide ``labels["Query Set"]``, a list of entries with an
        ``"id"`` string and a ``"relevant candidates"`` list of strings, and
        ``labels["Candidate Set"]``, a list of entries with an ``"id"``
        string. The numeric id of an entry is the first run of digits in the
        corresponding string.

    Returns
    -------
    pandas.DataFrame
        One row per query (ascending query number). The first column,
        ``query_case_id``, holds the query number; the remaining columns are
        the candidate numbers in ascending order. A cell is ``-1`` where the
        candidate is the query itself, ``1`` where the candidate is listed as
        relevant for the query, and ``0`` otherwise.

    Raises
    ------
    IndexError
        If a query or candidate ``"id"`` contains no digits. Entries of
        ``"relevant candidates"`` without digits are silently ignored.
    """
    def id_number(entry):
        # First run of digits in the entry's "id".
        return int(re.findall(r'\d+', entry["id"])[0])

    # Relevant candidates are kept as sets: membership is tested once per
    # (query, candidate) pair below, so a list scan there would be quadratic.
    relevant_by_query = {}
    for query in labels["Query Set"]:
        digit_runs = (re.findall(r'\d+', name) for name in query["relevant candidates"])
        # Only take non-empty matches
        relevant_by_query[id_number(query)] = {int(runs[0]) for runs in digit_runs if runs}

    candidate_numbers = sorted(id_number(candidate) for candidate in labels["Candidate Set"])

    row_wise_dataframe = {}
    for query_number in sorted(relevant_by_query):
        relevant = relevant_by_query[query_number]
        row_wise_dataframe[query_number] = {
            candidate: -1 if candidate == query_number else int(candidate in relevant)
            for candidate in candidate_numbers
        }

    # The dict of dicts is column-oriented (one column per query), hence the
    # transpose to get one row per query.
    df = pd.DataFrame(row_wise_dataframe).T
    # Pass a concrete list rather than a dict view to DataFrame.insert.
    df.insert(loc=0, column='query_case_id', value=list(row_wise_dataframe.keys()))
    return df.reset_index(drop=True)


In [None]:
# Load True Labels JSON
# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open("true_labels.json", 'r', encoding="utf-8") as f:
    true_labels = json.load(f)

gold_labels_df = obtain_sim_df_from_labels(true_labels)  # Convert JSON to DataFrame

# Load BM25 Similarity CSV
sim_df = pd.read_csv("bm25_similarity.csv")

In [None]:
# Evaluate the similarity ranking against the gold labels; `results` maps
# "recall_vs_K", "precision_vs_K" and "f1_vs_K" to one value per K.
results = get_f1_vs_K(gold_labels=gold_labels_df, similarity_df=sim_df)

In [None]:
# Plot Precision, Recall, and F1-score vs K
# The x axis is derived from the number of computed points instead of a
# hard-coded 1..20, so the plot stays valid if the evaluated K range changes.
k_values = range(1, len(results["precision_vs_K"]) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, results["precision_vs_K"], label="Precision", marker="o", linestyle="dashed")
ax.plot(k_values, results["recall_vs_K"], label="Recall", marker="s", linestyle="dashed")
ax.plot(k_values, results["f1_vs_K"], label="F1-score", marker="^", linestyle="dashed")

ax.set_xlabel("K")
ax.set_ylabel("Score")
ax.set_title("Precision, Recall, and F1-score vs K")
ax.legend()
ax.grid()
plt.show()