In [6]:
import pandas as pd
import numpy as np
import os
from pathlib import Path


In [17]:
# Working directory of the kernel; taken to be the folder that holds this notebook.
notebook_dir = Path(os.getcwd())
# The project root is the folder one level above it.
project_root = str(notebook_dir.parent)

# Folder with the raw evaluation results (input) and folder for the cleaned ones (output).
not_cleaned_results_path = project_root + "/results/not_cleaned"
cleaned_results_path = project_root + "/results/cleaned"

# Report whether each results folder is present. Nothing is created here;
# the output folders are created when the cleaned tables are written.
for results_dir in (not_cleaned_results_path, cleaned_results_path):
    if os.path.exists(results_dir):
        print(f"Directory found: {results_dir}")
    else:
        print(f"Directory not found: {results_dir}")

Directory found: c:\Users\todor\Repositories\ir-project/results/not_cleaned
Directory found: c:\Users\todor\Repositories\ir-project/results/cleaned


In [47]:
# Agents and datasets for which evaluation results are loaded.
agents = ["pinecone", "openai", "hybrid", "auto_merge"]
datasets = ["nq", "hotpotqa", "sse_single", "sse_multi"]
# Names of the metric columns; not used for filtering here, the full tables are kept.
metrics_columns = [
    "context_precision",
    "context_recall",
    "faithfulness",
    "factual_correctness(mode=f1)",
    "context_entity_recall",
    "answer_relevancy",
]

# not_cleaned_results[dataset][agent] -> raw evaluation table exactly as read from disk.
not_cleaned_results = {name: {} for name in datasets}
for dataset in datasets:
    for agent in agents:
        path = not_cleaned_results_path + f"/{dataset}/{agent}/evaluation_results.csv"
        if not os.path.exists(path):
            # A missing file is only reported; this (dataset, agent) pair gets no entry.
            print(f"Evaluation results file not found for {path}")
            continue

        full_table = pd.read_csv(path)
        not_cleaned_results[dataset][agent] = full_table
        print(f"Number of rows for {dataset} and {agent}: {full_table.shape[0]}")

Number of rows for nq and pinecone: 100
Number of rows for nq and openai: 50
Number of rows for nq and hybrid: 150
Number of rows for nq and auto_merge: 50
Number of rows for hotpotqa and pinecone: 100
Number of rows for hotpotqa and openai: 50
Number of rows for hotpotqa and hybrid: 200
Number of rows for hotpotqa and auto_merge: 300
Number of rows for sse_single and pinecone: 50
Number of rows for sse_single and openai: 50
Number of rows for sse_single and hybrid: 49
Number of rows for sse_single and auto_merge: 50
Number of rows for sse_multi and pinecone: 22
Number of rows for sse_multi and openai: 22
Number of rows for sse_multi and hybrid: 44
Number of rows for sse_multi and auto_merge: 66


During the experiments, results were initially appended to the same evaluation dataset, so the results of successive agents were stacked one after another instead of being written fresh for each agent. The solution is to keep only the last n rows of each table: those belong to the agent in question, while any rows before them come from agents that were evaluated earlier. Below we compute the size each result table should actually have.

In [48]:
# min_per_dataset[dataset] -> smallest row count among the raw tables of all agents.
min_per_dataset = {}
for dataset in datasets:
    # Row count of every agent's raw table for this dataset.
    row_counts = (not_cleaned_results[dataset][agent].shape[0] for agent in agents)
    # The default keeps the value at infinity when there is no agent to look at.
    min_num_rows = min(row_counts, default=float('inf'))
    min_per_dataset[dataset] = min_num_rows
    print(f"Minimum number of rows is {min_num_rows} for dataset: {dataset}")


Minimum number of rows is 50 for dataset: nq
Minimum number of rows is 50 for dataset: hotpotqa
Minimum number of rows is 49 for dataset: sse_single
Minimum number of rows is 22 for dataset: sse_multi


Additionally, for hotpotqa we first evaluated on 100 queries but, due to excessive costs, later cut this down to 50. The reduction was made only after all agents except openai had been evaluated. For those agents we need the first 50 samples of their own run, so for hotpotqa it's not the last 50 rows we need but the 50 rows before the last 50 rows.

In [49]:
# min_per_dataset_not_openai[dataset] -> smallest row count among the raw tables
# of every agent except "openai".
min_per_dataset_not_openai = {}
for dataset in datasets:
    # Row counts of the raw tables, leaving the openai agent out.
    row_counts = (
        not_cleaned_results[dataset][agent].shape[0]
        for agent in agents
        if agent != "openai"
    )
    # The default keeps the value at infinity when no agent remains after filtering.
    min_num_rows = min(row_counts, default=float('inf'))
    min_per_dataset_not_openai[dataset] = min_num_rows
    print(f"Minimum number of rows is {min_num_rows} for dataset: {dataset}")


Minimum number of rows is 50 for dataset: nq
Minimum number of rows is 100 for dataset: hotpotqa
Minimum number of rows is 49 for dataset: sse_single
Minimum number of rows is 22 for dataset: sse_multi


In [50]:
# cleaned_results[dataset][agent] -> the rows of the raw table that belong to this
# agent's own evaluation run, trimmed to the common size for the dataset.
cleaned_results = {dataset: {} for dataset in datasets}
for dataset in datasets:
    # Number of rows every agent must end up with for this dataset.
    target_rows = min_per_dataset[dataset]
    # Length of one complete run of the agents other than openai.
    # NOTE(review): assumes the smallest non-openai table holds exactly one run - confirm.
    full_run_rows = min_per_dataset_not_openai[dataset]

    for agent in agents:
        raw_table = not_cleaned_results[dataset][agent]

        if raw_table.shape[0] == target_rows:
            # Already the right size: nothing was appended in front of this run.
            cleaned_metrics = raw_table
        elif agent != "openai" and dataset == "hotpotqa":
            # These agents ran on the larger hotpotqa query set. Their own run is the
            # last `full_run_rows` rows, and the queries shared with openai are the
            # first `target_rows` of that run. Using the run length itself (rather
            # than 2 * target_rows) keeps the window right even when the full run is
            # not exactly twice the reduced size.
            cleaned_metrics = raw_table.tail(full_run_rows).head(target_rows)
        else:
            # The agent's own run is the last `target_rows` rows; earlier rows come
            # from previously evaluated agents. tail() also returns an empty frame
            # for 0 rows, whereas iloc[-0:] would return the whole table.
            cleaned_metrics = raw_table.tail(target_rows)

        cleaned_results[dataset][agent] = cleaned_metrics
        print(f"Number of rows for {dataset} and {agent}: {cleaned_metrics.shape[0]}")



Number of rows for nq and pinecone: 50
Number of rows for nq and openai: 50
Number of rows for nq and hybrid: 50
Number of rows for nq and auto_merge: 50
Number of rows for hotpotqa and pinecone: 50
Number of rows for hotpotqa and openai: 50
Number of rows for hotpotqa and hybrid: 50
Number of rows for hotpotqa and auto_merge: 50
Number of rows for sse_single and pinecone: 49
Number of rows for sse_single and openai: 49
Number of rows for sse_single and hybrid: 49
Number of rows for sse_single and auto_merge: 49
Number of rows for sse_multi and pinecone: 22
Number of rows for sse_multi and openai: 22
Number of rows for sse_multi and hybrid: 22
Number of rows for sse_multi and auto_merge: 22


In [52]:
# Write each cleaned table to <cleaned_results_path>/<dataset>/<agent>/evaluation_results.csv
for dataset in datasets:
    for agent in agents:
        agent_dir = cleaned_results_path + f"/{dataset}/{agent}"
        # exist_ok=True: re-running the cell is fine when the folder already exists.
        os.makedirs(agent_dir, exist_ok=True)

        csv_path = cleaned_results_path + f"/{dataset}/{agent}/evaluation_results.csv"
        # index=False keeps the DataFrame index out of the written file.
        cleaned_results[dataset][agent].to_csv(csv_path, index=False)