## Evidence Alignment Analysis

### Data Ingestion
- load the evidence alignment results
- unpack the experiment config results

In [None]:
import os
from typing import List, Dict
import json
import pandas as pd
import sys
sys.path.append('/home/ubuntu/BioDSA') # this will need to be updated for other machines/folders
from src import (
    REPO_ROOT,
    TOP_LEVEL_LOG_DIR,
    HYPOTHESIS_DIR
)

# directory holding the evidence-alignment evaluation logs (a sub-folder of TOP_LEVEL_LOG_DIR)
LOG_DIR = os.path.join(TOP_LEVEL_LOG_DIR, "eval_evidence_alignment")
# echo the resolved path so it can be checked in the cell output
print(LOG_DIR)
# location of the dataset to evaluate
BASE_DATASET_PATH = os.path.join(REPO_ROOT, "benchmark_datasets/cBioPortal")

# location of the hypothesis & dataset metadata
BASE_HYPOTHESIS_PATH = os.path.join(BASE_DATASET_PATH, "hypothesis")
BASE_DATASET_METADATA_PATH = os.path.join(BASE_DATASET_PATH, "dataset_metadata")

# JSON file with the evidence-alignment evaluation results, read from LOG_DIR
EVIDENCE_ALIGNMENT_RESULTS_FILE = os.path.join(LOG_DIR, "eval_results.json")

# load the results into a DataFrame; the bare `df` displays it as the cell output
df = pd.read_json(EVIDENCE_ALIGNMENT_RESULTS_FILE)
df

In [None]:
# unpack the experiment_config column into separate columns:
# each entry is expanded via pd.Series, the original column is dropped, and
# the expanded columns are appended side by side (axis=1) to the remaining ones

df = pd.concat([df.drop('experiment_config', axis=1), df['experiment_config'].apply(pd.Series)], axis=1)

# extract the agent type, which is composed of the agent_type and its hyperparameters
def get_agent_type(row: Dict) -> str:
    """
    Build a short label describing the agent configuration stored in a row.

    The label depends on row["agent_type"]:
      - "react":           (react, step_count, model_name)
      - "reasoning_coder": (reasoning_coder, planning_model, coding_model)
      - "coder":           (coder, model_name)
      - "reasoning_react": (reasoning_react, plan_model_name, agent_model_name, step_count)

    Any other agent type is returned unchanged.
    """
    kind = row["agent_type"]

    if kind == "react":
        steps, model = row['step_count'], row['model_name']
        return f"(react, {steps}, {model})"

    if kind == "reasoning_coder":
        planner, coder = row['planning_model'], row['coding_model']
        return f"(reasoning_coder, {planner}, {coder})"

    if kind == "coder":
        model = row['model_name']
        return f"(coder, {model})"

    if kind == "reasoning_react":
        planner, agent_model = row['plan_model_name'], row['agent_model_name']
        steps = row['step_count']
        return f"(reasoning_react, {planner}, {agent_model}, {steps})"

    # unrecognised agent types fall through and are used as the label directly
    return kind


# label every row (axis=1 applies the function row-wise) with its agent summary string
df['agent_summary'] = df.apply(get_agent_type, axis=1)
# display(df['agent_summary'].value_counts())

### Summarize Alignment Results

In [None]:
# create a summary of the alignment results
def summarize_alignment_results(row: Dict) -> Dict:
    """
    Count the alignment verdicts of a row and store the counts on the row.

    Reads row['ground_truth_evidence'] and row['eval_evidence_alignment'];
    each alignment entry must carry an 'alignment' string that, after
    stripping whitespace and lower-casing, is one of 'supported',
    'contradicted' or 'missed'.

    The counts are written to row['alignment_eval_supported'],
    row['alignment_eval_contradicted'] and row['alignment_eval_missed'].

    If the two sequences differ in length a warning is printed and only the
    alignment entries that have a matching ground-truth item are counted.

    Returns:
        The same row object, with the three count fields added.

    Raises:
        ValueError: if an entry has an alignment value outside the three
            recognised verdicts.
    """
    ground_truth_evidence = row['ground_truth_evidence']
    alignment_results = row['eval_evidence_alignment']

    if len(ground_truth_evidence) != len(alignment_results):
        print(f"WARNING: Length mismatch between ground_truth_evidence and alignment_results for row {row}")

    values = {
        "supported": 0,
        "contradicted": 0,
        "missed": 0,
    }

    # Slice instead of indexing by ground-truth position: a shorter result
    # list is then tolerated (as the warning above intends) rather than
    # raising IndexError, and extra results are ignored as before.
    for entry in alignment_results[:len(ground_truth_evidence)]:
        res = entry['alignment'].strip().lower()

        if res not in values:
            raise ValueError(f"Invalid alignment result: {res}")

        values[res] += 1

    for key, value in values.items():
        row[f"alignment_eval_{key}"] = value

    return row

# run the summary row-wise (axis=1); each returned row carries the added
# alignment_eval_supported / _contradicted / _missed counts
df = df.apply(summarize_alignment_results, axis=1)
df