## Evidence Alignment Analysis

### Data Ingestion
- load the evidence alignment results
- unpack the experiment config results

In [None]:
import os
from typing import List, Dict
import json
import pandas as pd


# Root of the DSWizard checkout. Defaults to the original location, but can be
# overridden with the DSWIZARD_ROOT environment variable so the notebook also
# runs on machines with a different directory layout.
DSWIZARD_ROOT = os.environ.get("DSWIZARD_ROOT", "/home/ubuntu/DSWizard")

LOG_DIR = os.path.join(DSWIZARD_ROOT, "experiment_logs")
HYPOTHESIS_DIR = os.path.join(DSWIZARD_ROOT, "benchmark_datasets", "cBioPortal", "hypothesis")
OUTPUT_DIR = os.path.join(DSWIZARD_ROOT, "eval_evidence_alignment")
COMPLETED_TASKS_FILE = os.path.join(OUTPUT_DIR, "completed_tasks.txt")

# JSON file holding the evidence alignment evaluation results.
EVIDENCE_ALIGNMENT_RESULTS_FILE = os.path.join(OUTPUT_DIR, "eval_results.json")

# Load the evidence alignment results from the JSON results file into `df`.
df = pd.read_json(EVIDENCE_ALIGNMENT_RESULTS_FILE)

In [2]:
# Unpack the experiment_config column into separate columns (presumably one
# dict-like config per row -- each value is expanded via pd.Series).
# The guard makes this cell safe to re-run: once the column has been expanded
# and dropped, a second execution is a no-op instead of raising a KeyError.
if 'experiment_config' in df.columns:
    df = pd.concat(
        [df.drop('experiment_config', axis=1), df['experiment_config'].apply(pd.Series)],
        axis=1,
    )

# extract the agent type label, which is composed of the agent_type and its hyperparameters
def get_agent_type(row: Dict) -> str:
    """
    Build a readable label describing the agent configuration stored in ``row``.

    Label layout per ``agent_type``:
      - react:           (react, step_count, model_name)
      - reasoning_coder: (reasoning_coder, planning_model, coding_model)
      - coder:           (coder, model_name)
      - reasoning_react: (reasoning_react, plan_model_name, agent_model_name, step_count)

    Any other agent type is returned unchanged.
    """
    kind = row["agent_type"]

    # Row fields that make up the label, in display order, per known agent type.
    label_fields = {
        "react": ("step_count", "model_name"),
        "reasoning_coder": ("planning_model", "coding_model"),
        "coder": ("model_name",),
        "reasoning_react": ("plan_model_name", "agent_model_name", "step_count"),
    }

    fields = label_fields.get(kind)
    if fields is None:
        # Unknown agent type: fall back to the raw value.
        return kind

    parts = [kind] + [f"{row[field]}" for field in fields]
    return "(" + ", ".join(parts) + ")"


# Add one label per row by applying get_agent_type row-wise (axis=1).
df['agent_summary'] = df.apply(get_agent_type, axis=1)
# Optional sanity check: uncomment to see how many rows each agent summary has.
# display(df['agent_summary'].value_counts())

### Summarize Alignment Results

In [5]:
# create a summary of the alignment results
def summarize_alignment_results(row: Dict) -> Dict:
    """
    Count the alignment verdicts for one row and attach the counts to it.

    Reads ``row['ground_truth_evidence']`` and ``row['eval_evidence_alignment']``.
    Each counted alignment entry must carry an ``'alignment'`` value that, after
    stripping whitespace and lower-casing, is one of ``supported``,
    ``contradicted`` or ``missed``.

    At most ``len(ground_truth_evidence)`` alignment entries are counted. When
    the two sequences differ in length a warning is printed and only the
    overlapping prefix is counted: surplus alignment entries are ignored, and
    ground-truth items without an alignment entry contribute to no counter
    (previously this case raised an IndexError right after the warning).

    Args:
        row: Mapping-like record (a dict or a DataFrame row) with the two
            fields described above.

    Returns:
        The same ``row`` object, with ``alignment_eval_supported``,
        ``alignment_eval_contradicted`` and ``alignment_eval_missed`` set to
        the respective counts.

    Raises:
        ValueError: If a counted alignment value is not one of the three
            accepted verdicts.
    """
    ground_truth_evidence = row['ground_truth_evidence']
    alignment_results = row['eval_evidence_alignment']

    expected = len(ground_truth_evidence)
    if expected != len(alignment_results):
        # Report the lengths and the row label instead of dumping the whole
        # row, which floods the cell output when rows hold long evidence lists.
        print(
            "WARNING: Length mismatch between ground_truth_evidence "
            f"({expected}) and alignment_results ({len(alignment_results)}) "
            f"for row {getattr(row, 'name', None)}"
        )

    values = {
        "supported": 0,
        "contradicted": 0,
        "missed": 0
    }
    # Slicing caps the loop at the number of ground-truth items and is safe
    # when fewer alignment entries are available.
    for result in alignment_results[:expected]:
        res = result['alignment'].strip().lower()

        if res not in values:
            raise ValueError(f"Invalid alignment result: {res}")

        values[res] += 1

    for key, value in values.items():
        row[f"alignment_eval_{key}"] = value

    return row

# Apply the per-row summary; each returned row carries the alignment_eval_* counts.
df = df.apply(summarize_alignment_results, axis=1)
# Show the resulting frame as the cell output.
df

generated_evidence         [Evidence 1: The frequency of TP53 alterations...
eval_evidence_alignment    [{'evidence_id': '0', 'alignment': 'Supported'...
agent_type                                                   reasoning_coder
api_type                                                               azure
language                                                              python
final_response_model                                                  gpt-4o
model_name                                                            gpt-4o
step_count                                                               NaN
pmid                                                                38630790
hypothesis_index                                                           1
dataset_ids                                                [plmeso_msk_2024]
planning_model                                                       o3-mini
coding_model                                                          gpt-4o

Unnamed: 0,ground_truth_evidence,generated_evidence,eval_evidence_alignment,agent_type,api_type,language,final_response_model,model_name,step_count,pmid,hypothesis_index,dataset_ids,planning_model,coding_model,agent_summary,alignment_eval_supported,alignment_eval_contradicted,alignment_eval_missed
0,[The disparity in genomic stability is more pr...,[Initial data on structural variation grouped ...,"[{'evidence_id': '0', 'alignment': 'Missed'}]",react,azure,python,gpt-4o,gpt-4o,16.0,32015526,1,[luad_oncosg_2020],,,"(react, 16.0, gpt-4o)",0,0,1
1,[10.5% of myxofibrosarcomas have NF1 mutations...,[No samples of Myxofibrosarcoma and Pleomorphi...,"[{'evidence_id': '0', 'alignment': 'Contradict...",coder,azure,python,gpt-4o,gpt-4o,,20601955,1,[sarc_mskcc],,,"(coder, gpt-4o)",0,3,0
2,[Higher frequency of KIT mutations in Triple-W...,[The data provided indicates that within the T...,"[{'evidence_id': '0', 'alignment': 'Contradict...",coder,azure,python,gpt-4o,gpt-4o,,26091043,1,[skcm_tcga_pub_2015],,,"(coder, gpt-4o)",0,2,0
3,[Microsatellite instability-high status was en...,[The contingency table only contains data for ...,"[{'evidence_id': '0', 'alignment': 'Missed'}]",react,azure,python,gpt-4o,o3-mini,16.0,35235413,2,[crc_dd_2022],,,"(react, 16.0, o3-mini)",0,0,1
4,[HiC subtype is associated with TP53 mutation.],[Data was collected and merged from 'data_clin...,"[{'evidence_id': '0', 'alignment': 'Contradict...",react,azure,python,gpt-4o,gpt-4o,16.0,25583476,1,[egc_tmucih_2015],,,"(react, 16.0, gpt-4o)",0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10234,[Microsatellite instability-high status was en...,[The dataset loaded does not contain explicit ...,"[{'evidence_id': '0', 'alignment': 'Missed'}]",react,azure,python,gpt-4o,gpt-4o,16.0,35235413,2,[crc_dd_2022],,,"(react, 16.0, gpt-4o)",0,0,1
10235,[Presence of truncating mutations in RBM10 and...,[Count of Truncating Mutations per Gene shows ...,"[{'evidence_id': '0', 'alignment': 'Supported'}]",reasoning_coder,azure,python,gpt-4o,gpt-4o,,22980975,1,[luad_broad],o3-mini,gpt-4o,"(reasoning_coder, o3-mini, gpt-4o)",1,0,0
10236,[Predictions from kinase-substrate network ana...,[The regression model assessing the prediction...,"[{'evidence_id': '0', 'alignment': 'Contradict...",coder,azure,python,gpt-4o,o3-mini,,32888432,4,[coadread_cass_2020],,,"(coder, o3-mini)",0,1,0
10237,[TP53 alterations occur in half of dysplasia c...,[A high proportion (79.64%) of colitis samples...,"[{'evidence_id': '0', 'alignment': 'Missed'}]",reasoning_coder,azure,python,gpt-4o,gpt-4o,,36611031,0,[bowel_colitis_msk_2022],o3-mini,gpt-4o,"(reasoning_coder, o3-mini, gpt-4o)",0,0,1
