In [1]:
import pandas as pd
import numpy as np
import os, glob

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from readers import read_and_merge_paths, read_json
import seaborn as sns

from collections import defaultdict
from matplotlib.ticker import MultipleLocator, PercentFormatter

from plotting_utils import *
from metrics import compute_recall, compute_error_rate, compute_doc_provenance, f_beta_score


# Put at top of plotting script (requires tex be installed though)
matplotlib.rc('font', family='serif')

FULL_WIDTH = 6.75133
COL_WIDTH  = 3.25063

COMBINATIONS = (2,)# 3, 4, 5, 10)
MODELS = (
    "accounts/fireworks/models/llama-v3p1-70b-instruct",
    "gpt-4o-2024-05-13",
    "gpt-3.5-turbo-0125",
    "accounts/fireworks/models/qwen2-72b-instruct",
    "gemini-1.5-flash",
)

DOMAINS = (
    "news",
    "conv",
)
PROMPT_SETTING = "subtopic"
GROUPBY_COLS = ["domain", "num_documents", "summarizer_model", "label_type"]
MERGING_COLS = ["domain", "filepath", "response_id", "label_type", "num_documents", "summarizer_model", "prompt"]
# =============================================================================================
# Add here the name of the mitigation strategies you want to compare against
# =============================================================================================
MITIGATION_STRATEGIES = [
    "position-6",
    "unrelated_subtopic-0.8",
    "unrelated_subtopic-null",
    "subtopic_is_paraphrase__nli_hf-0.6",
    "subtopic_is_paraphrase__sts_llm-null",
    "preds_are_redundant_sts_llm-null",
    "preds_are_redundant__nli_hf-0.6",
]

# Code has not been tested with other metric, correctness of results is not guaranteed
METRIC = "bidirectional"

## Load results

In this section, we load the results from the `run_final_postprocessing` step, which creates a file w/ all the labels for the different metrics, including:

- `ref_coverage_*`: concerns the use of LLM as a judge to determine the coverage of each reference. This results in a 1 to many mapping, where different predicted insights can be covering the same reference insight.
- `pred_coverage_*`: concerns the use of LLM as a judge to determine the coverage of each predicted insight. This results in a 1 to many mapping, where different reference insights can be covering the same predicted insight.
- `bidirectional`: concerns the use of LLM as a judge to determine the coverage of each reference and also vice-versa. Intuitively it merges the `ref_coverage_*` and `pred_coverage_*` labels by keeping labels for reference and predicted insights whose coverage match among the two runs. The final label is determined as the lowest coverage possible of the two.


This script will be pretty similar to `ResultsAnalysis_Metrics` with the only difference that we want to compute the metric differences between the two settings (w/ and w/o mitigation). In fact, the reported metrics will reflect the absolute % change (called **delta**) with respect to the "no-mitigation" score. 
To this end, we will need to report 

- **Original scores** (prior mitigation)
- **Scores after the different mitigation strategies**

In [2]:
def load_scores(base_dir_placeholder, domains, prompt_setting, models, combinations, max_rank, metric, merging_cols):
    assert metric == "bidirectional", "Code was not tested with other metric"
    
    f1_results = []
    recall_results = []
    err_rate_results = []
    for domain in domains:
        domain_recall_results = []
        domain_err_rate_results = []
        for model in models:
            for combination in combinations:
                base_dir = base_dir_placeholder.format(domain=domain, prompt_setting=prompt_setting, model=model, combination=combination)
                filepaths = sorted(glob.glob(f"{base_dir}/{metric}__*.json"))
                    
                for filepath in filepaths: 
                    data = read_json(filepath)
                    recall = compute_recall(data["labels"],
                                            ref_is_queried_subtopic=True,
                                            ref_is_shared=(prompt_setting == "subtopic_trustworthy"),
                                            max_rank=max_rank)
        
                    err_rate = compute_error_rate(data["labels"],
                                            ref_is_queried_subtopic=True,
                                            ref_is_shared=(prompt_setting == "subtopic_trustworthy"),
                                            max_rank=max_rank,
                                            metric_name=metric,
                                            debug_num_preds=False,
                    )
                    
                    for df in (recall, err_rate):
                        df.insert(0, "num_documents", len(df) * [combination])
                        df.insert(0, "prompt", len(df) * [prompt_setting])
                        df.insert(0, "domain", len(df) * [domain])
    
                        df["summarizer_model"] = model.rpartition("/")[-1]
                        df["metric"] = metric
                        df["filepath"] = filepath.rpartition(metric + "__")[-1]
                        df["unique_id"] = df["filepath"] + "__" + df["response_id"].astype(str)
                        #^will be useful for bootstrap sampling
                    
                    domain_recall_results.append(recall)
                    domain_err_rate_results.append(err_rate)

        if len(domain_recall_results) == 0:
            print("@@@ MISSING RESULTS FOR FILEPATH", base_dir_placeholder, domain)
            continue
        domain_recall_results = pd.concat(domain_recall_results)
        domain_err_rate_results = pd.concat(domain_err_rate_results)
        assert len(domain_recall_results) == len(domain_err_rate_results)
        
        # -----------------------------------------------------------------
        # Compute F1 Scores
        # -----------------------------------------------------------------
        domain_f1_results = domain_recall_results.set_index(merging_cols)
        domain_f1_results = domain_f1_results.join(domain_err_rate_results.set_index(merging_cols), how="inner", lsuffix="_recall", rsuffix="_err_rate")
        domain_f1_results = domain_f1_results.reset_index()
        domain_f1_results["fraction_precision"] = 1 - domain_f1_results["fraction_err_rate"]
        assert len(domain_f1_results) == len(domain_err_rate_results) == len(domain_recall_results)
        
        # always confirm the number of examples per n and summarizer model
        # - news domain: should be 500 for n=2-5, 200 for n=10
        # - conv domain: should be 500 for n=3-10 (except n=2, which should be n=341)
        domain_f1_results["f1_score"] = domain_f1_results[["fraction_recall", "fraction_precision"]].apply(lambda x: f_beta_score(x["fraction_recall"], x["fraction_precision"], beta=1), axis=1)
    
        recall_results.append(domain_recall_results)
        err_rate_results.append(domain_err_rate_results)
        f1_results.append(domain_f1_results)
        
    
    recall_results = pd.concat(recall_results)
    err_rate_results = pd.concat(err_rate_results)
    f1_results = pd.concat(f1_results)
    return f1_results, recall_results, err_rate_results

### Original Scores (before mitigation)

In [3]:
MAX_RANK = None # "num_refs" # 2

ORIG_F1_RESULTS, ORIG_RECALL_RESULTS, ORIG_ERR_RATE_RESULTS = load_scores(
    base_dir_placeholder="../outputs_{domain}/run_final_postprocessing-multi-request/gpt-4o-mini-2024-07-18/results_some_shared/{prompt_setting}/SummHay__combinations-{combination}/{model}", # placeholders will be replaced in the method
    domains=DOMAINS,
    prompt_setting="subtopic",
    max_rank=MAX_RANK, 
    metric="bidirectional",
    models=MODELS, 
    merging_cols=MERGING_COLS,
    combinations=COMBINATIONS, 
)
ORIG_F1_RESULTS.groupby(GROUPBY_COLS).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,filepath,response_id,prompt,n_recall,tp_recall,fraction_recall,compute_recall_kwargs,metric_recall,unique_id_recall,n_err_rate,tp_err_rate,fp,fraction_err_rate,compute_hallucination_kwargs,metric_err_rate,unique_id_err_rate,fraction_precision,f1_score
domain,num_documents,summarizer_model,label_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
conv,2,gemini-1.5-flash,fc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,gemini-1.5-flash,fc+pc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc+pc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc+pc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc+pc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc+pc,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341,341


### After Mitigation Scores

In [4]:
MITIGATION_RECALL_RESULTS = {} 
MITIGATION_ERR_RATE_RESULTS = {}
MITIGATION_F1_RESULTS = {}

for mitigation in MITIGATION_STRATEGIES:
    results = load_scores(
        base_dir_placeholder="../outputs_{domain}/run_mitigation_final_postprocessing-multi-request/gpt-4o-mini-2024-07-18/results_some_shared/{prompt_setting}/SummHay__combinations-{combination}/{model}" + "/" + mitigation, # placeholders will be replaced in the method
        domains=DOMAINS,
        prompt_setting="subtopic",
        max_rank=MAX_RANK, 
        metric="bidirectional",
        models=MODELS, 
        merging_cols=MERGING_COLS,
        combinations=COMBINATIONS, 
    )

    MITIGATION_F1_RESULTS[mitigation] = results[0]
    MITIGATION_RECALL_RESULTS[mitigation] = results[1]
    MITIGATION_ERR_RATE_RESULTS[mitigation] = results[2]

MITIGATION_RECALL_RESULTS[MITIGATION_STRATEGIES[0]].groupby(GROUPBY_COLS).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,prompt,response_id,n,tp,fraction,compute_recall_kwargs,metric,filepath,unique_id
domain,num_documents,summarizer_model,label_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
conv,2,gemini-1.5-flash,fc,341,341,341,341,341,341,341,341,341
conv,2,gemini-1.5-flash,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc+pc,341,341,341,341,341,341,341,341,341


In [5]:
MITIGATION_RECALL_RESULTS[MITIGATION_STRATEGIES[0]].groupby(GROUPBY_COLS).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,prompt,response_id,n,tp,fraction,compute_recall_kwargs,metric,filepath,unique_id
domain,num_documents,summarizer_model,label_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
conv,2,gemini-1.5-flash,fc,341,341,341,341,341,341,341,341,341
conv,2,gemini-1.5-flash,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc,341,341,341,341,341,341,341,341,341
conv,2,gpt-3.5-turbo-0125,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc,341,341,341,341,341,341,341,341,341
conv,2,gpt-4o-2024-05-13,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc,341,341,341,341,341,341,341,341,341
conv,2,llama-v3p1-70b-instruct,fc+pc,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc,341,341,341,341,341,341,341,341,341
conv,2,qwen2-72b-instruct,fc+pc,341,341,341,341,341,341,341,341,341


### Get metrics

In [6]:
def get_df_subset(df, domain, label_type, num_docs, model):
    return df[
          (df["domain"] == domain)
        & (df["label_type"]==label_type) 
        & (df["num_documents"]== num_docs)
        & (df["summarizer_model"] == model.rpartition("/")[-1])
    ]


def get_metric_differences(
    before_df: pd.DataFrame,
    dict_of_afters: dict,
    domain: str,
    label_type: str,
    num_documents: int,
    metric_name: str = "fraction",
    models=MODELS,
) -> pd.DataFrame:
    results = defaultdict(list)
    for mitigation_strategy, after_df in dict_of_afters.items():
        for model in models:
            results["strategy"].append(mitigation_strategy)
            results["domain"].append(domain)
            results["num_documents"].append(num_documents)
            results["summarizer"].append(model.rpartition("/")[-1])

            before_model = get_df_subset(before_df, domain=domain, label_type=label_type, num_docs=num_documents, model=model)
            after_model = get_df_subset(after_df, domain=domain, label_type=label_type, num_docs=num_documents, model=model)
        
            if len(after_model) == 0:
                print(f"Haven't run results for model: {model}. Assigning 0")
                after_model_avg = np.nan
            else:
                after_model_avg = after_model[metric_name].mean()
                
            before_model_avg = before_model[metric_name].mean() # TODO: Add bootstrap sampling result
            results["metric"].append(f"\\delta {metric_name}")
            results["value"].append(after_model_avg-before_model_avg)
    return pd.DataFrame(results)


## F1-SCORE

In [7]:
ORIG_F1_RESULTS.columns

Index(['domain', 'filepath', 'response_id', 'label_type', 'num_documents',
       'summarizer_model', 'prompt', 'n_recall', 'tp_recall',
       'fraction_recall', 'compute_recall_kwargs', 'metric_recall',
       'unique_id_recall', 'n_err_rate', 'tp_err_rate', 'fp',
       'fraction_err_rate', 'compute_hallucination_kwargs', 'metric_err_rate',
       'unique_id_err_rate', 'fraction_precision', 'f1_score'],
      dtype='object')

In [8]:
f1_score_df = get_metric_differences(
    ORIG_F1_RESULTS,
    MITIGATION_F1_RESULTS,
    metric_name="f1_score",
    domain="news",
    label_type="fc+pc",
    num_documents=2,
)

(f1_score_df\
     .pivot_table(index="strategy", columns="summarizer", values="value") *1)\
    .style.format('{:,.2%}'.format)

summarizer,gemini-1.5-flash,gpt-3.5-turbo-0125,gpt-4o-2024-05-13,llama-v3p1-70b-instruct,qwen2-72b-instruct
strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
position-6,0.61%,0.09%,2.51%,0.42%,1.69%
preds_are_redundant__nli_hf-0.6,-0.80%,-1.23%,-0.49%,-0.98%,-0.28%
preds_are_redundant_sts_llm-null,-0.46%,-0.02%,-0.09%,-0.41%,0.16%
subtopic_is_paraphrase__nli_hf-0.6,0.03%,-0.01%,0.05%,0.09%,0.01%
subtopic_is_paraphrase__sts_llm-null,-1.69%,-2.00%,-1.19%,-0.86%,-1.19%
unrelated_subtopic-0.8,-1.88%,-0.38%,-2.61%,-1.95%,-1.49%
unrelated_subtopic-null,0.15%,-0.37%,-0.64%,-0.27%,-0.02%


In [14]:
f1_score_df = get_metric_differences(
    ORIG_F1_RESULTS,
    MITIGATION_F1_RESULTS,
    metric_name="f1_score",
    domain="conv",
    label_type="fc+pc",
    num_documents=2,
)

(f1_score_df\
     .pivot_table(index="strategy", columns="summarizer", values="value") *1)\
    .style.format('{:,.2%}'.format)

summarizer,gemini-1.5-flash,gpt-3.5-turbo-0125,gpt-4o-2024-05-13,llama-v3p1-70b-instruct,qwen2-72b-instruct
strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
position-6,1.37%,0.19%,2.28%,1.52%,1.52%
preds_are_redundant__nli_hf-0.6,0.36%,0.53%,0.46%,0.18%,-0.08%
preds_are_redundant_sts_llm-null,0.33%,0.11%,0.48%,0.40%,0.22%
subtopic_is_paraphrase__nli_hf-0.6,0.00%,-0.04%,0.01%,0.00%,0.00%
subtopic_is_paraphrase__sts_llm-null,-0.23%,-0.77%,-0.11%,-0.29%,-0.46%
unrelated_subtopic-0.8,0.09%,0.64%,0.85%,0.43%,0.85%
unrelated_subtopic-null,0.24%,0.50%,0.76%,0.36%,0.63%


In [17]:
### table main paper 
for domain in ("news", "conv"):
    print("----", domain, "----")
    models = ["gpt-4o-2024-05-13", "llama-v3p1-70b-instruct", "qwen2-72b-instruct"]
    f1_score_df = get_metric_differences(
        ORIG_F1_RESULTS,
        MITIGATION_F1_RESULTS,
        metric_name="f1_score",
        domain="news",
        label_type="fc+pc",
        num_documents=2,
    )
    
    print((f1_score_df\
         .pivot_table(index="strategy", columns="summarizer", values="value") *1)[models]\
        .style.format('{:,.2%}'.format).to_latex(label=""))

---- news ----
\begin{tabular}{lrrr}
summarizer & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  \\
position-6 & 2.51% & 0.42% & 1.69% \\
preds_are_redundant__nli_hf-0.6 & -0.49% & -0.98% & -0.28% \\
preds_are_redundant_sts_llm-null & -0.09% & -0.41% & 0.16% \\
subtopic_is_paraphrase__nli_hf-0.6 & 0.05% & 0.09% & 0.01% \\
subtopic_is_paraphrase__sts_llm-null & -1.19% & -0.86% & -1.19% \\
unrelated_subtopic-0.8 & -2.61% & -1.95% & -1.49% \\
unrelated_subtopic-null & -0.64% & -0.27% & -0.02% \\
\end{tabular}

---- conv ----
\begin{tabular}{lrrr}
summarizer & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  \\
position-6 & 2.51% & 0.42% & 1.69% \\
preds_are_redundant__nli_hf-0.6 & -0.49% & -0.98% & -0.28% \\
preds_are_redundant_sts_llm-null & -0.09% & -0.41% & 0.16% \\
subtopic_is_paraphrase__nli_hf-0.6 & 0.05% & 0.09% & 0.01% \\
subtopic_is_paraphrase__sts_llm-null & -1.19% & -0.86% & -1.19% \\
unrelated_subtopic

In [19]:
### table appendix paper -- all models
for domain in ("news", "conv"):
    print("----", domain, "----")
    f1_score_df = get_metric_differences(
        ORIG_F1_RESULTS,
        MITIGATION_F1_RESULTS,
        metric_name="f1_score",
        domain=domain,
        label_type="fc+pc",
        num_documents=2,
    )
    print((f1_score_df\
         .pivot_table(index="strategy", columns="summarizer", values="value") *1)\
        .style.format('{:,.2%}'.format).to_latex(label=""))

---- news ----
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & 0.61% & 0.09% & 2.51% & 0.42% & 1.69% \\
preds_are_redundant__nli_hf-0.6 & -0.80% & -1.23% & -0.49% & -0.98% & -0.28% \\
preds_are_redundant_sts_llm-null & -0.46% & -0.02% & -0.09% & -0.41% & 0.16% \\
subtopic_is_paraphrase__nli_hf-0.6 & 0.03% & -0.01% & 0.05% & 0.09% & 0.01% \\
subtopic_is_paraphrase__sts_llm-null & -1.69% & -2.00% & -1.19% & -0.86% & -1.19% \\
unrelated_subtopic-0.8 & -1.88% & -0.38% & -2.61% & -1.95% & -1.49% \\
unrelated_subtopic-null & 0.15% & -0.37% & -0.64% & -0.27% & -0.02% \\
\end{tabular}

---- conv ----
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & 1.37% & 0.19% & 2.28% & 1.52% & 1.52% \\
preds_are_redundant__nli_hf-0.6 & 0.3

### Recall

In [24]:
ORIG_F1_RESULTS

Unnamed: 0,domain,filepath,response_id,label_type,num_documents,summarizer_model,prompt,n_recall,tp_recall,fraction_recall,...,unique_id_recall,n_err_rate,tp_err_rate,fp,fraction_err_rate,compute_hallucination_kwargs,metric_err_rate,unique_id_err_rate,fraction_precision,f1_score
0,news,"topic_news1__300__examples0,100.json",0,fc,2,llama-v3p1-70b-instruct,subtopic,4,3,0.75,...,"topic_news1__300__examples0,100.json__0",5,3,2,0.400000,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_news1__300__examples0,100.json__0",0.600000,0.666667
1,news,"topic_news1__300__examples0,100.json",0,fc+pc,2,llama-v3p1-70b-instruct,subtopic,4,3,0.75,...,"topic_news1__300__examples0,100.json__0",5,3,2,0.400000,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_news1__300__examples0,100.json__0",0.600000,0.666667
2,news,"topic_news1__300__examples0,100.json",1,fc,2,llama-v3p1-70b-instruct,subtopic,4,4,1.00,...,"topic_news1__300__examples0,100.json__1",7,4,3,0.428571,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_news1__300__examples0,100.json__1",0.571429,0.727273
3,news,"topic_news1__300__examples0,100.json",1,fc+pc,2,llama-v3p1-70b-instruct,subtopic,4,4,1.00,...,"topic_news1__300__examples0,100.json__1",7,4,3,0.428571,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_news1__300__examples0,100.json__1",0.571429,0.727273
4,news,"topic_news1__300__examples0,100.json",2,fc,2,llama-v3p1-70b-instruct,subtopic,5,3,0.60,...,"topic_news1__300__examples0,100.json__2",6,3,3,0.500000,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_news1__300__examples0,100.json__2",0.500000,0.545455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,conv,"topic_conv5__300__examples0,100.json",82,fc+pc,2,gemini-1.5-flash,subtopic,2,2,1.00,...,"topic_conv5__300__examples0,100.json__82",7,2,5,0.714286,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_conv5__300__examples0,100.json__82",0.285714,0.444444
3406,conv,"topic_conv5__300__examples0,100.json",83,fc,2,gemini-1.5-flash,subtopic,2,1,0.50,...,"topic_conv5__300__examples0,100.json__83",7,1,6,0.857143,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_conv5__300__examples0,100.json__83",0.142857,0.222222
3407,conv,"topic_conv5__300__examples0,100.json",83,fc+pc,2,gemini-1.5-flash,subtopic,2,2,1.00,...,"topic_conv5__300__examples0,100.json__83",7,2,5,0.714286,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_conv5__300__examples0,100.json__83",0.285714,0.444444
3408,conv,"topic_conv5__300__examples0,100.json",84,fc,2,gemini-1.5-flash,subtopic,2,0,0.00,...,"topic_conv5__300__examples0,100.json__84",6,0,6,1.000000,"{'max_rank': None, 'ref_is_shared': False, 're...",bidirectional,"topic_conv5__300__examples0,100.json__84",0.000000,0.000000


In [25]:
### table appendix paper -- all models
for domain in ("news", "conv"):
    print("----", domain, "----", "recall")
    f1_score_df = get_metric_differences(
        ORIG_F1_RESULTS,
        MITIGATION_F1_RESULTS,
        metric_name="fraction_recall",
        domain=domain,
        label_type="fc+pc",
        num_documents=2,
    )
    print((f1_score_df\
         .pivot_table(index="strategy", columns="summarizer", values="value") *1)\
        .style.format('{:,.2%}'.format).to_latex(label=""))

---- news ---- recall
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & -2.72% & -0.07% & -4.66% & -5.67% & -3.58% \\
preds_are_redundant__nli_hf-0.6 & -2.89% & -1.94% & -2.21% & -4.12% & -1.96% \\
preds_are_redundant_sts_llm-null & -1.28% & -0.12% & -0.49% & -1.96% & -0.39% \\
subtopic_is_paraphrase__nli_hf-0.6 & -0.05% & -0.07% & -0.03% & 0.00% & 0.00% \\
subtopic_is_paraphrase__sts_llm-null & -3.30% & -3.34% & -3.14% & -3.18% & -3.10% \\
unrelated_subtopic-0.8 & -5.11% & -3.02% & -8.55% & -5.08% & -7.40% \\
unrelated_subtopic-null & -2.27% & -2.82% & -3.90% & -2.58% & -3.20% \\
\end{tabular}

---- conv ---- recall
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & -3.08% & -0.15% & -4.55% & -2.64% & -5.57% \\
preds_are

In [28]:
### table appendix paper -- all models
for domain in ("news", "conv"):
    print("----", domain, "----", "hallucination rate")
    f1_score_df = get_metric_differences(
        ORIG_F1_RESULTS,
        MITIGATION_F1_RESULTS,
        metric_name="fraction_err_rate",
        domain=domain,
        label_type="fc+pc",
        num_documents=2,
    )
    print((f1_score_df\
         .pivot_table(index="strategy", columns="summarizer", values="value") * -1)\
        .style.format('{:,.2%}'.format).to_latex(label=""))

---- news ---- hallucination rate
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & 2.29% & 0.18% & 6.09% & 3.58% & 4.28% \\
preds_are_redundant__nli_hf-0.6 & 1.10% & 0.23% & 0.57% & 1.26% & 0.92% \\
preds_are_redundant_sts_llm-null & 0.14% & 0.19% & 0.17% & 0.66% & 0.53% \\
subtopic_is_paraphrase__nli_hf-0.6 & 0.14% & 0.14% & 0.11% & 0.17% & 0.01% \\
subtopic_is_paraphrase__sts_llm-null & 0.67% & 0.87% & 0.28% & 0.92% & 0.44% \\
unrelated_subtopic-0.8 & 0.70% & 1.17% & 1.87% & 0.49% & 2.98% \\
unrelated_subtopic-null & 0.61% & 0.93% & 0.59% & 0.43% & 0.98% \\
\end{tabular}

---- conv ---- hallucination rate
\begin{tabular}{lrrrrr}
summarizer & gemini-1.5-flash & gpt-3.5-turbo-0125 & gpt-4o-2024-05-13 & llama-v3p1-70b-instruct & qwen2-72b-instruct \\
strategy &  &  &  &  &  \\
position-6 & 1.26% & 0.17% & 2.21% & 1.44% & 1.61% \\
preds_are_redundant__nl

### Error rate

In [27]:
ORIG_F1_RESULTS.columns

Index(['domain', 'filepath', 'response_id', 'label_type', 'num_documents',
       'summarizer_model', 'prompt', 'n_recall', 'tp_recall',
       'fraction_recall', 'compute_recall_kwargs', 'metric_recall',
       'unique_id_recall', 'n_err_rate', 'tp_err_rate', 'fp',
       'fraction_err_rate', 'compute_hallucination_kwargs', 'metric_err_rate',
       'unique_id_err_rate', 'fraction_precision', 'f1_score'],
      dtype='object')