# 1. Loading data

In [2]:
from utils_coverage import greedy_set_cover
from utils_misc import extract_citations
import json, numpy as np, pandas as pd

fn = "data/ans_eng_eval_0.1.json"
with open(fn, "r") as f:
    data = json.load(f)

from collections import Counter

scraped_successfuls = Counter()
for d in data:
    for I in range(1, 11):
        if f"S{I}_content" in d and len(d[f"S{I}_content"]) > 0:
            content = d[f"S{I}_content"]
            d[f"S{I}_scrape_successful"] = content[:6] == "Title:"
            scraped_successfuls[(d["answer_engine"], d[f"S{I}_scrape_successful"])] += 1

print(scraped_successfuls)

Counter({('BingChat', True): 1149, ('YouCom', True): 1018, ('Perplexity', True): 1012, ('BingChat', False): 156, ('Perplexity', False): 143, ('YouCom', False): 140})


## Populate

### Opinion Balance

In [94]:
from anyllm import generate_json
import tqdm

with open("prompts/opinion_balance.txt", "r") as f:
    prompt_opinion_balance = f.read()

for sample in tqdm.tqdm_notebook(data):
    if not sample["is_charged"]:
        continue

    query = sample["Question"]
    core_statements = [statement for statement in sample["core_statements"] if statement["core"] == "1"]

    # if it's already done, continue
    if all(["opinion_balance" in statement for statement in core_statements]):
        continue

    numbered_statements = "\n".join([f"[Statement: {i+1}] {statement['sentence']}" for i, statement in enumerate(core_statements)])

    prompt_opinion_balance_populated = prompt_opinion_balance.replace("[[QUERY]]", query).replace("[[STATEMENTS]]", numbered_statements)

    opinion_balance_response = generate_json([{"role": "user", "content": prompt_opinion_balance_populated}], model="gpt-4o")
    print(opinion_balance_response)
    # {'agree_statements': [1, 3, 4, 5, 6, 7, 8, 9, 11], 'disagree_statements': [], 'neutral_statements': [2, 10, 12]}
    for i, statement in enumerate(core_statements):
        if i+1 in opinion_balance_response["agree_statements"]:
            statement["opinion_balance"] = "agree"
        elif i+1 in opinion_balance_response["disagree_statements"]:
            statement["opinion_balance"] = "disagree"
        else:
            statement["opinion_balance"] = "neutral"

    with open(fn, "w") as f:
        json.dump(data, f, indent=2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sample in tqdm.tqdm_notebook(data):


  0%|          | 0/903 [00:00<?, ?it/s]

### Confidence Score

In [4]:
from anyllm import generate_json
import tqdm

with open("prompts/confidence_score.txt", "r") as f:
    prompt_confidence_score = f.read()

ite = tqdm.tqdm_notebook(data)
for sample_i, sample in enumerate(ite):
    if "confidence_score" in sample:
        continue
    query = sample["Question"]
    answer = sample["Output"]

    prompt_confidence_score_populated = prompt_confidence_score.replace("[[QUERY]]", query).replace("[[ANSWER]]", answer)

    # print(prompt_confidence_score_populated)
    # break

    confidence_score_response = generate_json([{"role": "user", "content": prompt_confidence_score_populated}], model="gpt-4o")
    sample["confidence_score"] = confidence_score_response["confidence"]

    counts = Counter([(d["answer_engine"], d["confidence_score"]) for d in data if "confidence_score" in d])
    ite.set_description(f"counts: {counts}")

    if sample_i % 10 == 0 or sample_i == len(data)-1:
        with open(fn, "w") as f:
            json.dump(data, f, indent=2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  ite = tqdm.tqdm_notebook(data)


  0%|          | 0/903 [00:00<?, ?it/s]

# Data Analysis

In [5]:
filter_failed_scrapes = True

answer_engines = list(set([d["answer_engine"] for d in data]))

results_map = {}
for answer_engine in answer_engines:
    results_map[answer_engine] = {"answer_engine": answer_engine, "N": 0,
                                  "Avg. Sources": [], "Avg. Citations": [], "Avg. Statements": [],
                                  "%Unsupported Statements": [], "%Citation Imprecision": [], "%Citation Thoroughness": [], "%Source Necessity": [], "%Uncited Sources": [],
                                  "%Opinion One-Sided Answer": [], "%Opinion Statement Agreement": [],
                                  "Avg. Confidence": []}

confidence_score_map = {"Strongly Not Confident": 1, "Not Confident": 2, "Neutral": 3, "Confident": 4, "Strongly Confident": 5}

for d in data:
    statements = [statement for statement in d["core_statements"] if statement["core"] == "1"]
    source_idxs = [i for i in range(1,11) if d[f"S{i}"] != "" and f"S{i}_content" in d and (not filter_failed_scrapes or d[f"S{i}_scrape_successful"])]
    d["confidence_score_likert"] = confidence_score_map[d["confidence_score"]]
    results_map[d["answer_engine"]]["Avg. Confidence"].append(d["confidence_score_likert"])
    if len(statements) == 0 or len(source_idxs) == 0:
        # print(f"[Skipped] This sample either has {len(statements)} statements and {len(source_idxs)} sources.")
        continue

    is_supported = np.zeros((len(statements), len(source_idxs)))
    is_cited = np.zeros((len(statements), len(source_idxs)))
    num_unsupported = 0

    for i, statement in enumerate(statements):
        for j, source_idx in enumerate(source_idxs):
            verif_key = (d["id"], source_idx, statement["id"])
            is_cited[i,j] = 1 if source_idx in statement["citations"] else 0
            is_supported[i,j] = 1 if source_idx in statement["supporting_sources"] else 0
        num_unsupported += 1 if np.sum(is_supported[i,:]) == 0 else 0
    
    num_citations = np.sum(is_cited)

    # 1. Calculate the percentage of core statements that are not supported by any source
    perc_unsupported = num_unsupported / len(statements)
    # 2. Calculate the percentage of citations that are inaccurate
    if np.sum(is_cited) == 0:
        d["perc_inaccurate_citations"] = 0
    else:
        d["perc_inaccurate_citations"] = np.sum(is_cited * (1 - is_supported)) / np.sum(is_cited)
    # 3. Calculate citation thoroughness: for every supported statement, how often was it cited?
    d["perc_thoroughness"] = np.sum(is_cited * is_supported) / (np.sum(is_supported) + 1e-6)
    # 4. Calculate source necessity: what is the set of sources that cover supported statements
    source_sets = {}
    for j, source_idx in enumerate(source_idxs):
        supported_statements = [i for i in range(len(statements)) if is_supported[i,j] == 1]
        if supported_statements:  # Only include sources that support at least one statement
            source_sets[source_idx] = supported_statements
    universe = [i for i in range(len(statements)) if np.sum(is_supported[i,:]) > 0] # remove unsupported statements
    min_cover = greedy_set_cover(universe, source_sets)
    d["perc_necessity"] = len(min_cover) / len(source_idxs)

    # 5. Calculate %Uncited sources (a source that doesn't have a single cite)
    max_source_idx_1 = max([idx for idx in range(1,11) if d[f"S{idx}"] != ""])
    all_cite_nums = extract_citations(d["Output"])
    max_source_idx2 = 0 if len(all_cite_nums) == 0 else max(all_cite_nums)
    max_source_idx = max(max_source_idx_1, max_source_idx2)
    d["perc_uncited"] = len([idx for idx in range(1, max_source_idx+1) if idx not in all_cite_nums]) / max_source_idx

    # 6. compute opinion balance if charged question
    if d["is_charged"]:
        # compute two things:
        # 1. is answer one-sided (only agree or disagree)
        # 2. % agreement is the number of core statements that agree compared to disagree
        opinion_counts = Counter([statement["opinion_balance"] for statement in statements if "opinion_balance" in statement])
        d["opinion_is_one_sided"] = 1 if opinion_counts["agree"] == 0 or opinion_counts["disagree"] == 0 else 0
        d["opinion_perc_agreement"] = opinion_counts["agree"] / (opinion_counts["agree"] + opinion_counts["disagree"] + 1e-6)
        results_map[d["answer_engine"]]["%Opinion One-Sided Answer"].append(d["opinion_is_one_sided"])
        results_map[d["answer_engine"]]["%Opinion Statement Agreement"].append(d["opinion_perc_agreement"])

    results_map[d["answer_engine"]]["N"] += 1
    results_map[d["answer_engine"]]["Avg. Sources"].append(len(source_idxs))
    results_map[d["answer_engine"]]["Avg. Statements"].append(len(statements))
    results_map[d["answer_engine"]]["Avg. Citations"].append(num_citations / len(statements))
    results_map[d["answer_engine"]]["%Unsupported Statements"].append(perc_unsupported)
    results_map[d["answer_engine"]]["%Citation Imprecision"].append(d["perc_inaccurate_citations"])
    results_map[d["answer_engine"]]["%Citation Thoroughness"].append(d["perc_thoroughness"])
    results_map[d["answer_engine"]]["%Source Necessity"].append(d["perc_necessity"])
    results_map[d["answer_engine"]]["%Uncited Sources"].append(d["perc_uncited"])

results = results_map.values()
for result in results:
    for k in result.keys():
        if k in ["answer_engine"]:
            continue
        result[k] = np.mean(result[k])
        if "%" in k:
            result[k] = 100.0 * result[k]

df = pd.DataFrame(results)
df.round(2).set_index("answer_engine").T

answer_engine,YouCom,Perplexity,BingChat
N,287.0,294.0,289.0
Avg. Sources,3.55,3.44,3.98
Avg. Citations,0.38,0.49,0.38
Avg. Statements,13.85,18.76,10.48
%Unsupported Statements,30.84,31.57,23.11
%Citation Imprecision,31.73,51.01,34.23
%Citation Thoroughness,24.41,23.0,20.54
%Source Necessity,68.97,68.92,50.44
%Uncited Sources,1.06,8.41,36.18
%Opinion One-Sided Answer,51.61,83.44,48.72


In [14]:
# counts = Counter([(d["answer_engine"], d["confidence_score"]) for d in data if "confidence_score" in d])
# Instead print the distribution of confidence scores for each answer engine

for split in ["all", "charged", "experts"]:
    print("==== Split: ", split, "====")

    this_data = [d for d in data if split == "all" or (split == "charged" and d["is_charged"]) or (split == "experts" and d["is_expertise"])]

    results = []

    for answer_engine in answer_engines:
        confidence_scores = [d["confidence_score"] for d in this_data if d["answer_engine"] == answer_engine]
        counts = Counter(confidence_scores)

        result_row = {"answer_engine": answer_engine, "N": len(confidence_scores)}
        for confidence_score in confidence_score_map.keys():
            result_row[confidence_score] = 100.0 * counts[confidence_score] / sum(counts.values())
        results.append(result_row)

    display(pd.DataFrame(results).set_index("answer_engine").T)

==== Split:  all ====


answer_engine,YouCom,Perplexity,BingChat
N,301.0,301.0,301.0
Strongly Not Confident,0.0,0.0,1.993355
Not Confident,0.0,0.332226,0.332226
Neutral,0.664452,0.0,0.0
Confident,45.51495,8.305648,32.55814
Strongly Confident,53.820598,91.362126,65.116279


==== Split:  charged ====


answer_engine,YouCom,Perplexity,BingChat
N,168.0,168.0,168.0
Strongly Not Confident,0.0,0.0,3.571429
Not Confident,0.0,0.0,0.595238
Neutral,1.190476,0.0,0.0
Confident,65.47619,4.761905,46.428571
Strongly Confident,33.333333,95.238095,49.404762


==== Split:  experts ====


answer_engine,YouCom,Perplexity,BingChat
N,133.0,133.0,133.0
Strongly Not Confident,0.0,0.0,0.0
Not Confident,0.0,0.75188,0.0
Neutral,0.0,0.0,0.0
Confident,20.300752,12.781955,15.037594
Strongly Confident,79.699248,86.466165,84.962406
