# A script to compute LLM feedback for the prompt-enriching task

In [2]:
# Colab-only setup: mount Google Drive so the project folder is reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project root (IPython magic; keep it on its own line).
%cd /content/drive/MyDrive/Thesis/MedTransNet

# Put the project root on sys.path so the project-local imports used later
# in this notebook (`src.medical_hgt.llm`, `config`) can be resolved.
# NOTE(review): re-running this cell appends the same path again.
import sys
sys.path.append('/content/drive/MyDrive/Thesis/MedTransNet')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Thesis/MedTransNet
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Thesis/MedTransNet


In [3]:
# Install runtime dependencies into the Colab VM.
# NOTE(review): only protobuf is version-pinned; the recorded output of this cell
# shows torch being upgraded to 2.1.2, which conflicts with the preinstalled
# torchaudio/torchdata/torchtext/torchvision (all require torch==2.1.0).
# Consider pinning versions to keep the run reproducible.
!pip install protobuf==3.20.0 icetk cpm_kernels torch-geometric datasets
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install bitsandbytes accelerate xformers einops

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.0+cu121 requires torch==2.1.0, but you have torch 2.1.2 which is incompatible.
torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.1.2 which is incompatible.
torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.1.2 which is incompatible.
torchvision 0.16.0+cu121 requires torch==2.1.0, but you have torch 2.1.2 which is incompatible.[0m[31m
[0m

## Setup

In [4]:
import torch
import gc
import pickle
import os

import pandas as pd

from tqdm import tqdm
from datasets import load_dataset

import transformers
from transformers import AutoTokenizer
from transformers import logging as hf_logging

from src.medical_hgt.llm import LLM, LLMFeedback
from config import ROOT_DIR

## LLM

## Inference Loop

In [10]:
def compute_llm_feedback(llm, qa_dataset, nx_graph_data, question_to_subgraphs_mapping,
                         max_context_nodes=20, checkpoint_every=100, checkpoint_dir=None):
    """Record how single knowledge-graph nodes, given as context, change the LLM's answers.

    Each question is first sent to the LLM as a bare multiple-choice prompt
    (the "vanilla" run). It is then sent once per candidate node, with the
    sentence ``Context: The <node_type> <node_name>. `` prepended. The results
    of both runs are collected in one ``LLMFeedback`` object per question.

    Args:
        llm: Object exposing ``inference(prompt)``, ``inference_batch(prompts)`` and
            ``get_confidence(answer_letter, output_encoding, predictions)``. The dict
            returned by ``get_confidence`` must contain 'confidence', 'cop_confidence'
            and 'accuracy' (plus 'response' for the vanilla run). A 'confidence' of -1
            marks a response that could not be parsed.
        qa_dataset: DataFrame indexed positionally (``.iloc``) with the columns
            'question', 'opa', 'opb', 'opc', 'opd' and 'cop' (0-3, mapped to A-D).
        nx_graph_data: Graph-like object where ``nodes[node_uid]['name']`` is the node name.
        question_to_subgraphs_mapping: Dict mapping a row position in ``qa_dataset`` to a
            sliceable sequence of ``(node_uid, node_type)`` tuples.
        max_context_nodes: Upper bound on the number of nodes tried per question.
        checkpoint_every: A checkpoint is written whenever the loop counter is a multiple
            of this value and the question was not skipped. A falsy value disables
            checkpointing.
        checkpoint_dir: Directory for checkpoint pickles. Defaults to
            ``ROOT_DIR/datasets/train/llm_feedback``; created if missing.

    Returns:
        Dict ``{qa_index: LLMFeedback}``. A question appears only if its vanilla response
        was parseable and at least one context node produced a parseable response.
    """
    correct_answer_map = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
    llm_feedbacks_dict = {}  # {qa_index: LLMFeedback}

    # Resolve the default lazily so ROOT_DIR is only needed when checkpoints are written.
    if checkpoint_every and checkpoint_dir is None:
        checkpoint_dir = os.path.join(ROOT_DIR, 'datasets', 'train', 'llm_feedback')

    for i, (qa_index, subgraph_tuples) in enumerate(tqdm(question_to_subgraphs_mapping.items())):
        qa_row = qa_dataset.iloc[qa_index]
        correct_answer = qa_row['cop']
        prompt = """Question: {} A. {} B. {} C. {} D. {}""".format(
            qa_row['question'],
            qa_row['opa'],
            qa_row['opb'],
            qa_row['opc'],
            qa_row['opd']
        )

        # Process question without context
        output_encodings, predictions = llm.inference(prompt)
        llm_vanilla_response_dict = llm.get_confidence(correct_answer_map[correct_answer], output_encodings, predictions)

        if llm_vanilla_response_dict['confidence'] == -1:
            print(f'Wrong response format. Question {i} ignored')
            continue

        # Create LLMFeedback object
        llm_feedback = LLMFeedback(
            qa_index,
            llm_vanilla_response_dict['response'],
            llm_vanilla_response_dict['confidence'],
            llm_vanilla_response_dict['cop_confidence'],
            llm_vanilla_response_dict['accuracy'],
        )

        # Build one context-enriched prompt per candidate node
        context_nodes = subgraph_tuples[:max_context_nodes]
        prompts_with_context = []
        for node_uid, node_type in context_nodes:
            node_name = nx_graph_data.nodes[node_uid]['name']
            context = f'Context: The {node_type} {node_name}. '
            prompts_with_context.append(context + prompt)

        has_node_feedback = False

        # Skip the batch call entirely when the question has no candidate nodes.
        if prompts_with_context:
            batch_output_encodings, batch_predictions = llm.inference_batch(prompts_with_context)

            # Postprocess batch model output
            for j, (node_uid, node_type) in enumerate(context_nodes):
                llm_context_response_dict = llm.get_confidence(
                    correct_answer_map[correct_answer],
                    output_encoding=batch_output_encodings[j],
                    predictions=batch_predictions[j],
                )

                if llm_context_response_dict['confidence'] == -1:
                    print(f'Wrong response format. Node {node_uid} ignored')
                    continue

                llm_feedback.insert_feedback(
                    node_type,
                    node_uid,
                    llm_context_response_dict['confidence'],
                    llm_context_response_dict['cop_confidence'],
                    llm_context_response_dict['accuracy'],
                )
                has_node_feedback = True

        # Store once per question, and only when at least one node yielded feedback.
        if has_node_feedback:
            llm_feedbacks_dict[qa_index] = llm_feedback

        if checkpoint_every and i % checkpoint_every == 0:
            # The current question may have produced no entry (all nodes rejected or no
            # nodes at all); guard the lookup so the checkpoint step cannot raise KeyError.
            if qa_index in llm_feedbacks_dict:
                print(f'Example Feedback:\n')
                llm_feedbacks_dict[qa_index].print_feedback()

            os.makedirs(checkpoint_dir, exist_ok=True)
            checkpoint_path = os.path.join(checkpoint_dir, f'llm_feedbacks_{i}.pickle')
            with open(checkpoint_path, 'wb') as checkpoint_file:
                pickle.dump(llm_feedbacks_dict, checkpoint_file)

    return llm_feedbacks_dict

## Configure and load the LLM

In [11]:
# bitsandbytes 4-bit quantization settings, handed to `from_pretrained`
# through its `quantization_config` argument when the model is loaded.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # 4-bit data type used for the weights
    bnb_4bit_use_double_quant=True,         # enable nested (double) quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

In [12]:
# Hugging Face Hub id of the checkpoint; reused for both the model and the tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [13]:
# Load the causal LM with the 4-bit quantization config defined above.
# NOTE(review): trust_remote_code=True permits running model code shipped in the
# Hub repository — confirm it is actually required for this checkpoint.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [14]:
# Tokenizer belonging to the same checkpoint as the model (same `model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [15]:
# Wrap model and tokenizer in the project's LLM helper; compute_llm_feedback calls
# its inference(), inference_batch() and get_confidence() methods.
llm = LLM(model, tokenizer)

## Compute feedback

In [16]:
# Load datasets
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only load
# pickle files that were produced by this project / come from a trusted source.

# Knowledge graph; passed to compute_llm_feedback as `nx_graph_data`.
with open(os.path.join(ROOT_DIR, 'datasets/prime_kg_nx_63960.pickle'), 'rb') as prime_kg_file:
    prime_kg = pickle.load(prime_kg_file)

# MedMCQA train split as a DataFrame (built in one step so `qa_dataset`
# is never bound to the intermediate dataset object).
qa_dataset = pd.DataFrame(load_dataset("medmcqa")['train'])

# Mapping from question row position to the (node_uid, node_type) tuples of its subgraph.
subgraphs_dict_path = 'datasets/subgraphs_dict_train.pickle'
with open(os.path.join(ROOT_DIR, subgraphs_dict_path), 'rb') as subgraphs_file:
    question_to_subgraphs_mapping = pickle.load(subgraphs_file)

Downloading readme:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [None]:
# Run the full feedback computation over every question that has a subgraph.
llm_feedbacks_dict = compute_llm_feedback(llm, qa_dataset, prime_kg, question_to_subgraphs_mapping)

# Persist the final result; the file name carries the number of stored questions.
# A context manager guarantees the file is flushed and closed once the dump finishes.
final_feedback_path = os.path.join(ROOT_DIR, 'datasets', f'llm_feedbacks_train_{len(llm_feedbacks_dict)}.pickle')
with open(final_feedback_path, 'wb') as final_feedback_file:
    pickle.dump(llm_feedbacks_dict, final_feedback_file)

 10%|█         | 18427/177004 [00:06<00:56, 2789.04it/s]

Wrong response format. Question 18426 ignored


 11%|█         | 18706/177004 [00:07<01:03, 2495.93it/s]

Wrong response format. Question 18532 ignored


 11%|█         | 18846/177004 [00:07<01:13, 2143.87it/s]

Wrong response format. Question 18794 ignored


 11%|█         | 19164/177004 [00:08<01:22, 1913.70it/s]

Wrong response format. Question 19163 ignored


 11%|█         | 19540/177004 [00:09<01:32, 1696.96it/s]

Wrong response format. Question 19539 ignored


 11%|█         | 19769/177004 [00:09<01:52, 1394.73it/s]

Wrong response format. Question 19768 ignored


 11%|█▏        | 19935/177004 [00:10<02:15, 1162.22it/s]

Wrong response format. Question 19934 ignored


 11%|█▏        | 20227/177004 [00:10<02:34, 1013.43it/s]

Wrong response format. Question 20226 ignored


 12%|█▏        | 20426/177004 [00:11<03:17, 793.26it/s] 

Wrong response format. Question 20425 ignored


 12%|█▏        | 20471/177004 [00:12<04:32, 574.04it/s]

Wrong response format. Question 20440 ignored


 12%|█▏        | 20647/177004 [00:12<05:04, 512.90it/s]

Wrong response format. Question 20646 ignored


 12%|█▏        | 21167/177004 [00:13<04:06, 632.96it/s]

Wrong response format. Question 21166 ignored


 12%|█▏        | 21245/177004 [00:13<05:35, 464.42it/s]

Wrong response format. Question 21244 ignored


 12%|█▏        | 21791/177004 [00:14<04:15, 607.51it/s]

Wrong response format. Question 21790 ignored


 12%|█▏        | 21972/177004 [00:15<05:02, 511.95it/s]

Wrong response format. Question 21971 ignored


 13%|█▎        | 22175/177004 [00:15<05:26, 474.59it/s]

Wrong response format. Question 22174 ignored


 13%|█▎        | 22221/177004 [00:16<07:02, 365.99it/s]

Wrong response format. Question 22207 ignored


 13%|█▎        | 22841/177004 [00:16<04:47, 536.74it/s]

Wrong response format. Question 22840 ignored


 13%|█▎        | 23529/177004 [00:17<03:31, 725.74it/s]

Wrong response format. Question 23528 ignored


 14%|█▍        | 25307/177004 [00:18<01:40, 1503.79it/s]

Wrong response format. Question 24316 ignored
Wrong response format. Question 25571 ignored


 15%|█▍        | 25667/177004 [00:19<02:47, 902.73it/s] 

Wrong response format. Question 25573 ignored


 15%|█▍        | 25929/177004 [00:19<03:21, 748.27it/s]

Wrong response format. Question 25869 ignored


 15%|█▍        | 26125/177004 [00:20<03:55, 641.57it/s]

Wrong response format. Question 26097 ignored


 15%|█▍        | 26347/177004 [00:20<04:23, 571.97it/s]

Wrong response format. Question 26346 ignored


 15%|█▍        | 26462/177004 [00:21<05:24, 463.82it/s]

Wrong response format. Question 26385 ignored


 15%|█▌        | 26691/177004 [00:22<05:49, 429.64it/s]

Wrong response format. Question 26690 ignored


 15%|█▌        | 26763/177004 [00:22<07:11, 347.90it/s]

Wrong response format. Question 26721 ignored


 15%|█▌        | 26945/177004 [00:23<07:19, 341.35it/s]

Wrong response format. Question 26944 ignored


 15%|█▌        | 27095/177004 [00:23<07:46, 321.07it/s]

Wrong response format. Question 27094 ignored


 15%|█▌        | 27239/177004 [00:24<08:12, 303.80it/s]

Wrong response format. Question 27238 ignored


 16%|█▌        | 27541/177004 [00:24<06:30, 382.76it/s]

Wrong response format. Question 27540 ignored


 16%|█▌        | 27602/177004 [00:25<08:11, 304.24it/s]

Wrong response format. Question 27601 ignored


 16%|█▌        | 28386/177004 [00:26<03:31, 704.11it/s]

Wrong response format. Question 27854 ignored


 16%|█▌        | 28687/177004 [00:26<03:54, 631.76it/s]

Wrong response format. Question 28686 ignored


 17%|█▋        | 29329/177004 [00:27<02:45, 893.37it/s]

Wrong response format. Question 28707 ignored


 17%|█▋        | 29575/177004 [00:27<03:20, 735.05it/s]

Wrong response format. Question 29574 ignored


 17%|█▋        | 29760/177004 [00:28<04:12, 582.68it/s]

Wrong response format. Question 29759 ignored


 17%|█▋        | 30866/177004 [00:29<02:03, 1182.66it/s]

Wrong response format. Question 29987 ignored


 18%|█▊        | 32395/177004 [00:30<01:34, 1527.09it/s]

Wrong response format. Question 32004 ignored


 19%|█▊        | 33040/177004 [00:31<02:39, 900.83it/s]

Wrong response format. Node 36266 ignored
Wrong response format. Question 33213 ignored


 19%|█▉        | 33634/177004 [00:32<03:15, 732.95it/s]

Wrong response format. Question 33250 ignored
Wrong response format. Question 33766 ignored


 19%|█▉        | 33866/177004 [00:33<05:25, 440.15it/s]

Wrong response format. Question 33769 ignored


 19%|█▉        | 34110/177004 [00:34<05:33, 428.43it/s]

Wrong response format. Question 34109 ignored


 19%|█▉        | 34282/177004 [00:34<06:04, 391.07it/s]

Wrong response format. Question 34281 ignored


 20%|█▉        | 34550/177004 [00:35<05:52, 404.27it/s]

Wrong response format. Question 34549 ignored


 20%|█▉        | 34836/177004 [00:36<05:28, 433.11it/s]

Wrong response format. Question 34835 ignored
Wrong response format. Question 34862 ignored


 20%|██        | 35557/177004 [00:37<03:55, 601.59it/s]

Wrong response format. Question 34906 ignored


 20%|██        | 35748/177004 [00:38<04:44, 496.38it/s]

Wrong response format. Question 35560 ignored
Wrong response format. Node 84797 ignored
Wrong response format. Node 27954 ignored
Wrong response format. Node 33340 ignored
Wrong response format. Node 32052 ignored
Wrong response format. Node 31032 ignored
Wrong response format. Node 32027 ignored
Wrong response format. Node 39782 ignored


 20%|██        | 35891/177004 [00:40<13:24, 175.30it/s]

Wrong response format. Question 35874 ignored


 20%|██        | 35993/177004 [00:41<13:33, 173.29it/s]

Wrong response format. Question 35930 ignored


 21%|██        | 36383/177004 [00:42<07:35, 308.68it/s]

Wrong response format. Question 36048 ignored


 21%|██        | 37175/177004 [00:43<03:46, 617.21it/s]

Wrong response format. Question 36890 ignored


 21%|██▏       | 37632/177004 [00:43<03:21, 690.38it/s]

Wrong response format. Question 37321 ignored


 21%|██▏       | 37883/177004 [00:44<04:16, 543.15it/s]

Wrong response format. Question 37882 ignored


 22%|██▏       | 38273/177004 [00:45<04:12, 548.91it/s]

Wrong response format. Question 37895 ignored


 22%|██▏       | 39486/177004 [00:46<02:32, 903.48it/s]

Wrong response format. Question 39236 ignored


 22%|██▏       | 39673/177004 [00:46<03:46, 606.70it/s]

Wrong response format. Question 39518 ignored


 23%|██▎       | 40171/177004 [00:47<03:09, 720.85it/s]

Wrong response format. Question 39919 ignored


 23%|██▎       | 40416/177004 [00:48<04:25, 515.09it/s]

Wrong response format. Question 40415 ignored


 23%|██▎       | 40537/177004 [00:48<05:34, 408.21it/s]

Wrong response format. Question 40536 ignored


 23%|██▎       | 40878/177004 [00:49<04:37, 491.35it/s]

Wrong response format. Question 40593 ignored


 23%|██▎       | 41228/177004 [00:50<04:08, 546.59it/s]

Wrong response format. Question 40893 ignored


 23%|██▎       | 41580/177004 [00:50<03:45, 600.24it/s]

Wrong response format. Question 41305 ignored


 24%|██▍       | 42183/177004 [00:51<03:09, 711.05it/s]

Wrong response format. Question 41837 ignored


 24%|██▍       | 42787/177004 [00:52<02:28, 902.03it/s]

Wrong response format. Question 42348 ignored
Wrong response format. Question 42798 ignored


 24%|██▍       | 42955/177004 [00:53<05:47, 386.26it/s]

Wrong response format. Question 42882 ignored
Wrong response format. Question 42956 ignored


 24%|██▍       | 43285/177004 [00:54<06:12, 358.50it/s]

Wrong response format. Question 43004 ignored


 25%|██▍       | 43658/177004 [00:55<05:30, 403.73it/s]

Wrong response format. Question 43645 ignored


 25%|██▍       | 44163/177004 [00:56<03:37, 611.90it/s]

Wrong response format. Question 43776 ignored


 25%|██▌       | 44507/177004 [00:56<04:09, 531.30it/s]

Wrong response format. Question 44506 ignored


 25%|██▌       | 44625/177004 [00:57<05:26, 405.87it/s]

Wrong response format. Question 44624 ignored


 25%|██▌       | 45120/177004 [00:58<03:46, 582.54it/s]

Wrong response format. Question 44716 ignored


 26%|██▌       | 45675/177004 [00:59<03:10, 691.16it/s]

Wrong response format. Question 45342 ignored


 26%|██▌       | 46037/177004 [00:59<04:07, 528.76it/s]

Wrong response format. Question 45986 ignored


 26%|██▌       | 46353/177004 [01:02<10:11, 213.73it/s]

Wrong response format. Node 85940 ignored
Wrong response format. Node 84172 ignored
Wrong response format. Node 92672 ignored
Wrong response format. Node 33029 ignored


 26%|██▋       | 46882/177004 [01:03<05:30, 394.11it/s]

Wrong response format. Question 46604 ignored


 27%|██▋       | 47399/177004 [01:04<03:53, 555.99it/s]

Wrong response format. Question 47190 ignored


 27%|██▋       | 48114/177004 [01:05<02:45, 777.84it/s]

Wrong response format. Question 47758 ignored
Wrong response format. Question 48171 ignored


 27%|██▋       | 48452/177004 [01:06<04:30, 475.30it/s]

Wrong response format. Question 48246 ignored


 27%|██▋       | 48601/177004 [01:07<05:23, 396.57it/s]

Wrong response format. Question 48600 ignored


 28%|██▊       | 48879/177004 [01:07<04:54, 434.82it/s]

Wrong response format. Question 48645 ignored


 28%|██▊       | 49163/177004 [01:08<04:38, 459.83it/s]

Wrong response format. Question 48937 ignored


 28%|██▊       | 49524/177004 [01:09<04:44, 448.82it/s]

Wrong response format. Question 49523 ignored


 28%|██▊       | 49815/177004 [01:09<04:32, 466.95it/s]

Wrong response format. Question 49649 ignored


 28%|██▊       | 50096/177004 [01:10<04:26, 475.72it/s]

Wrong response format. Question 49865 ignored


 28%|██▊       | 50204/177004 [01:11<06:08, 344.12it/s]

Wrong response format. Question 50105 ignored


 28%|██▊       | 50371/177004 [01:11<06:43, 313.80it/s]

Wrong response format. Question 50370 ignored


 28%|██▊       | 50442/177004 [01:12<08:59, 234.79it/s]

Wrong response format. Question 50441 ignored


 29%|██▊       | 50501/177004 [01:12<10:52, 193.87it/s]

Wrong response format. Question 50500 ignored


 29%|██▊       | 50791/177004 [01:13<07:21, 286.13it/s]

Wrong response format. Question 50637 ignored


 29%|██▉       | 51041/177004 [01:14<05:52, 356.97it/s]

Wrong response format. Question 50849 ignored


 29%|██▉       | 51298/177004 [01:15<05:13, 400.78it/s]

Wrong response format. Question 51124 ignored


 29%|██▉       | 51585/177004 [01:15<05:36, 372.82it/s]

Wrong response format. Question 51536 ignored


 29%|██▉       | 51674/177004 [01:16<07:23, 282.51it/s]

Wrong response format. Question 51608 ignored


 29%|██▉       | 51762/177004 [01:17<08:53, 234.93it/s]

Wrong response format. Question 51761 ignored


 29%|██▉       | 52078/177004 [01:17<05:50, 356.60it/s]

Wrong response format. Question 51915 ignored


 30%|██▉       | 52322/177004 [01:18<05:18, 391.19it/s]

Wrong response format. Question 52132 ignored


 30%|██▉       | 52617/177004 [01:19<06:13, 333.40it/s]

Wrong response format. Question 52616 ignored


 30%|██▉       | 52928/177004 [01:19<05:08, 402.64it/s]

Wrong response format. Question 52772 ignored


 30%|███       | 53543/177004 [01:20<03:43, 553.27it/s]

Wrong response format. Question 53390 ignored


 30%|███       | 53852/177004 [01:21<02:29, 823.84it/s]

Wrong response format. Question 53911 ignored


 30%|███       | 53985/177004 [01:22<06:21, 322.88it/s]

Wrong response format. Question 53947 ignored


 31%|███       | 54278/177004 [01:22<05:21, 381.91it/s]

Wrong response format. Question 54132 ignored


 31%|███       | 54703/177004 [01:23<05:48, 350.64it/s]

Wrong response format. Question 54685 ignored


 31%|███       | 55125/177004 [01:24<03:49, 530.15it/s]

Wrong response format. Question 54826 ignored


 31%|███▏      | 55533/177004 [01:25<03:16, 619.48it/s]

Wrong response format. Question 55252 ignored


 32%|███▏      | 56258/177004 [01:26<03:38, 552.35it/s] 

Wrong response format. Question 56246 ignored
Wrong response format. Question 56258 ignored


 32%|███▏      | 56363/177004 [01:27<07:59, 251.62it/s]

Wrong response format. Question 56311 ignored


 32%|███▏      | 56589/177004 [01:28<06:33, 306.17it/s]

Wrong response format. Question 56367 ignored


 32%|███▏      | 56998/177004 [01:28<04:46, 418.64it/s]

Wrong response format. Question 56859 ignored


 32%|███▏      | 57249/177004 [01:29<05:37, 355.31it/s]

Wrong response format. Question 57178 ignored
Wrong response format. Question 57258 ignored
Wrong response format. Question 57316 ignored


 32%|███▏      | 57468/177004 [01:31<10:13, 194.80it/s]

Wrong response format. Question 57322 ignored


 33%|███▎      | 57688/177004 [01:32<07:44, 257.08it/s]

Wrong response format. Question 57518 ignored


 33%|███▎      | 58516/177004 [01:33<03:43, 531.04it/s]

Wrong response format. Question 58257 ignored


 34%|███▎      | 59308/177004 [01:34<02:44, 713.89it/s]

Wrong response format. Question 59043 ignored


 34%|███▎      | 59557/177004 [01:35<03:47, 516.94it/s]

Wrong response format. Question 59327 ignored


 34%|███▎      | 59657/177004 [01:35<05:50, 335.24it/s]

Wrong response format. Question 59585 ignored


 34%|███▍      | 59888/177004 [01:36<05:36, 348.19it/s]

Wrong response format. Question 59772 ignored


 34%|███▍      | 60233/177004 [01:37<04:50, 402.17it/s]

Wrong response format. Question 60023 ignored


 34%|███▍      | 60725/177004 [01:38<05:18, 365.17it/s]

Wrong response format. Question 60647 ignored


 34%|███▍      | 60934/177004 [01:39<05:22, 359.36it/s]

Wrong response format. Question 60770 ignored


 35%|███▍      | 61662/177004 [01:40<03:41, 520.97it/s]

Wrong response format. Question 61422 ignored


 35%|███▌      | 62265/177004 [01:41<03:44, 510.25it/s]

Wrong response format. Question 62111 ignored


 36%|███▌      | 62974/177004 [01:42<03:34, 531.55it/s]

Wrong response format. Question 62790 ignored


 36%|███▌      | 63200/177004 [01:43<04:54, 386.52it/s]

Wrong response format. Question 63152 ignored


 36%|███▌      | 63394/177004 [01:43<05:20, 354.63it/s]

Wrong response format. Question 63256 ignored


 36%|███▌      | 63949/177004 [01:44<04:17, 439.80it/s]

Wrong response format. Question 63844 ignored


 37%|███▋      | 65085/177004 [01:46<04:24, 423.69it/s] 

Wrong response format. Question 65059 ignored


 37%|███▋      | 65276/177004 [01:47<05:09, 360.95it/s]

Wrong response format. Question 65095 ignored


 37%|███▋      | 65352/177004 [01:47<07:23, 251.54it/s]

Wrong response format. Question 65317 ignored


 37%|███▋      | 65533/177004 [01:48<06:43, 276.17it/s]

Wrong response format. Question 65425 ignored


 37%|███▋      | 66063/177004 [01:49<04:34, 404.63it/s]

Wrong response format. Question 65939 ignored


 37%|███▋      | 66372/177004 [01:50<05:32, 332.89it/s]

Wrong response format. Question 66327 ignored


 38%|███▊      | 66557/177004 [01:50<05:41, 323.04it/s]

Wrong response format. Question 66379 ignored


 38%|███▊      | 67002/177004 [01:51<05:13, 351.36it/s]

Wrong response format. Question 66951 ignored


 38%|███▊      | 67196/177004 [01:52<05:22, 340.42it/s]

Wrong response format. Question 67058 ignored


 38%|███▊      | 67751/177004 [01:53<04:53, 372.22it/s]

Wrong response format. Question 67647 ignored


 38%|███▊      | 67829/177004 [01:54<07:01, 258.74it/s]

Wrong response format. Question 67772 ignored


 38%|███▊      | 67887/177004 [01:54<09:20, 194.63it/s]

Wrong response format. Question 67856 ignored


 38%|███▊      | 67940/177004 [01:55<11:00, 165.14it/s]

Wrong response format. Question 67939 ignored


 38%|███▊      | 68081/177004 [01:56<08:54, 203.75it/s]

Wrong response format. Question 67954 ignored


 39%|███▊      | 68410/177004 [01:56<05:23, 335.30it/s]

Wrong response format. Question 68187 ignored


 39%|███▉      | 68612/177004 [01:57<06:02, 299.12it/s]

Wrong response format. Question 68597 ignored


 39%|███▉      | 68942/177004 [01:58<04:17, 419.61it/s]

Wrong response format. Question 68710 ignored


 40%|███▉      | 70048/177004 [01:59<03:31, 506.81it/s]

Wrong response format. Question 69872 ignored


 41%|████      | 71864/177004 [02:02<03:38, 481.81it/s]

Wrong response format. Question 71766 ignored
Wrong response format. Question 71930 ignored


 41%|████      | 72051/177004 [02:03<07:14, 241.79it/s]

Wrong response format. Question 71933 ignored


 41%|████      | 72352/177004 [02:04<06:22, 273.93it/s]

Wrong response format. Question 72331 ignored


 41%|████      | 72448/177004 [02:05<07:27, 233.61it/s]

Wrong response format. Question 72447 ignored


 41%|████      | 72501/177004 [02:05<09:41, 179.62it/s]

Wrong response format. Question 72473 ignored


 41%|████      | 72655/177004 [02:06<08:03, 215.84it/s]

Wrong response format. Question 72556 ignored


 41%|████      | 72724/177004 [02:06<09:51, 176.23it/s]

Wrong response format. Question 72723 ignored


 41%|████      | 72856/177004 [02:07<08:38, 200.76it/s]

Wrong response format. Question 72724 ignored


 41%|████      | 72905/177004 [02:08<11:17, 153.76it/s]

Wrong response format. Question 72885 ignored


 41%|████▏     | 73039/177004 [02:08<09:01, 192.09it/s]

Wrong response format. Question 72932 ignored


 41%|████▏     | 73206/177004 [02:09<08:39, 199.70it/s]

Wrong response format. Question 73201 ignored


 41%|████▏     | 73374/177004 [02:10<07:11, 240.07it/s]

Wrong response format. Question 73273 ignored


 42%|████▏     | 73526/177004 [02:10<06:51, 251.44it/s]

Wrong response format. Question 73425 ignored


 42%|████▏     | 74386/177004 [02:12<03:41, 462.91it/s]

Wrong response format. Question 74230 ignored


 42%|████▏     | 75075/177004 [02:13<03:49, 444.08it/s]

Wrong response format. Question 74945 ignored


 43%|████▎     | 75430/177004 [02:14<04:21, 388.72it/s]

Wrong response format. Question 75287 ignored


 43%|████▎     | 76558/177004 [02:16<03:52, 431.27it/s]

Wrong response format. Question 76424 ignored


 44%|████▎     | 77024/177004 [02:16<02:07, 783.06it/s]

Wrong response format. Question 77060 ignored
Wrong response format. Question 77073 ignored


 44%|████▎     | 77115/177004 [02:18<12:22, 134.54it/s]

Wrong response format. Question 77095 ignored


 44%|████▎     | 77276/177004 [02:19<09:21, 177.55it/s]

Wrong response format. Question 77158 ignored


 44%|████▍     | 77442/177004 [02:20<08:47, 188.63it/s]

Wrong response format. Question 77382 ignored


 44%|████▍     | 77591/177004 [02:20<07:38, 216.84it/s]

Wrong response format. Question 77466 ignored


 44%|████▍     | 77861/177004 [02:21<05:08, 321.11it/s]

Wrong response format. Question 77681 ignored
Wrong response format. Question 77907 ignored


 44%|████▍     | 78013/177004 [02:22<07:48, 211.08it/s]

Wrong response format. Question 77927 ignored


 44%|████▍     | 78191/177004 [02:23<04:39, 353.06it/s]

Wrong response format. Question 78241 ignored


 44%|████▍     | 78347/177004 [02:24<08:01, 205.04it/s]

Wrong response format. Question 78243 ignored


 45%|████▍     | 79229/177004 [02:25<04:57, 328.35it/s]

Wrong response format. Question 79184 ignored


 45%|████▍     | 79378/177004 [02:26<05:51, 278.02it/s]

Wrong response format. Question 79233 ignored


 45%|████▌     | 79710/177004 [02:27<04:48, 337.62it/s]

Wrong response format. Question 79557 ignored


 45%|████▌     | 79864/177004 [02:28<07:37, 212.35it/s]

Wrong response format. Question 79825 ignored


 45%|████▌     | 79922/177004 [02:28<09:49, 164.79it/s]

Wrong response format. Question 79921 ignored


 45%|████▌     | 80068/177004 [02:29<08:07, 198.66it/s]

Wrong response format. Question 79978 ignored


 45%|████▌     | 80314/177004 [02:30<06:05, 264.37it/s]

Wrong response format. Question 80172 ignored


 45%|████▌     | 80462/177004 [02:31<06:15, 257.14it/s]

Wrong response format. Question 80332 ignored


 46%|████▌     | 81275/177004 [02:32<03:54, 407.56it/s]

Wrong response format. Question 81126 ignored


 46%|████▋     | 81978/177004 [02:33<03:58, 398.56it/s]

Wrong response format. Question 81876 ignored


 46%|████▋     | 82050/177004 [02:34<06:28, 244.35it/s]

Wrong response format. Question 82017 ignored


 46%|████▋     | 82194/177004 [02:35<06:25, 246.18it/s]

Wrong response format. Question 82088 ignored


 47%|████▋     | 82893/177004 [02:36<03:50, 407.42it/s]

Wrong response format. Question 82725 ignored


 47%|████▋     | 83155/177004 [02:36<02:29, 626.01it/s]

Wrong response format. Question 83176 ignored


 47%|████▋     | 83236/177004 [02:37<08:07, 192.47it/s]

Wrong response format. Question 83204 ignored


 47%|████▋     | 83385/177004 [02:38<07:54, 197.47it/s]

Wrong response format. Question 83300 ignored


 47%|████▋     | 83618/177004 [02:43<28:32, 54.54it/s] 

Wrong response format. Node 33594 ignored
Wrong response format. Node 70527 ignored
Wrong response format. Node 84033 ignored
Wrong response format. Node 35816 ignored
Wrong response format. Node 26963 ignored
Wrong response format. Node 95168 ignored
Wrong response format. Node 39855 ignored
Wrong response format. Node 94522 ignored
Wrong response format. Node 94521 ignored
Wrong response format. Node 16282 ignored
Wrong response format. Node 13453 ignored
Wrong response format. Node 10984 ignored
Wrong response format. Node 2498 ignored
Wrong response format. Node 11482 ignored


 47%|████▋     | 83752/177004 [02:43<17:57, 86.53it/s]

Wrong response format. Question 83662 ignored


 47%|████▋     | 83803/177004 [02:44<18:00, 86.30it/s]

Wrong response format. Question 83758 ignored


 47%|████▋     | 83864/177004 [02:45<17:11, 90.29it/s]

Wrong response format. Question 83863 ignored


 47%|████▋     | 83933/177004 [02:45<15:39, 99.02it/s]

Wrong response format. Question 83932 ignored


 48%|████▊     | 84079/177004 [02:46<10:23, 148.98it/s]

Wrong response format. Question 83996 ignored


 48%|████▊     | 84304/177004 [02:47<06:15, 246.57it/s]

Wrong response format. Question 84145 ignored


 48%|████▊     | 84431/177004 [02:47<07:39, 201.30it/s]

Wrong response format. Question 84310 ignored


 48%|████▊     | 84640/177004 [02:48<06:25, 239.64it/s]

Wrong response format. Question 84548 ignored


 48%|████▊     | 85347/177004 [02:50<05:00, 305.37it/s]

Wrong response format. Question 85226 ignored


 49%|████▊     | 85963/177004 [02:51<02:06, 719.53it/s]

Wrong response format. Question 85992 ignored


 49%|████▊     | 86105/177004 [02:52<06:48, 222.36it/s]

Wrong response format. Question 86035 ignored


 49%|████▉     | 86574/177004 [02:53<05:09, 291.77it/s]

Wrong response format. Question 86447 ignored


 49%|████▉     | 86701/177004 [02:54<06:14, 241.02it/s]

Wrong response format. Question 86618 ignored


 49%|████▉     | 86907/177004 [02:55<05:32, 270.79it/s]

Wrong response format. Question 86786 ignored


 50%|████▉     | 87800/177004 [02:56<04:14, 350.93it/s]

Wrong response format. Question 87697 ignored


 50%|█████     | 88614/177004 [02:58<04:02, 364.79it/s]

Wrong response format. Question 88521 ignored


 50%|█████     | 89278/177004 [02:59<04:20, 337.30it/s]

Wrong response format. Question 89144 ignored


 51%|█████     | 89411/177004 [03:00<06:13, 234.83it/s]

Wrong response format. Question 89402 ignored
Wrong response format. Question 89433 ignored


 51%|█████     | 89458/177004 [03:01<12:44, 114.49it/s]

Wrong response format. Question 89443 ignored


 51%|█████     | 89595/177004 [03:02<09:33, 152.44it/s]

Wrong response format. Question 89527 ignored


 51%|█████     | 90011/177004 [03:03<05:17, 273.95it/s]

Wrong response format. Question 89906 ignored


 51%|█████     | 90133/177004 [03:04<07:23, 195.91it/s]

Wrong response format. Question 90099 ignored


 51%|█████     | 90176/177004 [03:04<09:44, 148.54it/s]

Wrong response format. Question 90162 ignored


 51%|█████     | 90301/177004 [03:05<08:13, 175.85it/s]

Wrong response format. Question 90229 ignored


 51%|█████▏    | 91062/177004 [03:07<04:20, 330.03it/s]

Wrong response format. Question 90964 ignored


 52%|█████▏    | 91267/177004 [03:08<05:47, 246.76it/s]

Wrong response format. Question 91140 ignored


 52%|█████▏    | 91628/177004 [03:09<04:42, 301.96it/s]

Wrong response format. Question 91494 ignored


 52%|█████▏    | 92320/177004 [03:10<04:00, 352.28it/s]

Wrong response format. Question 92197 ignored


 53%|█████▎    | 93398/177004 [03:12<03:51, 361.36it/s]

Wrong response format. Question 93292 ignored


 53%|█████▎    | 93613/177004 [03:13<04:43, 294.48it/s]

Wrong response format. Question 93513 ignored


 53%|█████▎    | 94491/177004 [03:15<03:59, 343.99it/s]

Wrong response format. Question 94392 ignored


 53%|█████▎    | 94627/177004 [03:16<06:55, 198.30it/s]

Wrong response format. Node 14054 ignored


 54%|█████▎    | 94752/177004 [03:16<06:29, 211.13it/s]

Wrong response format. Question 94682 ignored


 54%|█████▎    | 94944/177004 [03:17<05:10, 264.10it/s]

Wrong response format. Question 94810 ignored


 54%|█████▎    | 95065/177004 [03:18<06:13, 219.10it/s]

Wrong response format. Question 95013 ignored


 54%|█████▍    | 95950/177004 [03:20<03:45, 358.81it/s]

Wrong response format. Question 95879 ignored


 54%|█████▍    | 96391/177004 [03:20<02:04, 649.03it/s]

Wrong response format. Question 96402 ignored


 55%|█████▍    | 96536/177004 [03:22<06:03, 221.61it/s]

Wrong response format. Question 96422 ignored


 55%|█████▍    | 96897/177004 [03:23<05:04, 263.51it/s]

Wrong response format. Question 96810 ignored


 55%|█████▍    | 97249/177004 [03:23<02:25, 547.88it/s]

Wrong response format. Question 97270 ignored


 55%|█████▌    | 97392/177004 [08:41<48:03:34,  2.17s/it]

Wrong response format. Question 97391 ignored


 55%|█████▌    | 97401/177004 [09:10<53:23:19,  2.41s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.43912258744239807
Cop confidence without context: 0.43912258744239807
LLM is correct without context: True
Confidences with context: {'anatomy': {70733: 0.39609143137931824}, 'disease': {83798: 0.40001827478408813}, 'effect/phenotype': {22925: 0.39744535088539124, 89723: 0.40543243288993835, 91754: 0.4100411534309387}}
Accuracies with context: {'anatomy': {70733: 1}, 'disease': {83798: 1}, 'effect/phenotype': {22925: 1, 89723: 1, 91754: 1}}
Cop confidences with context: {'anatomy': {70733: 0.39609143137931824}, 'disease': {83798: 0.40001827478408813}, 'effect/phenotype': {22925: 0.39744535088539124, 89723: 0.40543243288993835, 91754: 0.4100411534309387}}


 55%|█████▌    | 97501/177004 [14:41<71:14:31,  3.23s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2756394147872925
Cop confidence without context: 0.2756394147872925
LLM is correct without context: True
Confidences with context: {'anatomy': {72071: 0.3247811496257782}}
Accuracies with context: {'anatomy': {72071: 0}}
Cop confidences with context: {'anatomy': {72071: 0.25492364168167114}}


 55%|█████▌    | 97523/177004 [15:58<76:05:33,  3.45s/it]

Wrong response format. Node 72147 ignored


 55%|█████▌    | 97565/177004 [18:24<87:19:24,  3.96s/it]

Wrong response format. Node 91012 ignored


 55%|█████▌    | 97601/177004 [20:23<65:00:44,  2.95s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4255705177783966
Cop confidence without context: 0.20419096946716309
LLM is correct without context: False
Confidences with context: {'disease': {33075: 0.4579002857208252}}
Accuracies with context: {'disease': {33075: 0}}
Cop confidences with context: {'disease': {33075: 0.1658404916524887}}


 55%|█████▌    | 97683/177004 [25:14<88:39:53,  4.02s/it]

Wrong response format. Node 96414 ignored
Wrong response format. Node 21723 ignored
Wrong response format. Node 21699 ignored


 55%|█████▌    | 97701/177004 [26:09<73:06:40,  3.32s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.5022894740104675
Cop confidence without context: 0.5022894740104675
LLM is correct without context: True
Confidences with context: {'drug': {17606: 0.531107485294342, 21203: 0.5029581189155579}, 'disease': {31731: 0.7333835959434509}, 'gene/protein': {34574: 0.48683273792266846}}
Accuracies with context: {'drug': {17606: 1, 21203: 1}, 'disease': {31731: 1}, 'gene/protein': {34574: 1}}
Cop confidences with context: {'drug': {17606: 0.531107485294342, 21203: 0.5029581189155579}, 'disease': {31731: 0.7333835959434509}, 'gene/protein': {34574: 0.48683273792266846}}


 55%|█████▌    | 97801/177004 [31:56<79:22:05,  3.61s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.7565787434577942
Cop confidence without context: 0.7565787434577942
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {88050: 0.6557028293609619, 94180: 0.47971487045288086}, 'disease': {33577: 0.6130375862121582, 99477: 0.6057496070861816, 97093: 0.5809199810028076, 38899: 0.5535802245140076, 97064: 0.6515360474586487}, 'gene/protein': {4794: 0.6853047609329224, 22002: 0.7097645401954651, 33776: 0.7719057202339172, 4757: 0.7352883219718933, 6907: 0.7076103091239929}}
Accuracies with context: {'effect/phenotype': {88050: 1, 94180: 1}, 'disease': {33577: 1, 99477: 1, 97093: 1, 38899: 1, 97064: 1}, 'gene/protein': {4794: 1, 22002: 1, 33776: 1, 4757: 1, 6907: 1}}
Cop confidences with context: {'effect/phenotype': {88050: 0.6557028293609619, 94180: 0.47971487045288086}, 'disease': {33577: 0.6130375862121582, 99477: 0.6057496070861816, 97093: 0.5809199810028076, 38899: 0.5535802245

 55%|█████▌    | 97806/177004 [32:07<46:27:19,  2.11s/it]

Wrong response format. Question 97805 ignored


 55%|█████▌    | 97807/177004 [32:08<35:37:33,  1.62s/it]

Wrong response format. Question 97806 ignored


 55%|█████▌    | 97827/177004 [33:08<52:08:03,  2.37s/it]

Wrong response format. Question 97826 ignored


 55%|█████▌    | 97896/177004 [37:08<65:47:37,  2.99s/it]

Wrong response format. Question 97895 ignored


 55%|█████▌    | 97901/177004 [37:22<54:23:48,  2.48s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2835668623447418
Cop confidence without context: 0.2835668623447418
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89660: 0.4645736813545227}, 'disease': {33474: 0.4850180447101593, 32981: 0.507328987121582, 27713: 0.4257243573665619}}
Accuracies with context: {'effect/phenotype': {89660: 0}, 'disease': {33474: 0, 32981: 0, 27713: 0}}
Cop confidences with context: {'effect/phenotype': {89660: 0.1320667713880539}, 'disease': {33474: 0.19292683899402618, 32981: 0.31255632638931274, 27713: 0.18167684972286224}}


 55%|█████▌    | 97957/177004 [40:31<63:34:55,  2.90s/it]

Wrong response format. Node 83917 ignored


 55%|█████▌    | 98001/177004 [42:46<65:44:50,  3.00s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.8079123497009277
Cop confidence without context: 0.04182591661810875
LLM is correct without context: False
Confidences with context: {'disease': {27759: 0.6388643980026245, 31044: 0.94334876537323, 39504: 0.5397998690605164}, 'drug': {18392: 0.6475110054016113, 21365: 0.6351476907730103}, 'gene/protein': {6018: 0.5870014429092407, 1639: 0.572177529335022, 10837: 0.5785555243492126}}
Accuracies with context: {'disease': {27759: 0, 31044: 0, 39504: 0}, 'drug': {18392: 0, 21365: 0}, 'gene/protein': {6018: 0, 1639: 0, 10837: 0}}
Cop confidences with context: {'disease': {27759: 0.10676495730876923, 31044: 0.011243032291531563, 39504: 0.13755322992801666}, 'drug': {18392: 0.10570326447486877, 21365: 0.10207752138376236}, 'gene/protein': {6018: 0.11649399995803833, 1639: 0.11624491959810257, 10837: 0.12512141466140747}}


 55%|█████▌    | 98055/177004 [45:44<77:54:39,  3.55s/it]

Wrong response format. Node 24513 ignored
Wrong response format. Node 88349 ignored
Wrong response format. Node 88350 ignored


 55%|█████▌    | 98069/177004 [46:34<79:16:15,  3.62s/it]

Wrong response format. Node 17173 ignored
Wrong response format. Node 18590 ignored


 55%|█████▌    | 98101/177004 [48:28<64:42:14,  2.95s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.4354913532733917
Cop confidence without context: 0.1759544163942337
LLM is correct without context: False
Confidences with context: {'drug': {14209: 0.3728313446044922, 15545: 0.3608322739601135, 16533: 0.34654372930526733, 14631: 0.3267664909362793, 20231: 0.3631753921508789}}
Accuracies with context: {'drug': {14209: 0, 15545: 0, 16533: 0, 14631: 0, 20231: 0}}
Cop confidences with context: {'drug': {14209: 0.20751188695430756, 15545: 0.2311578392982483, 16533: 0.1675776094198227, 14631: 0.176277294754982, 20231: 0.18694667518138885}}


 55%|█████▌    | 98201/177004 [54:20<74:46:26,  3.42s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.44948986172676086
Cop confidence without context: 0.1787947118282318
LLM is correct without context: False
Confidences with context: {'anatomy': {72216: 0.47099804878234863, 70192: 0.44172683358192444}, 'disease': {28627: 0.4091120958328247, 30680: 0.4441761076450348, 38339: 0.5171499848365784}}
Accuracies with context: {'anatomy': {72216: 0, 70192: 0}, 'disease': {28627: 0, 30680: 0, 38339: 0}}
Cop confidences with context: {'anatomy': {72216: 0.16925670206546783, 70192: 0.19601508975028992}, 'disease': {28627: 0.20094910264015198, 30680: 0.1621316522359848, 38339: 0.14473368227481842}}


 56%|█████▌    | 98301/177004 [59:40<67:31:54,  3.09s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5678598284721375
Cop confidence without context: 0.5678598284721375
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {88382: 0.5860570669174194, 84795: 0.5294819474220276, 26199: 0.582323431968689}, 'disease': {28473: 0.5682153701782227, 31661: 0.6916718482971191, 33064: 0.5733216404914856, 32595: 0.5279627442359924, 96871: 0.45557039976119995, 95791: 0.4989060163497925, 95670: 0.5426443219184875, 94925: 0.3893008828163147, 94924: 0.40155476331710815, 36499: 0.5544628500938416, 33392: 0.5449846386909485, 33195: 0.5162932276725769}}
Accuracies with context: {'effect/phenotype': {88382: 1, 84795: 1, 26199: 1}, 'disease': {28473: 1, 31661: 0, 33064: 0, 32595: 1, 96871: 1, 95791: 1, 95670: 1, 94925: 0, 94924: 0, 36499: 1, 33392: 1, 33195: 1}}
Cop confidences with context: {'effect/phenotype': {88382: 0.5860570669174194, 84795: 0.5294819474220276, 26199: 0.582323431968689}, 'disea

 56%|█████▌    | 98312/177004 [1:00:23<73:45:01,  3.37s/it]

Wrong response format. Node 95793 ignored


 56%|█████▌    | 98401/177004 [1:04:59<83:22:32,  3.82s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.38149163126945496
Cop confidence without context: 0.38149163126945496
LLM is correct without context: True
Confidences with context: {'drug': {20921: 0.41125160455703735, 15788: 0.3998124301433563, 15088: 0.5200381278991699, 21870: 0.37159380316734314, 20327: 0.41244572401046753, 21550: 0.35061556100845337, 21474: 0.3494754731655121, 21469: 0.36811885237693787, 21877: 0.37877821922302246, 21476: 0.3878270089626312, 21871: 0.40020573139190674, 21467: 0.3892225921154022, 21551: 0.39408597350120544}, 'gene/protein': {4970: 0.3516434133052826}, 'disease': {28966: 0.4064655303955078}}
Accuracies with context: {'drug': {20921: 1, 15788: 1, 15088: 0, 21870: 1, 20327: 1, 21550: 1, 21474: 1, 21469: 1, 21877: 1, 21476: 1, 21871: 1, 21467: 1, 21551: 1}, 'gene/protein': {4970: 1}, 'disease': {28966: 1}}
Cop confidences with context: {'drug': {20921: 0.41125160455703735, 15788: 0.3998124301433563, 15088: 0.2827398180961609

 56%|█████▌    | 98432/177004 [1:06:37<65:36:29,  3.01s/it]

Wrong response format. Node 97180 ignored
Wrong response format. Node 14995 ignored
Wrong response format. Node 85531 ignored
Wrong response format. Node 20402 ignored
Wrong response format. Node 15452 ignored
Wrong response format. Node 84014 ignored
Wrong response format. Node 21527 ignored
Wrong response format. Node 20231 ignored


 56%|█████▌    | 98501/177004 [1:10:13<51:02:24,  2.34s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.624932050704956
Cop confidence without context: 0.2625536620616913
LLM is correct without context: False
Confidences with context: {'anatomy': {63845: 0.52309650182724}, 'disease': {36041: 0.5545632243156433, 28448: 0.4527136981487274, 37758: 0.6071010828018188, 38559: 0.6583552956581116, 36849: 0.606201708316803, 95399: 0.5908472537994385, 37524: 0.5739515423774719, 96809: 0.5584721565246582, 97058: 0.643074631690979, 99429: 0.5569527745246887}}
Accuracies with context: {'anatomy': {63845: 0}, 'disease': {36041: 0, 28448: 0, 37758: 0, 38559: 0, 36849: 0, 95399: 0, 37524: 0, 96809: 0, 97058: 0, 99429: 0}}
Cop confidences with context: {'anatomy': {63845: 0.25693637132644653}, 'disease': {36041: 0.2538975477218628, 28448: 0.16016384959220886, 37758: 0.188071146607399, 38559: 0.09709426015615463, 36849: 0.17367969453334808, 95399: 0.23872385919094086, 37524: 0.2411351501941681, 96809: 0.24207976460456848, 97058:

 56%|█████▌    | 98514/177004 [1:10:53<56:17:12,  2.58s/it]

Wrong response format. Question 98513 ignored


 56%|█████▌    | 98551/177004 [1:12:53<81:25:49,  3.74s/it]

Wrong response format. Node 83790 ignored


 56%|█████▌    | 98595/177004 [1:15:20<51:24:17,  2.36s/it]

Wrong response format. Node 84015 ignored
Wrong response format. Node 68530 ignored


 56%|█████▌    | 98601/177004 [1:15:39<73:13:46,  3.36s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.29448992013931274
Cop confidence without context: 0.19013698399066925
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {93534: 0.27208074927330017, 22952: 0.2723191976547241}, 'disease': {31913: 0.2602780759334564, 99856: 0.26816803216934204, 32890: 0.309887170791626, 27731: 0.28029289841651917, 33521: 0.28661447763442993, 31764: 0.35065019130706787, 30409: 0.2992064356803894, 33473: 0.2866274118423462, 33528: 0.27783480286598206, 31749: 0.29568547010421753}, 'drug': {39890: 0.2930392920970917, 14715: 0.2626285254955292}}
Accuracies with context: {'effect/phenotype': {93534: 0, 22952: 0}, 'disease': {31913: 0, 99856: 0, 32890: 0, 27731: 0, 33521: 0, 31764: 0, 30409: 0, 33473: 0, 33528: 0, 31749: 0}, 'drug': {39890: 0, 14715: 0}}
Cop confidences with context: {'effect/phenotype': {93534: 0.22033770382404327, 22952: 0.2120824158191681}, 'disease': {31913: 0.22969458997249603, 

 56%|█████▌    | 98701/177004 [1:21:09<86:34:16,  3.98s/it] 

Example Feedback:

Response without context: A
Confidence without context: 0.4114203453063965
Cop confidence without context: 0.22194461524486542
LLM is correct without context: False
Confidences with context: {'disease': {99881: 0.4511663019657135, 94774: 0.47838094830513, 30152: 0.3732132613658905, 99806: 0.3619522452354431, 33257: 0.7812302112579346}}
Accuracies with context: {'disease': {99881: 0, 94774: 0, 30152: 0, 99806: 0, 33257: 0}}
Cop confidences with context: {'disease': {99881: 0.13546468317508698, 94774: 0.1917792409658432, 30152: 0.25650548934936523, 99806: 0.2818886935710907, 33257: 0.02550807036459446}}


 56%|█████▌    | 98801/177004 [1:26:43<79:13:37,  3.65s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.2661329209804535
Cop confidence without context: 0.1993243545293808
LLM is correct without context: False
Confidences with context: {'disease': {36041: 0.2614385783672333, 95399: 0.2683948278427124, 96809: 0.266583114862442, 99429: 0.2625235915184021, 37524: 0.2631714344024658}, 'drug': {14771: 0.2820548415184021, 15799: 0.2874101996421814, 14790: 0.2982059717178345, 15221: 0.38782283663749695, 15440: 0.4267902672290802, 21438: 0.44020235538482666, 14965: 0.6294514536857605, 14870: 0.2747451364994049, 17207: 0.27407020330429077, 15545: 0.2690582573413849, 17196: 0.27054327726364136, 21402: 0.2722853720188141, 21408: 0.2781234383583069}}
Accuracies with context: {'disease': {36041: 0, 95399: 0, 96809: 0, 99429: 0, 37524: 0}, 'drug': {14771: 0, 15799: 0, 14790: 0, 15221: 1, 15440: 0, 21438: 0, 14965: 0, 14870: 0, 17207: 0, 15545: 0, 17196: 0, 21402: 0, 21408: 0}}
Cop confidences with context: {'disease': {36041:

 56%|█████▌    | 98901/177004 [1:32:31<91:29:35,  4.22s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.33021995425224304
Cop confidence without context: 0.33021995425224304
LLM is correct without context: True
Confidences with context: {'disease': {31637: 0.3607766926288605, 30075: 0.33856117725372314, 36724: 0.39003628492355347, 32531: 0.3811906576156616, 35884: 0.43193456530570984, 98661: 0.4016631543636322, 97389: 0.34493839740753174, 98664: 0.35834670066833496, 97390: 0.43934208154678345, 97406: 0.44636258482933044, 97280: 0.4363132417201996, 37958: 0.39998772740364075, 99923: 0.41637900471687317, 97274: 0.3492012321949005, 97373: 0.42513012886047363}, 'drug': {15654: 0.323540598154068}, 'effect/phenotype': {92435: 0.32910922169685364}}
Accuracies with context: {'disease': {31637: 1, 30075: 1, 36724: 1, 32531: 1, 35884: 1, 98661: 1, 97389: 1, 98664: 1, 97390: 1, 97406: 1, 97280: 1, 37958: 1, 99923: 1, 97274: 1, 97373: 1}, 'drug': {15654: 1}, 'effect/phenotype': {92435: 1}}
Cop confidences with context: {'di

 56%|█████▌    | 99001/177004 [1:38:26<93:52:56,  4.33s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33592045307159424
Cop confidence without context: 0.3011171221733093
LLM is correct without context: False
Confidences with context: {'anatomy': {71885: 0.2499578893184662, 71297: 0.2682672441005707, 72122: 0.2882739305496216, 72553: 0.3177807927131653, 72636: 0.30255699157714844, 68189: 0.2923552393913269, 65949: 0.26321566104888916, 70279: 0.3035255968570709, 76740: 0.2725970447063446, 64304: 0.2653674781322479, 68255: 0.26095491647720337}, 'disease': {36372: 0.5705636143684387, 28921: 0.4988500773906708, 33626: 0.33547377586364746, 36448: 0.31111282110214233, 84162: 0.264859139919281}, 'gene/protein': {35178: 0.2568039894104004}}
Accuracies with context: {'anatomy': {71885: 0, 71297: 0, 72122: 0, 72553: 0, 72636: 0, 68189: 0, 65949: 0, 70279: 0, 76740: 0, 64304: 0, 68255: 0}, 'disease': {36372: 0, 28921: 1, 33626: 0, 36448: 0, 84162: 0}, 'gene/protein': {35178: 0}}
Cop confidences with context: {'anatomy': 

 56%|█████▌    | 99101/177004 [1:43:56<53:12:17,  2.46s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.25146275758743286
Cop confidence without context: 0.20205600559711456
LLM is correct without context: False
Confidences with context: {'disease': {99856: 0.29724371433258057, 83955: 0.3182739317417145}, 'drug': {15976: 0.31100571155548096, 21203: 0.2814314663410187}}
Accuracies with context: {'disease': {99856: 0, 83955: 0}, 'drug': {15976: 0, 21203: 0}}
Cop confidences with context: {'disease': {99856: 0.21577581763267517, 83955: 0.1915406435728073}, 'drug': {15976: 0.2120874673128128, 21203: 0.19191958010196686}}


 56%|█████▌    | 99125/177004 [1:45:16<75:36:16,  3.49s/it]

Wrong response format. Node 31133 ignored


 56%|█████▌    | 99182/177004 [1:48:29<98:36:16,  4.56s/it] 

Wrong response format. Node 30251 ignored


 56%|█████▌    | 99201/177004 [1:49:31<77:17:21,  3.58s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.297433465719223
Cop confidence without context: 0.297433465719223
LLM is correct without context: True
Confidences with context: {'disease': {30357: 0.2921828329563141, 36117: 0.4674069583415985, 39794: 0.7791603207588196, 37787: 0.4554041028022766, 38973: 0.30756351351737976, 38216: 0.2988484501838684, 83973: 0.39190545678138733, 38087: 0.4528874158859253, 97214: 0.44309186935424805, 99043: 0.457881361246109}, 'effect/phenotype': {93837: 0.2730506956577301, 94334: 0.27860528230667114}}
Accuracies with context: {'disease': {30357: 0, 36117: 0, 39794: 0, 37787: 0, 38973: 0, 38216: 0, 83973: 0, 38087: 0, 97214: 0, 99043: 0}, 'effect/phenotype': {93837: 0, 94334: 0}}
Cop confidences with context: {'disease': {30357: 0.25785043835639954, 36117: 0.28796109557151794, 39794: 0.04464944079518318, 37787: 0.20366930961608887, 38973: 0.258994460105896, 38216: 0.25762346386909485, 83973: 0.28897297382354736, 38087: 0.2878

 56%|█████▌    | 99301/177004 [1:55:49<71:25:03,  3.31s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.5191996097564697
Cop confidence without context: 0.2358570396900177
LLM is correct without context: False
Confidences with context: {'disease': {97907: 0.3594059944152832, 33482: 0.7008054852485657, 27699: 0.34993377327919006, 32308: 0.48301196098327637, 37611: 0.3783346712589264, 32413: 0.29376399517059326}, 'anatomy': {66059: 0.42886754870414734}, 'effect/phenotype': {92446: 0.3855619728565216, 93443: 0.5392928123474121, 91309: 0.4273046851158142}, 'gene/protein': {8527: 0.37647417187690735}}
Accuracies with context: {'disease': {97907: 0, 33482: 0, 27699: 0, 32308: 0, 37611: 0, 32413: 0}, 'anatomy': {66059: 0}, 'effect/phenotype': {92446: 0, 93443: 0, 91309: 0}, 'gene/protein': {8527: 0}}
Cop confidences with context: {'disease': {97907: 0.2412937730550766, 33482: 0.13065272569656372, 27699: 0.1493334025144577, 32308: 0.2029285579919815, 37611: 0.26002538204193115, 32413: 0.24931426346302032}, 'anatomy': {6

 56%|█████▌    | 99401/177004 [2:01:36<67:09:18,  3.12s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.34083640575408936
Cop confidence without context: 0.34083640575408936
LLM is correct without context: True
Confidences with context: {'drug': {14843: 0.27375954389572144, 21505: 0.31435003876686096, 21527: 0.2661544680595398, 21493: 0.27721843123435974, 21492: 0.25554749369621277, 21491: 0.25203561782836914}}
Accuracies with context: {'drug': {14843: 0, 21505: 1, 21527: 0, 21493: 1, 21492: 0, 21491: 1}}
Cop confidences with context: {'drug': {14843: 0.27375954389572144, 21505: 0.31435003876686096, 21527: 0.23305265605449677, 21493: 0.27721843123435974, 21492: 0.25355878472328186, 21491: 0.25203561782836914}}


 56%|█████▌    | 99472/177004 [2:05:11<60:53:20,  2.83s/it]

Wrong response format. Node 14054 ignored


 56%|█████▌    | 99501/177004 [2:06:43<72:12:25,  3.35s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.9196391701698303
Cop confidence without context: 0.027770696207880974
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {23509: 0.8561068177223206, 89724: 0.7101741433143616, 88592: 0.943177342414856, 92479: 0.9290763139724731}, 'disease': {35834: 0.910685658454895, 29483: 0.8372736573219299, 32413: 0.8752868175506592, 33197: 0.8984420895576477, 32926: 0.889141321182251, 35969: 0.9125180840492249, 33079: 0.8560383319854736}}
Accuracies with context: {'effect/phenotype': {23509: 0, 89724: 0, 88592: 0, 92479: 0}, 'disease': {35834: 0, 29483: 0, 32413: 0, 33197: 0, 32926: 0, 35969: 0, 33079: 0}}
Cop confidences with context: {'effect/phenotype': {23509: 0.05559093877673149, 89724: 0.08958587795495987, 88592: 0.013559220358729362, 92479: 0.01444182451814413}, 'disease': {35834: 0.03165268152952194, 29483: 0.06506998091936111, 32413: 0.042902350425720215, 33197: 0.0370831340551376

 56%|█████▌    | 99550/177004 [2:09:26<53:05:18,  2.47s/it]

Wrong response format. Question 99549 ignored


 56%|█████▋    | 99601/177004 [2:12:15<65:57:28,  3.07s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.313800573348999
Cop confidence without context: 0.250183641910553
LLM is correct without context: False
Confidences with context: {'drug': {15618: 0.2986844480037689, 16892: 0.2685779929161072, 15571: 0.2996584177017212, 19675: 0.27871865034103394, 17283: 0.31233954429626465}, 'disease': {39537: 0.27690792083740234}, 'effect/phenotype': {26696: 0.38843727111816406}, 'anatomy': {71288: 0.2845079302787781}}
Accuracies with context: {'drug': {15618: 0, 16892: 0, 15571: 0, 19675: 0, 17283: 0}, 'disease': {39537: 0}, 'effect/phenotype': {26696: 0}, 'anatomy': {71288: 0}}
Cop confidences with context: {'drug': {15618: 0.21015076339244843, 16892: 0.254284530878067, 15571: 0.21415622532367706, 19675: 0.23287814855575562, 17283: 0.22148188948631287}, 'disease': {39537: 0.23136524856090546}, 'effect/phenotype': {26696: 0.155721515417099}, 'anatomy': {71288: 0.23040154576301575}}


 56%|█████▋    | 99666/177004 [2:16:18<71:13:11,  3.32s/it]

Wrong response format. Node 98534 ignored


 56%|█████▋    | 99692/177004 [2:17:43<49:52:34,  2.32s/it]

Wrong response format. Question 99691 ignored


 56%|█████▋    | 99698/177004 [2:17:59<47:58:33,  2.23s/it]

Wrong response format. Question 99697 ignored


 56%|█████▋    | 99701/177004 [2:18:10<62:00:04,  2.89s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2956141531467438
Cop confidence without context: 0.1710883527994156
LLM is correct without context: False
Confidences with context: {'disease': {32274: 0.34873446822166443}, 'drug': {17227: 0.3280964493751526}, 'gene/protein': {34885: 0.33944594860076904, 5981: 0.356034517288208}}
Accuracies with context: {'disease': {32274: 0}, 'drug': {17227: 0}, 'gene/protein': {34885: 0, 5981: 0}}
Cop confidences with context: {'disease': {32274: 0.16217659413814545}, 'drug': {17227: 0.18694370985031128}, 'gene/protein': {34885: 0.18027837574481964, 5981: 0.17487826943397522}}


 56%|█████▋    | 99801/177004 [2:23:56<64:46:49,  3.02s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5980156064033508
Cop confidence without context: 0.5980156064033508
LLM is correct without context: True
Confidences with context: {'disease': {31322: 0.6184913516044617, 94788: 0.5949421525001526, 27836: 0.5787129402160645, 33062: 0.5678521394729614}, 'effect/phenotype': {84675: 0.5673777461051941}, 'drug': {39896: 0.5907883048057556}, 'gene/protein': {5407: 0.5894660949707031}}
Accuracies with context: {'disease': {31322: 1, 94788: 1, 27836: 1, 33062: 1}, 'effect/phenotype': {84675: 1}, 'drug': {39896: 1}, 'gene/protein': {5407: 1}}
Cop confidences with context: {'disease': {31322: 0.6184913516044617, 94788: 0.5949421525001526, 27836: 0.5787129402160645, 33062: 0.5678521394729614}, 'effect/phenotype': {84675: 0.5673777461051941}, 'drug': {39896: 0.5907883048057556}, 'gene/protein': {5407: 0.5894660949707031}}


 56%|█████▋    | 99862/177004 [2:27:38<98:53:44,  4.62s/it]

Wrong response format. Node 94775 ignored


 56%|█████▋    | 99901/177004 [2:29:44<86:31:28,  4.04s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2898671329021454
Cop confidence without context: 0.17998261749744415
LLM is correct without context: False
Confidences with context: {'disease': {99643: 0.34777915477752686, 39255: 0.32299643754959106, 38462: 0.31704646348953247, 98677: 0.42553848028182983, 98528: 0.3407461643218994, 98389: 0.478580117225647, 98096: 0.45636358857154846, 99803: 0.40464186668395996, 98527: 0.3553135395050049, 98630: 0.3752109706401825, 98468: 0.3950955271720886, 99435: 0.4810854494571686, 98108: 0.3236457407474518}, 'drug': {14743: 0.47277405858039856, 16430: 0.5232499837875366}}
Accuracies with context: {'disease': {99643: 0, 39255: 0, 38462: 0, 98677: 0, 98528: 0, 98389: 0, 98096: 0, 99803: 0, 98527: 0, 98630: 0, 98468: 0, 99435: 0, 98108: 0}, 'drug': {14743: 0, 16430: 0}}
Cop confidences with context: {'disease': {99643: 0.16949403285980225, 39255: 0.16114890575408936, 38462: 0.20955492556095123, 98677: 0.11909457296133041, 9

 56%|█████▋    | 99908/177004 [2:29:55<37:13:29,  1.74s/it]

Wrong response format. Node 91661 ignored


 56%|█████▋    | 100001/177004 [2:34:58<72:34:31,  3.39s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3614438474178314
Cop confidence without context: 0.3614438474178314
LLM is correct without context: True
Confidences with context: {'drug': {16061: 0.2688911259174347, 21393: 0.33362990617752075, 21402: 0.2836124002933502, 17195: 0.3066372275352478, 21418: 0.3302614092826843, 21398: 0.28083327412605286, 39890: 0.2889859676361084, 14214: 0.3026982545852661, 21034: 0.2741933763027191}, 'effect/phenotype': {24438: 0.2731079161167145, 24100: 0.36999306082725525, 87220: 0.3273000121116638}, 'disease': {100001: 0.27106955647468567, 31633: 0.33070871233940125}}
Accuracies with context: {'drug': {16061: 1, 21393: 0, 21402: 0, 17195: 0, 21418: 0, 21398: 0, 39890: 0, 14214: 0, 21034: 0}, 'effect/phenotype': {24438: 0, 24100: 0, 87220: 0}, 'disease': {100001: 1, 31633: 0}}
Cop confidences with context: {'drug': {16061: 0.2688911259174347, 21393: 0.21709710359573364, 21402: 0.24833932518959045, 17195: 0.22966067492961884,

 57%|█████▋    | 100036/177004 [2:36:54<84:41:16,  3.96s/it]

Wrong response format. Node 91078 ignored
Wrong response format. Node 90737 ignored


 57%|█████▋    | 100101/177004 [2:40:27<67:17:38,  3.15s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.27944469451904297
Cop confidence without context: 0.2176317423582077
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {88424: 0.26140013337135315}}
Accuracies with context: {'effect/phenotype': {88424: 0}}
Cop confidences with context: {'effect/phenotype': {88424: 0.2306847870349884}}


 57%|█████▋    | 100124/177004 [2:41:24<45:15:53,  2.12s/it]

Wrong response format. Question 100123 ignored


 57%|█████▋    | 100201/177004 [2:46:08<69:25:39,  3.25s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.33284804224967957
Cop confidence without context: 0.2099246233701706
LLM is correct without context: False
Confidences with context: {'drug': {14634: 0.3025733232498169, 15440: 0.2926531732082367, 21438: 0.32125458121299744, 21435: 0.32152777910232544, 20303: 0.31741929054260254, 21495: 0.30274972319602966, 15545: 0.28952082991600037, 21436: 0.2923748791217804, 21437: 0.30344581604003906, 21433: 0.3291139602661133, 21034: 0.3033585548400879}, 'disease': {39625: 0.32069435715675354, 95927: 0.3245130181312561, 98906: 0.276690274477005}, 'effect/phenotype': {91408: 0.5467149019241333, 92905: 0.30180633068084717}}
Accuracies with context: {'drug': {14634: 0, 15440: 0, 21438: 0, 21435: 0, 20303: 0, 21495: 0, 15545: 0, 21436: 0, 21437: 0, 21433: 0, 21034: 0}, 'disease': {39625: 0, 95927: 0, 98906: 0}, 'effect/phenotype': {91408: 0, 92905: 0}}
Cop confidences with context: {'drug': {14634: 0.19232729077339172, 15440:

 57%|█████▋    | 100283/177004 [2:51:00<63:01:41,  2.96s/it]

Wrong response format. Question 100282 ignored


 57%|█████▋    | 100301/177004 [2:52:08<77:11:11,  3.62s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.8281107544898987
Cop confidence without context: 0.8281107544898987
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89696: 0.7757469415664673, 92682: 0.4583700895309448, 88259: 0.6316971778869629}, 'drug': {20267: 0.5378519296646118, 14496: 0.4387601613998413, 20327: 0.3925255239009857}}
Accuracies with context: {'effect/phenotype': {89696: 1, 92682: 1, 88259: 1}, 'drug': {20267: 0, 14496: 1, 20327: 1}}
Cop confidences with context: {'effect/phenotype': {89696: 0.7757469415664673, 92682: 0.4583700895309448, 88259: 0.6316971778869629}, 'drug': {20267: 0.2747078835964203, 14496: 0.4387601613998413, 20327: 0.3925255239009857}}


 57%|█████▋    | 100368/177004 [2:56:10<78:41:32,  3.70s/it]

Wrong response format. Node 32956 ignored


 57%|█████▋    | 100400/177004 [2:58:08<88:42:49,  4.17s/it]

Wrong response format. Node 98644 ignored


 57%|█████▋    | 100401/177004 [2:58:10<73:04:50,  3.43s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.42284855246543884
Cop confidence without context: 0.17904534935951233
LLM is correct without context: False
Confidences with context: {'disease': {37565: 0.4283379912376404, 96889: 0.3970149755477905}, 'drug': {17751: 0.40935683250427246}, 'anatomy': {76705: 0.4245462119579315}}
Accuracies with context: {'disease': {37565: 0, 96889: 0}, 'drug': {17751: 0}, 'anatomy': {76705: 0}}
Cop confidences with context: {'disease': {37565: 0.15881262719631195, 96889: 0.1761743277311325}, 'drug': {17751: 0.164108008146286}, 'anatomy': {76705: 0.1410985141992569}}


 57%|█████▋    | 100501/177004 [3:03:50<63:09:34,  2.97s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2520354986190796
Cop confidence without context: 0.20570538938045502
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {23966: 0.30264517664909363, 93305: 0.29492101073265076}, 'disease': {95043: 0.32655540108680725, 98388: 0.28368809819221497, 39804: 0.3536590039730072, 30438: 0.2643163800239563, 84233: 0.32971256971359253}, 'gene/protein': {13830: 0.2943173050880432}, 'drug': {17360: 0.3004114329814911, 21203: 0.32080402970314026}}
Accuracies with context: {'effect/phenotype': {23966: 0, 93305: 0}, 'disease': {95043: 0, 98388: 0, 39804: 0, 30438: 0, 84233: 0}, 'gene/protein': {13830: 0}, 'drug': {17360: 0, 21203: 0}}
Cop confidences with context: {'effect/phenotype': {23966: 0.19087591767311096, 93305: 0.18312062323093414}, 'disease': {95043: 0.20757123827934265, 98388: 0.1995995044708252, 39804: 0.2046821415424347, 30438: 0.18889841437339783, 84233: 0.20313002169132233}, '

 57%|█████▋    | 100601/177004 [3:09:30<94:35:12,  4.46s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.5139114260673523
Cop confidence without context: 0.24851134419441223
LLM is correct without context: False
Confidences with context: {'disease': {37743: 0.38258931040763855, 83800: 0.3096359968185425, 83799: 0.3278568983078003}, 'effect/phenotype': {84414: 0.4072350561618805, 26038: 0.3035700023174286, 93888: 0.39599356055259705}, 'drug': {20911: 0.313135027885437, 21797: 0.3164670169353485, 20476: 0.3161683976650238, 21793: 0.31209859251976013, 21803: 0.3080251216888428, 21788: 0.33033397793769836, 21814: 0.32128551602363586, 16752: 0.3344256579875946, 21791: 0.31129729747772217, 21796: 0.31078869104385376}, 'anatomy': {63383: 0.34638381004333496}}
Accuracies with context: {'disease': {37743: 0, 83800: 1, 83799: 0}, 'effect/phenotype': {84414: 0, 26038: 1, 93888: 0}, 'drug': {20911: 1, 21797: 1, 20476: 1, 21793: 1, 21803: 0, 21788: 0, 21814: 1, 16752: 1, 21791: 1, 21796: 0}, 'anatomy': {63383: 0}}
Cop confide

 57%|█████▋    | 100647/177004 [3:11:59<78:42:25,  3.71s/it]

Wrong response format. Node 84260 ignored
Wrong response format. Node 84028 ignored
Wrong response format. Node 39204 ignored


 57%|█████▋    | 100681/177004 [3:13:48<63:11:37,  2.98s/it]

Wrong response format. Node 24475 ignored


 57%|█████▋    | 100696/177004 [3:14:39<72:44:16,  3.43s/it]

Wrong response format. Node 33707 ignored
Wrong response format. Node 39621 ignored
Wrong response format. Node 96058 ignored
Wrong response format. Node 96054 ignored
Wrong response format. Node 96696 ignored
Wrong response format. Node 95995 ignored
Wrong response format. Node 96115 ignored


 57%|█████▋    | 100701/177004 [3:14:55<68:01:21,  3.21s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3026390075683594
Cop confidence without context: 0.2568463981151581
LLM is correct without context: False
Confidences with context: {'disease': {29015: 0.343082457780838, 31464: 0.37362539768218994, 38295: 0.3485691249370575}, 'anatomy': {73227: 0.34123384952545166}, 'gene/protein': {7461: 0.3636094331741333, 6784: 0.3834288418292999}}
Accuracies with context: {'disease': {29015: 0, 31464: 0, 38295: 0}, 'anatomy': {73227: 0}, 'gene/protein': {7461: 0, 6784: 0}}
Cop confidences with context: {'disease': {29015: 0.23396189510822296, 31464: 0.21455501019954681, 38295: 0.24144677817821503}, 'anatomy': {73227: 0.2438688576221466}, 'gene/protein': {7461: 0.21209152042865753, 6784: 0.20684434473514557}}


 57%|█████▋    | 100801/177004 [3:20:54<69:58:39,  3.31s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.8950505256652832
Cop confidence without context: 0.014022037386894226
LLM is correct without context: False
Confidences with context: {'disease': {36041: 0.8199175596237183, 31017: 0.7739613652229309, 27751: 0.7843484878540039, 28315: 0.7344967722892761, 95399: 0.822486162185669, 96809: 0.8038687705993652, 37524: 0.8019575476646423, 99429: 0.7973750233650208}, 'gene/protein': {8483: 0.7718635201454163}}
Accuracies with context: {'disease': {36041: 0, 31017: 0, 27751: 0, 28315: 0, 95399: 0, 96809: 0, 37524: 0, 99429: 0}, 'gene/protein': {8483: 0}}
Cop confidences with context: {'disease': {36041: 0.032040998339653015, 31017: 0.04795542359352112, 27751: 0.04529927298426628, 28315: 0.0557602122426033, 95399: 0.03342173993587494, 96809: 0.03730485588312149, 37524: 0.03961640223860741, 99429: 0.03788101673126221}, 'gene/protein': {8483: 0.04934358224272728}}


 57%|█████▋    | 100874/177004 [3:25:02<78:41:19,  3.72s/it]

Wrong response format. Node 29078 ignored
Wrong response format. Node 99459 ignored
Wrong response format. Node 92247 ignored


 57%|█████▋    | 100901/177004 [3:26:20<67:10:58,  3.18s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.6378917098045349
Cop confidence without context: 0.6378917098045349
LLM is correct without context: True
Confidences with context: {'drug': {14245: 0.44832277297973633, 17301: 0.7775583863258362, 14692: 0.29797568917274475, 16461: 0.39834409952163696, 21636: 0.3649802505970001, 21513: 0.36201101541519165, 21637: 0.3522837162017822, 18403: 0.37430447340011597, 21635: 0.3361137807369232, 21634: 0.30601009726524353, 21516: 0.33778998255729675, 14676: 0.3326217532157898}, 'effect/phenotype': {92883: 0.3887648582458496}}
Accuracies with context: {'drug': {14245: 1, 17301: 1, 14692: 0, 16461: 1, 21636: 1, 21513: 1, 21637: 1, 18403: 1, 21635: 1, 21634: 1, 21516: 1, 14676: 1}, 'effect/phenotype': {92883: 1}}
Cop confidences with context: {'drug': {14245: 0.44832277297973633, 17301: 0.7775583863258362, 14692: 0.2713099420070648, 16461: 0.39834409952163696, 21636: 0.3649802505970001, 21513: 0.36201101541519165, 21637: 0

 57%|█████▋    | 101001/177004 [3:31:46<43:19:54,  2.05s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.33877992630004883
Cop confidence without context: 0.21873275935649872
LLM is correct without context: False
Confidences with context: {'disease': {83848: 0.3633793294429779, 33179: 0.3660320043563843}, 'gene/protein': {34355: 0.351398766040802}}
Accuracies with context: {'disease': {83848: 0, 33179: 0}, 'gene/protein': {34355: 0}}
Cop confidences with context: {'disease': {83848: 0.1929893046617508, 33179: 0.19900815188884735}, 'gene/protein': {34355: 0.178080216050148}}


 57%|█████▋    | 101101/177004 [3:37:11<53:38:45,  2.54s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.5275042653083801
Cop confidence without context: 0.5275042653083801
LLM is correct without context: True
Confidences with context: {'disease': {27472: 0.4584510028362274, 31337: 0.4183911085128784, 30966: 0.4561232328414917, 30914: 0.444237619638443, 32163: 0.39113932847976685, 31651: 0.4210624694824219, 33566: 0.4185156226158142, 31749: 0.3350542485713959}, 'effect/phenotype': {94285: 0.4270029664039612, 24380: 0.4322117269039154}, 'gene/protein': {4277: 0.47811779379844666, 4051: 0.4549654722213745}}
Accuracies with context: {'disease': {27472: 1, 31337: 1, 30966: 0, 30914: 0, 32163: 0, 31651: 1, 33566: 0, 31749: 0}, 'effect/phenotype': {94285: 0, 24380: 1}, 'gene/protein': {4277: 1, 4051: 1}}
Cop confidences with context: {'disease': {27472: 0.4584510028362274, 31337: 0.4183911085128784, 30966: 0.31348860263824463, 30914: 0.3224819302558899, 32163: 0.3451792299747467, 31651: 0.4210624694824219, 33566: 0.328

 57%|█████▋    | 101132/177004 [3:39:04<66:33:01,  3.16s/it]

Wrong response format. Question 101131 ignored


 57%|█████▋    | 101159/177004 [3:40:31<36:13:35,  1.72s/it]

Wrong response format. Question 101158 ignored


 57%|█████▋    | 101196/177004 [3:42:36<66:32:10,  3.16s/it]

Wrong response format. Node 14312 ignored
Wrong response format. Node 21402 ignored


 57%|█████▋    | 101201/177004 [3:42:56<75:45:30,  3.60s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.360710084438324
Cop confidence without context: 0.13064053654670715
LLM is correct without context: False
Confidences with context: {'disease': {98194: 0.33653926849365234, 84072: 0.379770427942276}, 'anatomy': {72085: 0.3451501727104187}, 'gene/protein': {13631: 0.33622512221336365}, 'drug': {15547: 0.3675309419631958, 15976: 0.3149970471858978, 21203: 0.3333906829357147}, 'effect/phenotype': {84702: 0.33230292797088623}}
Accuracies with context: {'disease': {98194: 0, 84072: 1}, 'anatomy': {72085: 0}, 'gene/protein': {13631: 0}, 'drug': {15547: 0, 15976: 0, 21203: 0}, 'effect/phenotype': {84702: 0}}
Cop confidences with context: {'disease': {98194: 0.16659902036190033, 84072: 0.379770427942276}, 'anatomy': {72085: 0.10123222321271896}, 'gene/protein': {13631: 0.12662354111671448}, 'drug': {15547: 0.16565918922424316, 15976: 0.1753222644329071, 21203: 0.1624816507101059}, 'effect/phenotype': {84702: 0.1606912

 57%|█████▋    | 101301/177004 [3:48:50<73:12:27,  3.48s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3012303411960602
Cop confidence without context: 0.3012303411960602
LLM is correct without context: True
Confidences with context: {'disease': {38911: 0.32347068190574646, 30438: 0.30150073766708374, 33623: 0.30930525064468384, 38910: 0.31375500559806824, 94840: 0.29934456944465637}, 'gene/protein': {5056: 0.2733513116836548, 35101: 0.2663024663925171, 34607: 0.2876329720020294, 6784: 0.27315452694892883}}
Accuracies with context: {'disease': {38911: 1, 30438: 1, 33623: 0, 38910: 1, 94840: 1}, 'gene/protein': {5056: 1, 35101: 0, 34607: 0, 6784: 1}}
Cop confidences with context: {'disease': {38911: 0.32347068190574646, 30438: 0.30150073766708374, 33623: 0.231658935546875, 38910: 0.31375500559806824, 94840: 0.29934456944465637}, 'gene/protein': {5056: 0.2733513116836548, 35101: 0.26423007249832153, 34607: 0.2479551136493683, 6784: 0.27315452694892883}}


 57%|█████▋    | 101401/177004 [3:53:56<74:45:01,  3.56s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3038215935230255
Cop confidence without context: 0.19616197049617767
LLM is correct without context: False
Confidences with context: {'disease': {30151: 0.3664940893650055}}
Accuracies with context: {'disease': {30151: 0}}
Cop confidences with context: {'disease': {30151: 0.16779322922229767}}


 57%|█████▋    | 101482/177004 [3:58:55<47:21:53,  2.26s/it]

Wrong response format. Question 101481 ignored


 57%|█████▋    | 101501/177004 [4:00:07<84:45:55,  4.04s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.28808826208114624
Cop confidence without context: 0.21746043860912323
LLM is correct without context: False
Confidences with context: {'disease': {95351: 0.3064709007740021, 99856: 0.30027544498443604, 37564: 0.3005034625530243, 98599: 0.3284122943878174, 35690: 0.3022528886795044, 95607: 0.36136046051979065, 97126: 0.3254927098751068, 99179: 0.3238944113254547, 99428: 0.29049739241600037, 99132: 0.31789982318878174, 96888: 0.3165302574634552, 99152: 0.32703012228012085, 99806: 0.2852632999420166, 99181: 0.3218367397785187, 99080: 0.3295741379261017, 99450: 0.3338276445865631, 99382: 0.31363293528556824}, 'effect/phenotype': {90282: 0.28557732701301575}}
Accuracies with context: {'disease': {95351: 0, 99856: 0, 37564: 0, 98599: 0, 35690: 0, 95607: 0, 97126: 0, 99179: 0, 99428: 0, 99132: 0, 96888: 0, 99152: 0, 99806: 0, 99181: 0, 99080: 0, 99450: 0, 99382: 0}, 'effect/phenotype': {90282: 0}}
Cop confidences wit

 57%|█████▋    | 101601/177004 [4:05:43<48:24:19,  2.31s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2943098545074463
Cop confidence without context: 0.28749215602874756
LLM is correct without context: False
Confidences with context: {'disease': {38436: 0.3638349175453186, 94765: 0.3544068932533264}}
Accuracies with context: {'disease': {38436: 0, 94765: 0}}
Cop confidences with context: {'disease': {38436: 0.2330814003944397, 94765: 0.24741588532924652}}


 57%|█████▋    | 101636/177004 [4:07:49<83:24:01,  3.98s/it]

Wrong response format. Node 97059 ignored


 57%|█████▋    | 101701/177004 [4:11:26<59:41:36,  2.85s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2808969020843506
Cop confidence without context: 0.2808969020843506
LLM is correct without context: True
Confidences with context: {'anatomy': {71734: 0.3009617328643799, 72765: 0.2670062482357025, 64826: 0.2967093288898468, 75942: 0.2963991165161133}, 'effect/phenotype': {92495: 0.2532366216182709}, 'disease': {31525: 0.36184224486351013}}
Accuracies with context: {'anatomy': {71734: 0, 72765: 0, 64826: 0, 75942: 0}, 'effect/phenotype': {92495: 0}, 'disease': {31525: 0}}
Cop confidences with context: {'anatomy': {71734: 0.2475643754005432, 72765: 0.2393428534269333, 64826: 0.21538788080215454, 75942: 0.24003131687641144}, 'effect/phenotype': {92495: 0.2473703920841217}, 'disease': {31525: 0.24103890359401703}}


 58%|█████▊    | 101801/177004 [4:17:05<70:28:18,  3.37s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.32163316011428833
Cop confidence without context: 0.21933476626873016
LLM is correct without context: False
Confidences with context: {'disease': {32670: 0.24768756330013275, 30119: 0.2869952321052551, 36090: 0.9200885891914368, 35747: 0.551183819770813, 36044: 0.25921934843063354, 36104: 0.9572827816009521, 38472: 0.9284948110580444, 38549: 0.2703665792942047, 95245: 0.36977797746658325, 95457: 0.7058730721473694, 95775: 0.28107962012290955}, 'gene/protein': {2249: 0.25414904952049255, 1488: 0.25785133242607117}}
Accuracies with context: {'disease': {32670: 0, 30119: 0, 36090: 0, 35747: 0, 36044: 0, 36104: 0, 38472: 0, 38549: 0, 95245: 0, 95457: 0, 95775: 0}, 'gene/protein': {2249: 1, 1488: 0}}
Cop confidences with context: {'disease': {32670: 0.23450587689876556, 30119: 0.24740536510944366, 36090: 0.01487184688448906, 35747: 0.2771526277065277, 36044: 0.2435140311717987, 36104: 0.011232219636440277, 38472: 0

 58%|█████▊    | 101809/177004 [4:17:32<71:12:00,  3.41s/it]

Wrong response format. Node 31918 ignored


 58%|█████▊    | 101813/177004 [4:17:48<82:32:28,  3.95s/it]

Wrong response format. Node 91661 ignored
Wrong response format. Node 21505 ignored


 58%|█████▊    | 101901/177004 [4:22:33<80:23:05,  3.85s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5076559782028198
Cop confidence without context: 0.09173156321048737
LLM is correct without context: False
Confidences with context: {'drug': {18755: 0.5065052509307861}, 'effect/phenotype': {25746: 0.44564488530158997, 90045: 0.46316391229629517, 92653: 0.43242084980010986, 92656: 0.4421461820602417, 89995: 0.5051286816596985, 92651: 0.45362943410873413, 90737: 0.4278934597969055}, 'disease': {35506: 0.43407028913497925, 83781: 0.38277488946914673, 37731: 0.44781211018562317, 33489: 0.45912858843803406, 39431: 0.4984217882156372}, 'gene/protein': {34150: 0.4780954420566559, 6414: 0.4888048470020294}}
Accuracies with context: {'drug': {18755: 0}, 'effect/phenotype': {25746: 0, 90045: 0, 92653: 0, 92656: 0, 89995: 0, 92651: 0, 90737: 0}, 'disease': {35506: 0, 83781: 0, 37731: 0, 33489: 0, 39431: 0}, 'gene/protein': {34150: 0, 6414: 0}}
Cop confidences with context: {'drug': {18755: 0.1311015486717224}, 'effect/

 58%|█████▊    | 102001/177004 [4:27:33<57:30:51,  2.76s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3095690608024597
Cop confidence without context: 0.16060230135917664
LLM is correct without context: False
Confidences with context: {'disease': {95512: 0.33615612983703613}, 'anatomy': {64227: 0.3057961165904999, 63158: 0.584831953048706, 70269: 0.40807780623435974}}
Accuracies with context: {'disease': {95512: 0}, 'anatomy': {64227: 0, 63158: 0, 70269: 0}}
Cop confidences with context: {'disease': {95512: 0.15511056780815125}, 'anatomy': {64227: 0.17288143932819366, 63158: 0.07611627131700516, 70269: 0.15732800960540771}}


 58%|█████▊    | 102004/177004 [4:27:45<74:40:55,  3.58s/it]

Wrong response format. Node 33528 ignored
Wrong response format. Node 28726 ignored


 58%|█████▊    | 102013/177004 [4:28:07<39:47:58,  1.91s/it]

Wrong response format. Question 102012 ignored


 58%|█████▊    | 102101/177004 [4:33:28<66:20:44,  3.19s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.3311091661453247
Cop confidence without context: 0.15640491247177124
LLM is correct without context: False
Confidences with context: {'anatomy': {72319: 0.42859455943107605}}
Accuracies with context: {'anatomy': {72319: 0}}
Cop confidences with context: {'anatomy': {72319: 0.11994986981153488}}


 58%|█████▊    | 102104/177004 [4:33:32<41:27:56,  1.99s/it]

Wrong response format. Question 102103 ignored


 58%|█████▊    | 102193/177004 [4:38:02<56:02:19,  2.70s/it]

Wrong response format. Node 91661 ignored


 58%|█████▊    | 102200/177004 [4:38:21<64:19:16,  3.10s/it]

Wrong response format. Node 91661 ignored


 58%|█████▊    | 102201/177004 [4:38:26<74:25:13,  3.58s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.4077260196208954
Cop confidence without context: 0.1699654906988144
LLM is correct without context: False
Confidences with context: {'drug': {14274: 0.3491840660572052, 15003: 0.413936972618103, 16714: 0.3152683973312378, 16756: 0.2619010806083679, 21614: 0.3275763988494873, 21658: 0.33177122473716736, 15512: 0.33130931854248047, 15514: 0.30877697467803955, 21359: 0.30662164092063904, 14870: 0.3122122585773468, 18589: 0.3393361270427704, 16420: 0.31123360991477966, 21656: 0.30241790413856506, 21660: 0.3529377281665802}, 'disease': {27331: 0.9070391058921814}, 'effect/phenotype': {26677: 0.28208985924720764}}
Accuracies with context: {'drug': {14274: 0, 15003: 0, 16714: 0, 16756: 0, 21614: 0, 21658: 0, 15512: 0, 15514: 0, 21359: 0, 14870: 0, 18589: 0, 16420: 0, 21656: 0, 21660: 0}, 'disease': {27331: 0}, 'effect/phenotype': {26677: 0}}
Cop confidences with context: {'drug': {14274: 0.20527471601963043, 15003: 0

 58%|█████▊    | 102224/177004 [4:39:21<34:39:23,  1.67s/it]

Wrong response format. Question 102223 ignored


 58%|█████▊    | 102301/177004 [4:43:38<95:16:02,  4.59s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.30976220965385437
Cop confidence without context: 0.30976220965385437
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {94192: 0.434179425239563, 91380: 0.43200013041496277, 85015: 0.42555204033851624, 88379: 0.4002951383590698}, 'disease': {37721: 0.4448333978652954, 29507: 0.5569424033164978, 98994: 0.509080708026886, 96807: 0.4011388421058655, 32909: 0.3275402784347534, 32319: 0.5507426857948303, 39169: 0.4918162524700165, 29439: 0.5157107710838318, 33524: 0.593208909034729, 32359: 0.47346630692481995, 32439: 0.5450893044471741}, 'anatomy': {68240: 0.43971046805381775, 68241: 0.4434381425380707, 68242: 0.4286062717437744}, 'drug': {39897: 0.39861711859703064, 17105: 0.40146881341934204}}
Accuracies with context: {'effect/phenotype': {94192: 1, 91380: 1, 85015: 1, 88379: 1}, 'disease': {37721: 1, 29507: 1, 98994: 1, 96807: 1, 32909: 0, 32319: 1, 39169: 1, 29439: 1, 33524: 1

 58%|█████▊    | 102401/177004 [4:49:24<64:03:24,  3.09s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5204490423202515
Cop confidence without context: 0.05275346711277962
LLM is correct without context: False
Confidences with context: {'disease': {33594: 0.4916960597038269, 39855: 0.4991956055164337, 95168: 0.4901195168495178}, 'drug': {19350: 0.43068867921829224, 21013: 0.39972493052482605, 16282: 0.431527316570282, 21400: 0.4298148453235626, 17172: 0.4136343002319336, 21408: 0.4470827281475067, 17207: 0.435621052980423, 21402: 0.4295594096183777}, 'anatomy': {74307: 0.4604189693927765}, 'effect/phenotype': {94522: 0.4487520456314087, 94521: 0.46196219325065613}}
Accuracies with context: {'disease': {33594: 0, 39855: 0, 95168: 0}, 'drug': {19350: 0, 21013: 0, 16282: 0, 21400: 0, 17172: 0, 21408: 0, 17207: 0, 21402: 0}, 'anatomy': {74307: 0}, 'effect/phenotype': {94522: 0, 94521: 0}}
Cop confidences with context: {'disease': {33594: 0.07365730404853821, 39855: 0.07960370182991028, 95168: 0.09137406945228577}, 

 58%|█████▊    | 102412/177004 [4:49:55<43:05:37,  2.08s/it]

Wrong response format. Question 102411 ignored


 58%|█████▊    | 102501/177004 [4:54:52<91:08:45,  4.40s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.6003876328468323
Cop confidence without context: 0.01885235868394375
LLM is correct without context: False
Confidences with context: {'disease': {30438: 0.3862204849720001, 38024: 0.4530642032623291, 84218: 0.46059462428092957, 30128: 0.46380215883255005}, 'gene/protein': {6155: 0.3747916519641876, 2283: 0.39912641048431396, 2197: 0.33366072177886963, 5996: 0.3498745560646057, 4840: 0.33689719438552856, 4477: 0.3347387909889221, 6408: 0.34947291016578674, 34989: 0.3245966136455536}, 'drug': {16496: 0.3278481364250183}}
Accuracies with context: {'disease': {30438: 0, 38024: 0, 84218: 0, 30128: 0}, 'gene/protein': {6155: 0, 2283: 0, 2197: 0, 5996: 0, 4840: 0, 4477: 0, 6408: 0, 34989: 0}, 'drug': {16496: 0}}
Cop confidences with context: {'disease': {30438: 0.10979291051626205, 38024: 0.06734198331832886, 84218: 0.06583856791257858, 30128: 0.052856624126434326}, 'gene/protein': {6155: 0.10087381303310394, 2283: 0

 58%|█████▊    | 102601/177004 [5:00:44<72:50:57,  3.52s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3169747292995453
Cop confidence without context: 0.3169747292995453
LLM is correct without context: True
Confidences with context: {'drug': {14510: 0.302375465631485, 16831: 0.2978764772415161, 16047: 0.3193804621696472, 17269: 0.3345737159252167, 14216: 0.330556184053421, 16521: 0.36206290125846863, 21590: 0.3526791036128998, 21574: 0.3555994927883148, 21601: 0.3541933298110962, 21613: 0.3531988561153412, 21605: 0.35925430059432983, 21455: 0.3609011769294739, 21608: 0.35572585463523865, 19702: 0.31731677055358887}, 'anatomy': {70918: 0.3283936679363251}, 'disease': {84064: 0.3424288034439087}}
Accuracies with context: {'drug': {14510: 0, 16831: 0, 16047: 1, 17269: 1, 14216: 1, 16521: 1, 21590: 1, 21574: 1, 21601: 1, 21613: 1, 21605: 1, 21455: 1, 21608: 1, 19702: 1}, 'anatomy': {70918: 1}, 'disease': {84064: 1}}
Cop confidences with context: {'drug': {14510: 0.302375465631485, 16831: 0.28646501898765564, 16047

 58%|█████▊    | 102629/177004 [5:02:21<72:14:10,  3.50s/it]

Wrong response format. Node 91661 ignored


 58%|█████▊    | 102682/177004 [5:05:16<93:33:45,  4.53s/it]

Wrong response format. Node 84257 ignored


 58%|█████▊    | 102701/177004 [5:06:20<63:26:15,  3.07s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.30658459663391113
Cop confidence without context: 0.30658459663391113
LLM is correct without context: True
Confidences with context: {'disease': {96876: 0.37632057070732117, 32028: 0.3574351370334625}}
Accuracies with context: {'disease': {96876: 0, 32028: 0}}
Cop confidences with context: {'disease': {96876: 0.21442098915576935, 32028: 0.23441168665885925}}


 58%|█████▊    | 102718/177004 [5:07:14<58:44:55,  2.85s/it]

Wrong response format. Node 37824 ignored


 58%|█████▊    | 102801/177004 [5:12:17<60:08:11,  2.92s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4342154860496521
Cop confidence without context: 0.26751241087913513
LLM is correct without context: False
Confidences with context: {'gene/protein': {11221: 0.3168233335018158}, 'effect/phenotype': {90726: 0.33545389771461487}, 'anatomy': {68935: 0.38625940680503845}, 'drug': {20556: 0.41066649556159973}, 'disease': {30146: 0.35756561160087585}}
Accuracies with context: {'gene/protein': {11221: 0}, 'effect/phenotype': {90726: 0}, 'anatomy': {68935: 0}, 'drug': {20556: 0}, 'disease': {30146: 0}}
Cop confidences with context: {'gene/protein': {11221: 0.22291305661201477}, 'effect/phenotype': {90726: 0.18096379935741425}, 'anatomy': {68935: 0.17546625435352325}, 'drug': {20556: 0.1354234367609024}, 'disease': {30146: 0.19901522994041443}}


 58%|█████▊    | 102897/177004 [5:17:47<37:31:00,  1.82s/it]

Wrong response format. Question 102896 ignored


 58%|█████▊    | 102901/177004 [5:18:06<78:44:16,  3.83s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.32585301995277405
Cop confidence without context: 0.32585301995277405
LLM is correct without context: True
Confidences with context: {'anatomy': {73231: 0.3535069525241852}, 'disease': {33577: 0.31470292806625366, 33575: 0.4034998118877411, 33593: 0.5254505276679993, 32642: 0.3829957842826843, 33086: 0.30376729369163513, 38899: 0.37340089678764343, 97941: 0.4369766414165497, 84167: 0.2976202666759491, 84173: 0.35126015543937683, 97093: 0.3940137028694153, 83916: 0.6145728230476379, 32983: 0.3976328670978546}, 'drug': {20896: 0.41457927227020264, 17548: 0.6429427862167358}}
Accuracies with context: {'anatomy': {73231: 1}, 'disease': {33577: 0, 33575: 0, 33593: 0, 32642: 0, 33086: 0, 38899: 0, 97941: 1, 84167: 0, 84173: 0, 97093: 0, 83916: 0, 32983: 0}, 'drug': {20896: 1, 17548: 0}}
Cop confidences with context: {'anatomy': {73231: 0.3535069525241852}, 'disease': {33577: 0.22667202353477478, 33575: 0.16820374131

 58%|█████▊    | 102978/177004 [5:22:06<69:30:55,  3.38s/it]

Wrong response format. Node 91661 ignored


 58%|█████▊    | 103001/177004 [5:23:30<68:42:31,  3.34s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.502413272857666
Cop confidence without context: 0.18773813545703888
LLM is correct without context: False
Confidences with context: {'disease': {96877: 0.3456266224384308, 37574: 0.814503014087677, 33699: 0.7604240775108337, 30499: 0.8545629382133484, 99610: 0.6649618148803711, 84014: 0.7480381727218628, 96898: 0.6195072531700134, 96875: 0.7138895988464355}, 'drug': {20896: 0.30652275681495667}, 'effect/phenotype': {94523: 0.4504871666431427, 94509: 0.7553608417510986}}
Accuracies with context: {'disease': {96877: 1, 37574: 0, 33699: 1, 30499: 0, 99610: 0, 84014: 0, 96898: 0, 96875: 1}, 'drug': {20896: 0}, 'effect/phenotype': {94523: 1, 94509: 1}}
Cop confidences with context: {'disease': {96877: 0.3456266224384308, 37574: 0.07342967391014099, 33699: 0.7604240775108337, 30499: 0.0234962347894907, 99610: 0.09883775562047958, 84014: 0.1111859530210495, 96898: 0.14946308732032776, 96875: 0.7138895988464355}, 'dru

 58%|█████▊    | 103018/177004 [5:24:22<52:44:03,  2.57s/it]

Wrong response format. Node 91998 ignored


 58%|█████▊    | 103074/177004 [5:27:33<60:08:48,  2.93s/it]

Wrong response format. Question 103073 ignored


 58%|█████▊    | 103101/177004 [5:29:05<81:52:00,  3.99s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.30372336506843567
Cop confidence without context: 0.24789179861545563
LLM is correct without context: False
Confidences with context: {'disease': {32649: 0.27644893527030945, 30825: 0.2825160324573517, 33185: 0.2984519898891449, 28327: 0.28601351380348206, 33145: 0.27635863423347473, 27683: 0.2893041968345642}, 'effect/phenotype': {85298: 0.28297853469848633, 94035: 0.29009249806404114, 90062: 0.3064683675765991, 88997: 0.27954643964767456}}
Accuracies with context: {'disease': {32649: 1, 30825: 0, 33185: 1, 28327: 1, 33145: 0, 27683: 1}, 'effect/phenotype': {85298: 0, 94035: 0, 90062: 0, 88997: 1}}
Cop confidences with context: {'disease': {32649: 0.27644893527030945, 30825: 0.25523197650909424, 33185: 0.2984519898891449, 28327: 0.28601351380348206, 33145: 0.2699567973613739, 27683: 0.2893041968345642}, 'effect/phenotype': {85298: 0.24972766637802124, 94035: 0.20410557091236115, 90062: 0.21562743186950684, 88

 58%|█████▊    | 103117/177004 [5:29:59<79:32:18,  3.88s/it]

Wrong response format. Node 14027 ignored


 58%|█████▊    | 103130/177004 [5:30:41<59:33:58,  2.90s/it]

Wrong response format. Node 91641 ignored
Wrong response format. Node 91658 ignored


 58%|█████▊    | 103201/177004 [5:34:31<71:59:14,  3.51s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.27307820320129395
Cop confidence without context: 0.26467645168304443
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {85877: 0.3208470344543457}, 'anatomy': {64850: 0.3412227928638458, 73022: 0.3336801528930664}, 'disease': {33145: 0.36668458580970764}}
Accuracies with context: {'effect/phenotype': {85877: 0}, 'anatomy': {64850: 1, 73022: 0}, 'disease': {33145: 0}}
Cop confidences with context: {'effect/phenotype': {85877: 0.22929903864860535}, 'anatomy': {64850: 0.3412227928638458, 73022: 0.19161638617515564}, 'disease': {33145: 0.18294572830200195}}


 58%|█████▊    | 103301/177004 [5:39:55<62:36:20,  3.06s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.328826367855072
Cop confidence without context: 0.23135824501514435
LLM is correct without context: False
Confidences with context: {'disease': {98020: 0.33734485507011414, 94989: 0.46183574199676514}, 'gene/protein': {2422: 0.33567380905151367}}
Accuracies with context: {'disease': {98020: 0, 94989: 0}, 'gene/protein': {2422: 0}}
Cop confidences with context: {'disease': {98020: 0.2229713350534439, 94989: 0.10467227548360825}, 'gene/protein': {2422: 0.2084246277809143}}


 58%|█████▊    | 103304/177004 [5:40:01<47:23:01,  2.31s/it]

Wrong response format. Question 103303 ignored


 58%|█████▊    | 103355/177004 [5:42:55<42:45:27,  2.09s/it]

Wrong response format. Question 103354 ignored


 58%|█████▊    | 103374/177004 [5:43:55<52:53:31,  2.59s/it]

Wrong response format. Question 103373 ignored


 58%|█████▊    | 103401/177004 [5:45:20<75:51:25,  3.71s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.34821492433547974
Cop confidence without context: 0.1385103166103363
LLM is correct without context: False
Confidences with context: {'anatomy': {74209: 0.39162835478782654, 74051: 0.34629586338996887}, 'effect/phenotype': {89741: 0.30843454599380493}, 'drug': {20997: 0.4032047986984253, 19601: 0.3708488643169403, 21203: 0.344835489988327, 17196: 0.3427891135215759, 17207: 0.3589860796928406, 14870: 0.35227900743484497, 21402: 0.38481950759887695, 14214: 0.36401233077049255, 17172: 0.38676413893699646, 17195: 0.3226451873779297, 21393: 0.35152068734169006, 14863: 0.3500307500362396}}
Accuracies with context: {'anatomy': {74209: 0, 74051: 0}, 'effect/phenotype': {89741: 0}, 'drug': {20997: 0, 19601: 0, 21203: 0, 17196: 0, 17207: 0, 14870: 0, 21402: 0, 14214: 0, 17172: 0, 17195: 0, 21393: 0, 14863: 0}}
Cop confidences with context: {'anatomy': {74209: 0.1311790645122528, 74051: 0.14778093993663788}, 'effect/phen

 58%|█████▊    | 103491/177004 [5:50:21<78:26:44,  3.84s/it]

Wrong response format. Node 84039 ignored
Wrong response format. Node 96015 ignored


 58%|█████▊    | 103501/177004 [5:50:47<60:10:26,  2.95s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.32259494066238403
Cop confidence without context: 0.2824735939502716
LLM is correct without context: False
Confidences with context: {'drug': {16589: 0.2971310615539551}, 'effect/phenotype': {24593: 0.29056817293167114, 91234: 0.31080248951911926}, 'disease': {98013: 0.28516843914985657, 83914: 0.35023510456085205, 37788: 0.29854699969291687, 30283: 0.26763850450515747}, 'gene/protein': {10164: 0.32322436571121216}}
Accuracies with context: {'drug': {16589: 0}, 'effect/phenotype': {24593: 0, 91234: 0}, 'disease': {98013: 0, 83914: 0, 37788: 0, 30283: 0}, 'gene/protein': {10164: 0}}
Cop confidences with context: {'drug': {16589: 0.22781828045845032}, 'effect/phenotype': {24593: 0.24088948965072632, 91234: 0.23096878826618195}, 'disease': {98013: 0.24201931059360504, 83914: 0.21917153894901276, 37788: 0.24750417470932007, 30283: 0.24946652352809906}, 'gene/protein': {10164: 0.2256469577550888}}


 59%|█████▊    | 103601/177004 [5:56:02<74:24:53,  3.65s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5429072976112366
Cop confidence without context: 0.206064373254776
LLM is correct without context: False
Confidences with context: {'disease': {38640: 0.5242248773574829, 29275: 0.47385871410369873, 97009: 0.46698158979415894, 36885: 0.5152090787887573, 98187: 0.5239753723144531, 37168: 0.46412235498428345}, 'anatomy': {74604: 0.4627740979194641}, 'effect/phenotype': {89448: 0.5366823077201843, 89557: 0.408758282661438}, 'gene/protein': {10025: 0.4815835654735565, 9680: 0.46713897585868835}}
Accuracies with context: {'disease': {38640: 0, 29275: 0, 97009: 0, 36885: 0, 98187: 0, 37168: 0}, 'anatomy': {74604: 0}, 'effect/phenotype': {89448: 0, 89557: 0}, 'gene/protein': {10025: 0, 9680: 0}}
Cop confidences with context: {'disease': {38640: 0.21015773713588715, 29275: 0.19908298552036285, 97009: 0.1901574432849884, 36885: 0.20654335618019104, 98187: 0.19275978207588196, 37168: 0.17893511056900024}, 'anatomy': {74

 59%|█████▊    | 103670/177004 [5:59:54<49:40:29,  2.44s/it]

Wrong response format. Node 91641 ignored
Wrong response format. Node 91658 ignored


 59%|█████▊    | 103701/177004 [6:01:38<51:18:54,  2.52s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5425097942352295
Cop confidence without context: 0.5425097942352295
LLM is correct without context: True
Confidences with context: {'disease': {31271: 0.5158252716064453, 36273: 0.4961313009262085}, 'effect/phenotype': {88483: 0.49009549617767334, 93486: 0.44137999415397644}}
Accuracies with context: {'disease': {31271: 1, 36273: 1}, 'effect/phenotype': {88483: 1, 93486: 1}}
Cop confidences with context: {'disease': {31271: 0.5158252716064453, 36273: 0.4961313009262085}, 'effect/phenotype': {88483: 0.49009549617767334, 93486: 0.44137999415397644}}


 59%|█████▊    | 103801/177004 [6:07:28<68:46:02,  3.38s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33735519647598267
Cop confidence without context: 0.26273247599601746
LLM is correct without context: False
Confidences with context: {'disease': {36414: 0.2821454107761383, 98005: 0.29868757724761963, 35813: 0.3050719201564789, 39466: 0.26319652795791626, 39776: 0.3070533275604248, 83932: 0.2769537568092346, 95277: 0.326811283826828, 99063: 0.2926400303840637, 99731: 0.26938575506210327, 95744: 0.2715294063091278, 96156: 0.2859286963939667, 99483: 0.29899051785469055}, 'effect/phenotype': {26254: 0.2919449508190155, 27140: 0.30083173513412476, 91305: 0.27465513348579407}, 'drug': {21429: 0.30842551589012146, 14253: 0.31791040301322937}}
Accuracies with context: {'disease': {36414: 0, 98005: 0, 35813: 0, 39466: 0, 39776: 0, 83932: 0, 95277: 0, 99063: 0, 99731: 1, 95744: 0, 96156: 0, 99483: 0}, 'effect/phenotype': {26254: 0, 27140: 0, 91305: 0}, 'drug': {21429: 0, 14253: 0}}
Cop confidences with context: {'dise

 59%|█████▊    | 103840/177004 [6:09:46<83:32:56,  4.11s/it]

Wrong response format. Node 87341 ignored


 59%|█████▊    | 103901/177004 [6:13:19<70:02:37,  3.45s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2741223871707916
Cop confidence without context: 0.2272554337978363
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {93101: 0.27127447724342346, 33765: 0.2915830910205841}, 'anatomy': {63273: 0.24861982464790344, 66961: 0.25344300270080566, 66963: 0.24402806162834167, 66964: 0.2582000195980072, 66962: 0.2567884922027588}, 'disease': {97247: 0.3143768012523651, 37785: 0.4489220082759857, 30495: 0.3385653793811798, 33104: 0.6996450424194336, 38695: 0.29747432470321655, 97086: 0.3136453926563263, 97087: 0.2508409917354584, 84035: 0.4909629821777344}, 'gene/protein': {58585: 0.2713945209980011}}
Accuracies with context: {'effect/phenotype': {93101: 0, 33765: 0}, 'anatomy': {63273: 0, 66961: 0, 66963: 0, 66964: 0, 66962: 0}, 'disease': {97247: 0, 37785: 0, 30495: 0, 33104: 0, 38695: 0, 97086: 0, 97087: 1, 84035: 0}, 'gene/protein': {58585: 0}}
Cop confidences with context: {'eff

 59%|█████▊    | 103936/177004 [6:15:30<78:42:36,  3.88s/it]

Wrong response format. Node 84039 ignored
Wrong response format. Node 96015 ignored


 59%|█████▉    | 104001/177004 [6:18:56<82:29:38,  4.07s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.319150447845459
Cop confidence without context: 0.21260005235671997
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {23161: 0.28935253620147705, 22484: 0.8018916845321655}, 'disease': {33575: 0.6349006295204163, 83917: 0.5247179269790649, 30438: 0.4237922728061676, 30512: 0.3233621418476105, 33515: 0.341881662607193, 97941: 0.423430860042572, 33086: 0.29876473546028137, 30439: 0.5143753886222839, 32983: 0.502169668674469, 84167: 0.5076598525047302, 84173: 0.2758241295814514, 30676: 0.45414507389068604, 33489: 0.5583720207214355}}
Accuracies with context: {'effect/phenotype': {23161: 0, 22484: 1}, 'disease': {33575: 0, 83917: 1, 30438: 1, 30512: 1, 33515: 1, 97941: 1, 33086: 0, 30439: 1, 32983: 0, 84167: 0, 84173: 0, 30676: 1, 33489: 0}}
Cop confidences with context: {'effect/phenotype': {23161: 0.28045010566711426, 22484: 0.8018916845321655}, 'disease': {33575: 0.1052768379

 59%|█████▉    | 104101/177004 [6:24:18<61:54:41,  3.06s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3672519624233246
Cop confidence without context: 0.288259357213974
LLM is correct without context: False
Confidences with context: {'disease': {31974: 0.3451904058456421, 31160: 0.31295904517173767, 38636: 0.381790429353714}, 'drug': {20810: 0.44360584020614624, 21034: 0.39771586656570435, 21035: 0.40754234790802}, 'effect/phenotype': {24909: 0.38459157943725586, 87940: 0.3853982388973236, 87931: 0.38251933455467224}, 'gene/protein': {56574: 0.4015371799468994}}
Accuracies with context: {'disease': {31974: 0, 31160: 1, 38636: 0}, 'drug': {20810: 0, 21034: 0, 21035: 0}, 'effect/phenotype': {24909: 0, 87940: 0, 87931: 0}, 'gene/protein': {56574: 0}}
Cop confidences with context: {'disease': {31974: 0.23539939522743225, 31160: 0.31295904517173767, 38636: 0.24650244414806366}, 'drug': {20810: 0.19994883239269257, 21034: 0.1953514665365219, 21035: 0.20815227925777435}, 'effect/phenotype': {24909: 0.2074715942144394

 59%|█████▉    | 104130/177004 [6:25:53<46:55:38,  2.32s/it]

Wrong response format. Question 104129 ignored


 59%|█████▉    | 104148/177004 [6:26:52<61:24:47,  3.03s/it]

Wrong response format. Node 94913 ignored
Wrong response format. Node 21940 ignored


 59%|█████▉    | 104201/177004 [6:29:44<66:43:15,  3.30s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2720279395580292
Cop confidence without context: 0.2151918262243271
LLM is correct without context: False
Confidences with context: {'disease': {97502: 0.3042903542518616, 95721: 0.30485615134239197, 84177: 0.3152008056640625, 29761: 0.2964726984500885}, 'effect/phenotype': {26349: 0.28863462805747986, 86153: 0.2879375219345093}, 'drug': {20486: 0.2740270793437958, 14638: 0.2992992401123047, 14681: 0.28256893157958984, 14749: 0.2858734130859375}, 'gene/protein': {34507: 0.28512442111968994, 3220: 0.2854273319244385}}
Accuracies with context: {'disease': {97502: 0, 95721: 0, 84177: 0, 29761: 0}, 'effect/phenotype': {26349: 0, 86153: 0}, 'drug': {20486: 0, 14638: 0, 14681: 0, 14749: 0}, 'gene/protein': {34507: 0, 3220: 0}}
Cop confidences with context: {'disease': {97502: 0.20270107686519623, 95721: 0.2014976292848587, 84177: 0.1882149875164032, 29761: 0.20376251637935638}, 'effect/phenotype': {26349: 0.21449504

 59%|█████▉    | 104301/177004 [6:35:00<65:55:57,  3.26s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4151363670825958
Cop confidence without context: 0.4151363670825958
LLM is correct without context: True
Confidences with context: {'anatomy': {73391: 0.347218781709671, 76248: 0.36383384466171265}, 'disease': {83776: 0.7170003652572632}, 'gene/protein': {10743: 0.32479193806648254, 10923: 0.337941974401474, 1112: 0.3322334587574005, 310: 0.3039312958717346, 1495: 0.36852434277534485, 34678: 0.30473047494888306, 7481: 0.32812023162841797, 10847: 0.3385235071182251}, 'effect/phenotype': {84481: 0.35387343168258667}, 'drug': {20548: 0.3243527114391327}}
Accuracies with context: {'anatomy': {73391: 1, 76248: 1}, 'disease': {83776: 0}, 'gene/protein': {10743: 1, 10923: 1, 1112: 1, 310: 1, 1495: 1, 34678: 1, 7481: 1, 10847: 1}, 'effect/phenotype': {84481: 0}, 'drug': {20548: 1}}
Cop confidences with context: {'anatomy': {73391: 0.347218781709671, 76248: 0.36383384466171265}, 'disease': {83776: 0.10825086385011673},

 59%|█████▉    | 104401/177004 [6:40:32<90:16:28,  4.48s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.5939024686813354
Cop confidence without context: 0.5939024686813354
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89929: 0.5203140377998352, 23863: 0.42635855078697205, 91408: 0.3281627595424652}, 'disease': {37372: 0.47754552960395813, 94944: 0.4585035741329193, 96526: 0.47588875889778137}, 'anatomy': {63540: 0.47032713890075684}, 'drug': {17301: 0.9413377642631531, 16419: 0.38050559163093567, 14196: 0.9370507001876831, 14609: 0.8387836217880249, 17195: 0.42421671748161316, 17196: 0.4694337248802185, 14642: 0.4820812940597534, 21408: 0.4595448076725006, 14214: 0.48564988374710083, 17172: 0.43812447786331177, 21402: 0.5069952607154846, 21511: 0.3754032552242279}, 'gene/protein': {10683: 0.4817480146884918}}
Accuracies with context: {'effect/phenotype': {89929: 1, 23863: 1, 91408: 1}, 'disease': {37372: 1, 94944: 1, 96526: 1}, 'anatomy': {63540: 1}, 'drug': {17301: 0, 1641

 59%|█████▉    | 104430/177004 [6:42:06<40:52:21,  2.03s/it]

Wrong response format. Question 104429 ignored


 59%|█████▉    | 104501/177004 [6:46:31<90:51:10,  4.51s/it] 

Example Feedback:

Response without context: A
Confidence without context: 0.5756210684776306
Cop confidence without context: 0.5756210684776306
LLM is correct without context: True
Confidences with context: {'disease': {84122: 0.43923547863960266}, 'drug': {17723: 0.3555796444416046, 21513: 0.3985978662967682}, 'anatomy': {73767: 0.33426347374916077}}
Accuracies with context: {'disease': {84122: 1}, 'drug': {17723: 0, 21513: 1}, 'anatomy': {73767: 0}}
Cop confidences with context: {'disease': {84122: 0.43923547863960266}, 'drug': {17723: 0.272632360458374, 21513: 0.3985978662967682}, 'anatomy': {73767: 0.3239792585372925}}


 59%|█████▉    | 104601/177004 [6:52:10<68:39:26,  3.41s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.6220222115516663
Cop confidence without context: 0.06978895515203476
LLM is correct without context: False
Confidences with context: {'disease': {97696: 0.3924265205860138}, 'anatomy': {72211: 0.4872562289237976}, 'drug': {18835: 0.4100317656993866, 14256: 0.4254569411277771, 14632: 0.48475703597068787, 19246: 0.38659942150115967, 21314: 0.4005550444126129, 21278: 0.36661267280578613, 21330: 0.3793948292732239, 21322: 0.4112643897533417, 21375: 0.40035223960876465, 21365: 0.462090402841568, 21369: 0.39106443524360657, 21359: 0.4382167160511017, 21294: 0.4743340015411377}}
Accuracies with context: {'disease': {97696: 0}, 'anatomy': {72211: 0}, 'drug': {18835: 0, 14256: 0, 14632: 0, 19246: 0, 21314: 0, 21278: 0, 21330: 0, 21322: 0, 21375: 0, 21365: 0, 21369: 0, 21359: 0, 21294: 0}}
Cop confidences with context: {'disease': {97696: 0.13351640105247498}, 'anatomy': {72211: 0.08335971087217331}, 'drug': {18835: 0.1

 59%|█████▉    | 104650/177004 [6:54:38<52:08:27,  2.59s/it]

Wrong response format. Question 104649 ignored


 59%|█████▉    | 104698/177004 [6:57:11<65:52:51,  3.28s/it]

Wrong response format. Node 91661 ignored


 59%|█████▉    | 104700/177004 [6:57:13<41:22:52,  2.06s/it]

Wrong response format. Question 104699 ignored


 59%|█████▉    | 104701/177004 [6:57:19<61:52:32,  3.08s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.7077176570892334
Cop confidence without context: 0.7077176570892334
LLM is correct without context: True
Confidences with context: {'anatomy': {73314: 0.5855191349983215, 70983: 0.2690339684486389, 70986: 0.4582641124725342, 70987: 0.27479997277259827, 70985: 0.47708064317703247}, 'disease': {94764: 0.41957658529281616, 28540: 0.5940762758255005, 39815: 0.284322589635849, 99781: 0.34487566351890564, 99778: 0.4283781349658966}, 'effect/phenotype': {90962: 0.4394771456718445}, 'gene/protein': {8216: 0.5642424821853638, 11630: 0.48271995782852173, 12580: 0.5429624319076538, 11050: 0.4255133271217346, 3224: 0.4179008901119232}}
Accuracies with context: {'anatomy': {73314: 1, 70983: 0, 70986: 1, 70987: 0, 70985: 1}, 'disease': {94764: 1, 28540: 1, 39815: 1, 99781: 1, 99778: 1}, 'effect/phenotype': {90962: 1}, 'gene/protein': {8216: 1, 11630: 1, 12580: 1, 11050: 1, 3224: 1}}
Cop confidences with context: {'anatomy':

 59%|█████▉    | 104712/177004 [6:57:53<76:24:58,  3.81s/it]

Wrong response format. Node 92615 ignored


 59%|█████▉    | 104801/177004 [7:02:54<72:54:37,  3.64s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.7156051397323608
Cop confidence without context: 0.7156051397323608
LLM is correct without context: True
Confidences with context: {'anatomy': {64492: 0.8759071230888367}, 'drug': {14688: 0.9100690484046936, 16352: 0.8701962232589722, 16521: 0.5670866370201111, 21614: 0.6782047748565674, 15514: 0.4852852523326874, 18589: 0.5774656534194946, 16420: 0.4573734700679779, 21656: 0.5220190286636353, 15512: 0.5918529033660889, 21658: 0.586682140827179, 17195: 0.5332573056221008, 21359: 0.48149141669273376}, 'effect/phenotype': {25233: 0.49795618653297424}}
Accuracies with context: {'anatomy': {64492: 1}, 'drug': {14688: 1, 16352: 0, 16521: 1, 21614: 1, 15514: 1, 18589: 1, 16420: 1, 21656: 1, 15512: 1, 21658: 1, 17195: 1, 21359: 1}, 'effect/phenotype': {25233: 1}}
Cop confidences with context: {'anatomy': {64492: 0.8759071230888367}, 'drug': {14688: 0.9100690484046936, 16352: 0.037640996277332306, 16521: 0.56708663702

 59%|█████▉    | 104847/177004 [7:05:36<93:51:39,  4.68s/it]

Wrong response format. Node 33309 ignored
Wrong response format. Node 38854 ignored


 59%|█████▉    | 104901/177004 [7:08:56<60:12:02,  3.01s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.6825472116470337
Cop confidence without context: 0.6825472116470337
LLM is correct without context: True
Confidences with context: {'disease': {95623: 0.4028504192829132, 36689: 0.3429003953933716, 96701: 0.4101932942867279}, 'gene/protein': {12462: 0.5462286472320557, 11724: 0.5550650358200073}}
Accuracies with context: {'disease': {95623: 1, 36689: 1, 96701: 1}, 'gene/protein': {12462: 1, 11724: 1}}
Cop confidences with context: {'disease': {95623: 0.4028504192829132, 36689: 0.3429003953933716, 96701: 0.4101932942867279}, 'gene/protein': {12462: 0.5462286472320557, 11724: 0.5550650358200073}}


 59%|█████▉    | 105001/177004 [7:14:46<77:16:59,  3.86s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.5588851571083069
Cop confidence without context: 0.5588851571083069
LLM is correct without context: True
Confidences with context: {'disease': {97062: 0.5711609125137329, 38538: 0.4494876265525818, 35538: 0.3327750861644745, 33304: 0.35135823488235474, 32967: 0.5870229005813599, 98887: 0.45693403482437134, 97574: 0.40228286385536194, 95658: 0.5455549955368042, 97207: 0.32006093859672546, 97457: 0.5482863783836365, 39793: 0.3188609480857849, 97088: 0.27043354511260986}, 'gene/protein': {1733: 0.44371554255485535, 34969: 0.38315948843955994}}
Accuracies with context: {'disease': {97062: 1, 38538: 1, 35538: 1, 33304: 1, 32967: 0, 98887: 1, 97574: 1, 95658: 0, 97207: 1, 97457: 0, 39793: 1, 97088: 0}, 'gene/protein': {1733: 1, 34969: 1}}
Cop confidences with context: {'disease': {97062: 0.5711609125137329, 38538: 0.4494876265525818, 35538: 0.3327750861644745, 33304: 0.35135823488235474, 32967: 0.24280324578285217, 

 59%|█████▉    | 105005/177004 [7:15:02<75:01:42,  3.75s/it]

Wrong response format. Node 33645 ignored


 59%|█████▉    | 105101/177004 [7:20:09<73:14:26,  3.67s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3085649907588959
Cop confidence without context: 0.2723076343536377
LLM is correct without context: False
Confidences with context: {'disease': {95167: 0.3074640929698944, 95169: 0.30012983083724976, 36011: 0.2764216661453247, 84061: 0.28640344738960266, 32322: 0.2724304497241974, 97117: 0.29370492696762085, 38630: 0.2701902985572815, 27533: 0.27675125002861023}, 'drug': {17629: 0.261382520198822, 20288: 0.39866289496421814, 21495: 0.30187976360321045, 21527: 0.2926812171936035, 21491: 0.2808075249195099, 21492: 0.26940420269966125}, 'effect/phenotype': {93759: 0.2790103554725647, 23179: 0.26571527123451233, 89631: 0.2980782389640808, 92980: 0.2645378112792969}, 'anatomy': {75738: 0.2836049497127533}}
Accuracies with context: {'disease': {95167: 0, 95169: 0, 36011: 0, 84061: 0, 32322: 0, 97117: 0, 38630: 0, 27533: 0}, 'drug': {17629: 1, 20288: 0, 21495: 0, 21527: 0, 21491: 0, 21492: 0}, 'effect/phenotype': {93

 59%|█████▉    | 105135/177004 [7:22:05<87:43:42,  4.39s/it]

Wrong response format. Node 31918 ignored
Wrong response format. Node 98644 ignored


 59%|█████▉    | 105201/177004 [7:25:37<40:39:41,  2.04s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4362194538116455
Cop confidence without context: 0.09001896530389786
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {91764: 0.5644344687461853}, 'gene/protein': {8163: 0.3293105363845825}}
Accuracies with context: {'effect/phenotype': {91764: 0}, 'gene/protein': {8163: 0}}
Cop confidences with context: {'effect/phenotype': {91764: 0.06042779982089996}, 'gene/protein': {8163: 0.1951100379228592}}


 59%|█████▉    | 105258/177004 [7:29:12<76:25:40,  3.83s/it]

Wrong response format. Node 94658 ignored


 59%|█████▉    | 105301/177004 [7:31:29<61:46:24,  3.10s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2943861782550812
Cop confidence without context: 0.1189427599310875
LLM is correct without context: False
Confidences with context: {'disease': {84015: 0.3129916191101074}, 'effect/phenotype': {89788: 0.3128383755683899, 85389: 0.29724612832069397}}
Accuracies with context: {'disease': {84015: 0}, 'effect/phenotype': {89788: 0, 85389: 1}}
Cop confidences with context: {'disease': {84015: 0.1216154471039772}, 'effect/phenotype': {89788: 0.16614720225334167, 85389: 0.29724612832069397}}


 60%|█████▉    | 105401/177004 [7:37:05<60:47:15,  3.06s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.35342007875442505
Cop confidence without context: 0.2525779604911804
LLM is correct without context: False
Confidences with context: {'drug': {14362: 0.3147640824317932, 16253: 0.2952340245246887, 14572: 0.703048825263977, 21641: 0.27522143721580505}}
Accuracies with context: {'drug': {14362: 0, 16253: 0, 14572: 0, 21641: 0}}
Cop confidences with context: {'drug': {14362: 0.2865959405899048, 16253: 0.2143169343471527, 14572: 0.07887989282608032, 21641: 0.2525573968887329}}


 60%|█████▉    | 105501/177004 [7:42:50<64:46:47,  3.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.423054963350296
Cop confidence without context: 0.11209841072559357
LLM is correct without context: False
Confidences with context: {'anatomy': {72292: 0.3918613791465759}}
Accuracies with context: {'anatomy': {72292: 0}}
Cop confidences with context: {'anatomy': {72292: 0.15956808626651764}}


 60%|█████▉    | 105601/177004 [7:48:35<55:02:11,  2.77s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.42763087153434753
Cop confidence without context: 0.42763087153434753
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {90597: 0.38959527015686035}, 'anatomy': {73174: 0.47071900963783264, 73004: 0.40323975682258606}, 'disease': {33531: 0.42805296182632446}}
Accuracies with context: {'effect/phenotype': {90597: 1}, 'anatomy': {73174: 1, 73004: 1}, 'disease': {33531: 1}}
Cop confidences with context: {'effect/phenotype': {90597: 0.38959527015686035}, 'anatomy': {73174: 0.47071900963783264, 73004: 0.40323975682258606}, 'disease': {33531: 0.42805296182632446}}


 60%|█████▉    | 105616/177004 [7:49:22<68:44:54,  3.47s/it]

Wrong response format. Node 84275 ignored


 60%|█████▉    | 105648/177004 [7:51:04<70:40:36,  3.57s/it]

Wrong response format. Node 28057 ignored


 60%|█████▉    | 105695/177004 [7:53:48<72:00:08,  3.64s/it]

Wrong response format. Node 21231 ignored


 60%|█████▉    | 105701/177004 [7:54:06<58:11:37,  2.94s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4310716688632965
Cop confidence without context: 0.4310716688632965
LLM is correct without context: True
Confidences with context: {'disease': {36641: 0.3683520257472992}, 'anatomy': {63884: 0.6160879135131836, 63885: 0.36660265922546387, 66791: 0.45561131834983826, 66262: 0.3381677567958832, 71356: 0.6235973238945007, 71108: 0.5423322916030884, 66461: 0.3377203047275543, 63490: 0.5104660391807556, 71107: 0.612263560295105, 68292: 0.5700739622116089, 71376: 0.6373574137687683, 66685: 0.37345656752586365}}
Accuracies with context: {'disease': {36641: 1}, 'anatomy': {63884: 1, 63885: 1, 66791: 0, 66262: 1, 71356: 0, 71108: 0, 66461: 1, 63490: 0, 71107: 0, 68292: 0, 71376: 0, 66685: 0}}
Cop confidences with context: {'disease': {36641: 0.3683520257472992}, 'anatomy': {63884: 0.6160879135131836, 63885: 0.36660265922546387, 66791: 0.13895359635353088, 66262: 0.3381677567958832, 71356: 0.06126399710774422, 71108: 0.

 60%|█████▉    | 105801/177004 [7:59:34<51:57:32,  2.63s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4001842439174652
Cop confidence without context: 0.4001842439174652
LLM is correct without context: True
Confidences with context: {'disease': {33296: 0.4307587146759033, 97069: 0.39763185381889343, 94638: 0.4330012798309326, 97758: 0.42064982652664185, 97736: 0.42332807183265686}, 'gene/protein': {67: 0.4126066565513611}}
Accuracies with context: {'disease': {33296: 1, 97069: 1, 94638: 1, 97758: 1, 97736: 1}, 'gene/protein': {67: 1}}
Cop confidences with context: {'disease': {33296: 0.4307587146759033, 97069: 0.39763185381889343, 94638: 0.4330012798309326, 97758: 0.42064982652664185, 97736: 0.42332807183265686}, 'gene/protein': {67: 0.4126066565513611}}


 60%|█████▉    | 105819/177004 [8:00:38<70:33:49,  3.57s/it]

Wrong response format. Question 105818 ignored


 60%|█████▉    | 105844/177004 [8:02:04<63:52:58,  3.23s/it]

Wrong response format. Node 90966 ignored


 60%|█████▉    | 105901/177004 [8:05:35<56:27:11,  2.86s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3098348081111908
Cop confidence without context: 0.17516471445560455
LLM is correct without context: False
Confidences with context: {'disease': {28262: 0.3023320436477661}, 'gene/protein': {4163: 0.33978357911109924, 3470: 0.3323740065097809, 5188: 0.3235400319099426, 6857: 0.33945468068122864, 33912: 0.3286469578742981, 10280: 0.3498366177082062, 10597: 0.3233717679977417}, 'drug': {17606: 0.33261948823928833, 21203: 0.3199010193347931}}
Accuracies with context: {'disease': {28262: 0}, 'gene/protein': {4163: 0, 3470: 0, 5188: 0, 6857: 0, 33912: 0, 10280: 0, 10597: 0}, 'drug': {17606: 0, 21203: 0}}
Cop confidences with context: {'disease': {28262: 0.15441609919071198}, 'gene/protein': {4163: 0.16952432692050934, 3470: 0.16453705728054047, 5188: 0.17049361765384674, 6857: 0.15420423448085785, 33912: 0.1704998016357422, 10280: 0.14469872415065765, 10597: 0.16008062660694122}, 'drug': {17606: 0.1571183204650879,

 60%|█████▉    | 105927/177004 [8:06:56<80:53:12,  4.10s/it]

Wrong response format. Node 28758 ignored
Wrong response format. Node 98096 ignored
Wrong response format. Node 98389 ignored
Wrong response format. Node 99435 ignored
Wrong response format. Node 99803 ignored


 60%|█████▉    | 105968/177004 [8:09:11<82:33:14,  4.18s/it]

Wrong response format. Node 15723 ignored


 60%|█████▉    | 106001/177004 [8:11:03<65:47:05,  3.34s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4041164815425873
Cop confidence without context: 0.24130897223949432
LLM is correct without context: False
Confidences with context: {'disease': {98243: 0.3158488869667053, 39524: 0.5700080394744873, 29078: 0.9070083498954773, 39820: 0.9028376936912537, 33151: 0.350582480430603, 97035: 0.29796791076660156, 84245: 0.8175516724586487, 99459: 0.5962454080581665}, 'effect/phenotype': {92247: 0.7294198274612427}, 'drug': {20449: 0.3838154375553131, 20452: 0.27913910150527954}, 'gene/protein': {4487: 0.4686981737613678, 11014: 0.492167592048645, 9376: 0.44323012232780457, 4510: 0.5008922219276428}}
Accuracies with context: {'disease': {98243: 0, 39524: 1, 29078: 0, 39820: 0, 33151: 0, 97035: 0, 84245: 0, 99459: 0}, 'effect/phenotype': {92247: 0}, 'drug': {20449: 0, 20452: 0}, 'gene/protein': {4487: 0, 11014: 0, 9376: 0, 4510: 0}}
Cop confidences with context: {'disease': {98243: 0.2787356674671173, 39524: 0.57000803

 60%|█████▉    | 106101/177004 [8:16:47<79:25:48,  4.03s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.36536338925361633
Cop confidence without context: 0.21478603780269623
LLM is correct without context: False
Confidences with context: {'disease': {35479: 0.3234878182411194, 97219: 0.33966064453125, 98608: 0.32116928696632385, 97083: 0.306534081697464, 39409: 0.3271612823009491, 99027: 0.3153150975704193, 99684: 0.33680805563926697, 96007: 0.3162379562854767, 27695: 0.29893484711647034, 95691: 0.338769793510437}, 'effect/phenotype': {91087: 0.3154471516609192, 87971: 0.4083667993545532, 89878: 0.3637840449810028}, 'anatomy': {74868: 0.3308132588863373, 74870: 0.33378300070762634, 74869: 0.3410622775554657}}
Accuracies with context: {'disease': {35479: 0, 97219: 0, 98608: 0, 97083: 0, 39409: 0, 99027: 0, 99684: 0, 96007: 0, 27695: 0, 95691: 0}, 'effect/phenotype': {91087: 0, 87971: 0, 89878: 0}, 'anatomy': {74868: 0, 74870: 0, 74869: 0}}
Cop confidences with context: {'disease': {35479: 0.22407346963882446, 972

 60%|█████▉    | 106104/177004 [8:16:57<73:01:38,  3.71s/it]

Wrong response format. Node 94279 ignored
Wrong response format. Node 85023 ignored
Wrong response format. Node 85080 ignored
Wrong response format. Node 86844 ignored
Wrong response format. Node 31166 ignored
Wrong response format. Node 23420 ignored


 60%|█████▉    | 106182/177004 [8:21:08<75:03:25,  3.82s/it]

Wrong response format. Node 28082 ignored


 60%|█████▉    | 106201/177004 [8:22:14<51:47:29,  2.63s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.30375751852989197
Cop confidence without context: 0.30375751852989197
LLM is correct without context: True
Confidences with context: {'disease': {28791: 0.2734403610229492}, 'anatomy': {70387: 0.28759855031967163}}
Accuracies with context: {'disease': {28791: 0}, 'anatomy': {70387: 0}}
Cop confidences with context: {'disease': {28791: 0.25289100408554077}, 'anatomy': {70387: 0.2557954490184784}}


 60%|██████    | 106301/177004 [8:28:15<75:45:50,  3.86s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.32436108589172363
Cop confidence without context: 0.11122383922338486
LLM is correct without context: False
Confidences with context: {'disease': {29771: 0.3233381509780884, 32595: 0.28086283802986145, 33412: 0.29944127798080444, 33320: 0.3018711507320404}, 'anatomy': {63240: 0.29294994473457336, 70752: 0.247279092669487}, 'effect/phenotype': {90888: 0.31202059984207153, 23084: 0.27654096484184265, 92680: 0.31234118342399597}, 'drug': {15788: 0.31148189306259155, 16756: 0.32399147748947144, 17423: 0.3287789821624756, 21468: 0.30753353238105774, 21469: 0.3228580951690674, 21461: 0.3150083124637604}}
Accuracies with context: {'disease': {29771: 0, 32595: 0, 33412: 0, 33320: 0}, 'anatomy': {63240: 0, 70752: 0}, 'effect/phenotype': {90888: 0, 23084: 0, 92680: 0}, 'drug': {15788: 0, 16756: 0, 17423: 0, 21468: 0, 21469: 0, 21461: 0}}
Cop confidences with context: {'disease': {29771: 0.15881836414337158, 32595: 0.174

 60%|██████    | 106359/177004 [8:31:32<55:02:56,  2.81s/it]

Wrong response format. Question 106358 ignored


 60%|██████    | 106401/177004 [8:33:50<67:43:07,  3.45s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2901257574558258
Cop confidence without context: 0.20572996139526367
LLM is correct without context: False
Confidences with context: {'drug': {20196: 0.2787640392780304}, 'disease': {95740: 0.2691537141799927, 36409: 0.2625356614589691}}
Accuracies with context: {'drug': {20196: 1}, 'disease': {95740: 0, 36409: 0}}
Cop confidences with context: {'drug': {20196: 0.2787640392780304}, 'disease': {95740: 0.23202501237392426, 36409: 0.26049256324768066}}


 60%|██████    | 106441/177004 [8:36:08<83:31:07,  4.26s/it]

Wrong response format. Node 91661 ignored


 60%|██████    | 106445/177004 [8:36:19<61:14:58,  3.13s/it]

Wrong response format. Node 39776 ignored


 60%|██████    | 106481/177004 [8:38:36<106:03:35,  5.41s/it]

Wrong response format. Node 92615 ignored


 60%|██████    | 106501/177004 [8:39:49<65:22:29,  3.34s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.39192256331443787
Cop confidence without context: 0.1659504622220993
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {92682: 0.380405992269516}, 'drug': {14736: 0.40287795662879944, 20267: 0.32204532623291016, 15932: 0.309104859828949, 17693: 0.29834675788879395, 20327: 0.2854670584201813, 14496: 0.2917666733264923, 21510: 0.4156934916973114}, 'gene/protein': {4109: 0.31280753016471863}}
Accuracies with context: {'effect/phenotype': {92682: 0}, 'drug': {14736: 0, 20267: 0, 15932: 0, 17693: 0, 20327: 0, 14496: 0, 21510: 0}, 'gene/protein': {4109: 0}}
Cop confidences with context: {'effect/phenotype': {92682: 0.19429238140583038}, 'drug': {14736: 0.13494721055030823, 20267: 0.17785030603408813, 15932: 0.1845749467611313, 17693: 0.19262714684009552, 20327: 0.17587114870548248, 14496: 0.1883787214756012, 21510: 0.2057831883430481}, 'gene/protein': {4109: 0.15606547892093658}}


 60%|██████    | 106601/177004 [8:45:08<60:34:04,  3.10s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.8566415309906006
Cop confidence without context: 0.8566415309906006
LLM is correct without context: True
Confidences with context: {'disease': {83767: 0.6492919325828552}, 'drug': {20180: 0.6052380800247192, 14245: 0.872477650642395, 14748: 0.47776052355766296, 14993: 0.5791155695915222, 15545: 0.6501976847648621, 17195: 0.720806360244751, 17172: 0.6970727443695068, 21402: 0.5363528728485107, 21502: 0.5878311395645142, 21640: 0.6453256607055664, 21501: 0.6603396534919739}}
Accuracies with context: {'disease': {83767: 1}, 'drug': {20180: 1, 14245: 0, 14748: 1, 14993: 0, 15545: 1, 17195: 1, 17172: 1, 21402: 1, 21502: 1, 21640: 1, 21501: 1}}
Cop confidences with context: {'disease': {83767: 0.6492919325828552}, 'drug': {20180: 0.6052380800247192, 14245: 0.07989490032196045, 14748: 0.47776052355766296, 14993: 0.2147156447172165, 15545: 0.6501976847648621, 17195: 0.720806360244751, 17172: 0.6970727443695068, 21402:

 60%|██████    | 106603/177004 [8:45:17<70:49:53,  3.62s/it]

Wrong response format. Node 20679 ignored
Wrong response format. Node 14798 ignored
Wrong response format. Node 21470 ignored
Wrong response format. Node 20269 ignored


 60%|██████    | 106642/177004 [8:47:26<68:18:48,  3.50s/it]

Wrong response format. Node 32928 ignored
Wrong response format. Node 97155 ignored


 60%|██████    | 106643/177004 [8:47:32<86:03:30,  4.40s/it]

Wrong response format. Node 91661 ignored


 60%|██████    | 106699/177004 [8:50:48<59:17:33,  3.04s/it]

Wrong response format. Node 84009 ignored


 60%|██████    | 106701/177004 [8:50:53<51:14:41,  2.62s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.8587642312049866
Cop confidence without context: 0.8587642312049866
LLM is correct without context: True
Confidences with context: {'drug': {14052: 0.8045526742935181}}
Accuracies with context: {'drug': {14052: 1}}
Cop confidences with context: {'drug': {14052: 0.8045526742935181}}


 60%|██████    | 106801/177004 [8:56:31<75:50:30,  3.89s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3467344641685486
Cop confidence without context: 0.1945004016160965
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {89917: 0.32197606563568115}, 'disease': {28484: 0.3302977383136749, 99750: 0.351162850856781, 38113: 0.37387585639953613, 97791: 0.35810586810112}, 'gene/protein': {1421: 0.34731218218803406, 4106: 0.34709852933883667, 1974: 0.35168972611427307, 6400: 0.3332497477531433}}
Accuracies with context: {'effect/phenotype': {89917: 0}, 'disease': {28484: 0, 99750: 0, 38113: 0, 97791: 0}, 'gene/protein': {1421: 0, 4106: 0, 1974: 0, 6400: 0}}
Cop confidences with context: {'effect/phenotype': {89917: 0.22653846442699432}, 'disease': {28484: 0.221751406788826, 99750: 0.19698451459407806, 38113: 0.19856397807598114, 97791: 0.1946987807750702}, 'gene/protein': {1421: 0.20417428016662598, 4106: 0.20726199448108673, 1974: 0.2133105993270874, 6400: 0.20691949129104614}}


 60%|██████    | 106866/177004 [9:00:06<55:35:25,  2.85s/it]

Wrong response format. Node 84407 ignored
Wrong response format. Node 29635 ignored


 60%|██████    | 106901/177004 [9:02:11<67:14:24,  3.45s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2930275797843933
Cop confidence without context: 0.2565835118293762
LLM is correct without context: False
Confidences with context: {'drug': {18394: 0.2726478576660156, 19896: 0.27519744634628296, 20269: 0.27837613224983215, 20276: 0.2625959515571594, 20231: 0.2699301838874817}, 'effect/phenotype': {84553: 0.27168965339660645}, 'disease': {27494: 0.8243175148963928, 98544: 0.4027753472328186, 38350: 0.3309452533721924, 39106: 0.4069541096687317, 35489: 0.3797873556613922, 36934: 0.31331032514572144, 33283: 0.29205748438835144, 38285: 0.2582930624485016, 35460: 0.26085349917411804}}
Accuracies with context: {'drug': {18394: 0, 19896: 0, 20269: 0, 20276: 1, 20231: 0}, 'effect/phenotype': {84553: 1}, 'disease': {27494: 0, 98544: 1, 38350: 0, 39106: 0, 35489: 0, 36934: 0, 33283: 0, 38285: 0, 35460: 0}}
Cop confidences with context: {'drug': {18394: 0.2581377923488617, 19896: 0.2525353729724884, 20269: 0.2475928515

 60%|██████    | 106921/177004 [9:03:13<42:50:39,  2.20s/it]

Wrong response format. Question 106920 ignored


 60%|██████    | 107001/177004 [9:07:32<72:54:18,  3.75s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3334372043609619
Cop confidence without context: 0.1550627201795578
LLM is correct without context: False
Confidences with context: {'disease': {36476: 0.32951200008392334, 39868: 0.40509697794914246}, 'anatomy': {64967: 0.3523203730583191, 76407: 0.3823383152484894, 70421: 0.3411623537540436, 72117: 0.329448401927948}, 'drug': {14748: 0.3562886416912079, 14870: 0.37425053119659424, 17423: 0.3346208333969116, 21503: 0.3723989427089691, 21461: 0.3471006155014038, 21469: 0.35346701741218567, 16018: 0.3827153742313385, 16756: 0.33586791157722473, 21411: 0.34554654359817505, 19900: 0.34891849756240845, 21474: 0.36018359661102295}, 'effect/phenotype': {93805: 0.5169317722320557}}
Accuracies with context: {'disease': {36476: 0, 39868: 0}, 'anatomy': {64967: 0, 76407: 0, 70421: 0, 72117: 0}, 'drug': {14748: 0, 14870: 0, 17423: 0, 21503: 0, 21461: 0, 21469: 0, 16018: 0, 16756: 0, 21411: 0, 19900: 0, 21474: 0}, 'effect

 61%|██████    | 107101/177004 [9:13:21<70:59:15,  3.66s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33125489950180054
Cop confidence without context: 0.28779923915863037
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {86304: 0.30824217200279236, 90994: 0.3594794273376465, 90993: 0.3353227972984314, 92943: 0.32448405027389526}, 'disease': {31820: 0.3315395414829254, 31401: 0.3134574890136719, 38500: 0.30885618925094604, 38621: 0.3959314823150635, 27498: 0.3012816905975342}, 'gene/protein': {3060: 0.3520395755767822, 5362: 0.28579702973365784}}
Accuracies with context: {'effect/phenotype': {86304: 1, 90994: 0, 90993: 0, 92943: 0}, 'disease': {31820: 1, 31401: 0, 38500: 1, 38621: 1, 27498: 1}, 'gene/protein': {3060: 0, 5362: 0}}
Cop confidences with context: {'effect/phenotype': {86304: 0.30824217200279236, 90994: 0.25095707178115845, 90993: 0.18663524091243744, 92943: 0.24879056215286255}, 'disease': {31820: 0.3315395414829254, 31401: 0.2619035542011261, 38500: 0.308856189

 61%|██████    | 107201/177004 [9:18:43<53:41:57,  2.77s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.384493350982666
Cop confidence without context: 0.22427299618721008
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {33729: 0.2970764935016632}, 'anatomy': {68830: 0.3091574013233185, 68834: 0.32119160890579224, 70535: 0.31100574135780334}, 'disease': {29427: 0.3296419382095337, 32642: 0.3223058879375458}}
Accuracies with context: {'effect/phenotype': {33729: 0}, 'anatomy': {68830: 1, 68834: 1, 70535: 0}, 'disease': {29427: 1, 32642: 1}}
Cop confidences with context: {'effect/phenotype': {33729: 0.28793638944625854}, 'anatomy': {68830: 0.3091574013233185, 68834: 0.32119160890579224, 70535: 0.3038012981414795}, 'disease': {29427: 0.3296419382095337, 32642: 0.3223058879375458}}


 61%|██████    | 107274/177004 [9:23:00<48:16:32,  2.49s/it]

Wrong response format. Question 107273 ignored


 61%|██████    | 107301/177004 [9:24:38<75:26:39,  3.90s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3161703944206238
Cop confidence without context: 0.3161703944206238
LLM is correct without context: True
Confidences with context: {'disease': {83976: 0.34001871943473816, 84043: 0.4184199273586273, 83788: 0.254603773355484, 33528: 0.27956050634384155, 33521: 0.3262875974178314, 30409: 0.35217592120170593, 27731: 0.3375997245311737, 31764: 0.3006647825241089, 33489: 0.3359900712966919, 33473: 0.37199750542640686, 33197: 0.3034486174583435, 32413: 0.32222217321395874, 29549: 0.3025057315826416}, 'effect/phenotype': {22952: 0.31939512491226196, 85550: 0.22686995565891266}, 'gene/protein': {2384: 0.28047433495521545}}
Accuracies with context: {'disease': {83976: 1, 84043: 1, 83788: 1, 33528: 1, 33521: 1, 30409: 1, 27731: 1, 31764: 1, 33489: 1, 33473: 1, 33197: 1, 32413: 1, 29549: 1}, 'effect/phenotype': {22952: 1, 85550: 1}, 'gene/protein': {2384: 1}}
Cop confidences with context: {'disease': {83976: 0.3400187194

 61%|██████    | 107318/177004 [9:25:30<63:15:23,  3.27s/it]

Wrong response format. Node 90962 ignored
Wrong response format. Node 21238 ignored


 61%|██████    | 107325/177004 [9:25:51<51:27:01,  2.66s/it]

Wrong response format. Question 107324 ignored


 61%|██████    | 107401/177004 [9:30:18<55:17:41,  2.86s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3853655457496643
Cop confidence without context: 0.2030733823776245
LLM is correct without context: False
Confidences with context: {'disease': {29535: 0.4179818332195282, 30553: 0.4018999934196472, 31982: 0.4081799387931824}, 'drug': {15912: 0.36405953764915466, 14132: 0.41103896498680115, 14130: 0.39722582697868347}, 'effect/phenotype': {93432: 0.3577040135860443, 91830: 0.3878229558467865}}
Accuracies with context: {'disease': {29535: 0, 30553: 0, 31982: 0}, 'drug': {15912: 0, 14132: 0, 14130: 0}, 'effect/phenotype': {93432: 0, 91830: 0}}
Cop confidences with context: {'disease': {29535: 0.19437962770462036, 30553: 0.19434615969657898, 31982: 0.1913100928068161}, 'drug': {15912: 0.21235404908657074, 14132: 0.1376808136701584, 14130: 0.20130424201488495}, 'effect/phenotype': {93432: 0.22210349142551422, 91830: 0.18176895380020142}}


 61%|██████    | 107501/177004 [9:35:41<54:33:05,  2.83s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3560115396976471
Cop confidence without context: 0.3560115396976471
LLM is correct without context: True
Confidences with context: {'disease': {32742: 0.32902172207832336, 37824: 0.5777731537818909, 84029: 0.5163958072662354, 27713: 0.4703690707683563}, 'anatomy': {72352: 0.37405261397361755}}
Accuracies with context: {'disease': {32742: 0, 37824: 0, 84029: 0, 27713: 0}, 'anatomy': {72352: 0}}
Cop confidences with context: {'disease': {32742: 0.18028903007507324, 37824: 0.1981191784143448, 84029: 0.07981297373771667, 27713: 0.16903066635131836}, 'anatomy': {72352: 0.21147017180919647}}


 61%|██████    | 107545/177004 [9:38:09<41:13:05,  2.14s/it]

Wrong response format. Question 107544 ignored


 61%|██████    | 107601/177004 [9:41:09<50:53:33,  2.64s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3454626500606537
Cop confidence without context: 0.16446489095687866
LLM is correct without context: False
Confidences with context: {'gene/protein': {1430: 0.37754857540130615, 6739: 0.4841763973236084}, 'effect/phenotype': {85395: 0.35198384523391724}, 'anatomy': {70420: 0.4141004979610443}, 'drug': {19270: 0.40940046310424805}, 'disease': {94817: 0.4488420784473419}}
Accuracies with context: {'gene/protein': {1430: 0, 6739: 0}, 'effect/phenotype': {85395: 0}, 'anatomy': {70420: 0}, 'drug': {19270: 0}, 'disease': {94817: 0}}
Cop confidences with context: {'gene/protein': {1430: 0.17421004176139832, 6739: 0.13550543785095215}, 'effect/phenotype': {85395: 0.14332973957061768}, 'anatomy': {70420: 0.15473806858062744}, 'drug': {19270: 0.08188524097204208}, 'disease': {94817: 0.13904479146003723}}


 61%|██████    | 107701/177004 [9:46:29<55:12:44,  2.87s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.9531463980674744
Cop confidence without context: 0.9531463980674744
LLM is correct without context: True
Confidences with context: {'disease': {83761: 0.5923140048980713, 33026: 0.7481690049171448, 33645: 0.736943244934082, 39815: 0.7098514437675476, 36469: 0.5167532563209534, 33473: 0.7184023857116699, 95541: 0.6616355180740356, 99778: 0.6886842846870422, 99564: 0.6026610732078552, 99781: 0.515670120716095, 29549: 0.5312651991844177, 27731: 0.623875617980957, 38980: 0.44678056240081787, 37839: 0.5702562928199768}, 'effect/phenotype': {22952: 0.8380118012428284, 92528: 0.7982672452926636}}
Accuracies with context: {'disease': {83761: 1, 33026: 1, 33645: 0, 39815: 1, 36469: 0, 33473: 1, 95541: 0, 99778: 1, 99564: 0, 99781: 1, 29549: 1, 27731: 1, 38980: 1, 37839: 1}, 'effect/phenotype': {22952: 1, 92528: 1}}
Cop confidences with context: {'disease': {83761: 0.5923140048980713, 33026: 0.7481690049171448, 33645: 0

 61%|██████    | 107724/177004 [9:47:42<43:07:37,  2.24s/it]

Wrong response format. Question 107723 ignored


 61%|██████    | 107801/177004 [9:52:13<58:54:26,  3.06s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3792315125465393
Cop confidence without context: 0.1384257972240448
LLM is correct without context: False
Confidences with context: {'disease': {28374: 0.40068519115448, 97267: 0.42305850982666016, 31966: 0.3438086211681366, 39706: 0.40687403082847595, 98021: 0.4914659857749939, 97484: 0.3581695258617401}, 'gene/protein': {7638: 0.4037773311138153, 35125: 0.40628373622894287, 5926: 0.4146222770214081, 9824: 0.42959052324295044}}
Accuracies with context: {'disease': {28374: 0, 97267: 0, 31966: 0, 39706: 0, 98021: 0, 97484: 0}, 'gene/protein': {7638: 0, 35125: 0, 5926: 0, 9824: 0}}
Cop confidences with context: {'disease': {28374: 0.1532757431268692, 97267: 0.14393840730190277, 31966: 0.18986915051937103, 39706: 0.16058383882045746, 98021: 0.11052097380161285, 97484: 0.178697407245636}, 'gene/protein': {7638: 0.15445859730243683, 35125: 0.16035087406635284, 5926: 0.15614818036556244, 9824: 0.14056147634983063}}


 61%|██████    | 107867/177004 [9:56:20<53:00:08,  2.76s/it]

Wrong response format. Question 107866 ignored


 61%|██████    | 107891/177004 [9:57:39<51:33:45,  2.69s/it]

Wrong response format. Question 107890 ignored


 61%|██████    | 107901/177004 [9:58:06<38:53:10,  2.03s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.32023656368255615
Cop confidence without context: 0.32023656368255615
LLM is correct without context: True
Confidences with context: {'disease': {94927: 0.33926334977149963}, 'drug': {16570: 0.5280484557151794}}
Accuracies with context: {'disease': {94927: 1}, 'drug': {16570: 0}}
Cop confidences with context: {'disease': {94927: 0.33926334977149963}, 'drug': {16570: 0.22534342110157013}}


 61%|██████    | 108001/177004 [10:03:19<58:09:56,  3.03s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.34827908873558044
Cop confidence without context: 0.34827908873558044
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {87360: 0.3587663173675537}, 'disease': {37183: 0.3519960045814514, 38760: 0.3530176877975464, 37177: 0.37704694271087646, 99018: 0.3466089367866516}, 'drug': {15861: 0.3922039270401001, 20276: 0.3761756420135498, 20269: 0.37879276275634766}}
Accuracies with context: {'effect/phenotype': {87360: 1}, 'disease': {37183: 1, 38760: 1, 37177: 1, 99018: 1}, 'drug': {15861: 1, 20276: 1, 20269: 1}}
Cop confidences with context: {'effect/phenotype': {87360: 0.3587663173675537}, 'disease': {37183: 0.3519960045814514, 38760: 0.3530176877975464, 37177: 0.37704694271087646, 99018: 0.3466089367866516}, 'drug': {15861: 0.3922039270401001, 20276: 0.3761756420135498, 20269: 0.37879276275634766}}


 61%|██████    | 108101/177004 [10:09:01<59:25:25,  3.10s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3413938879966736
Cop confidence without context: 0.3413938879966736
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {85299: 0.31696808338165283, 88545: 0.31658121943473816, 90318: 0.31784698367118835}, 'disease': {35856: 0.3204360604286194, 84138: 0.3406601846218109, 27740: 0.3164478838443756, 37681: 0.296461820602417, 39220: 0.33779165148735046, 98647: 0.2851126194000244, 39776: 0.3005140721797943, 98624: 0.31127187609672546, 33197: 0.3040791451931}}
Accuracies with context: {'effect/phenotype': {85299: 1, 88545: 0, 90318: 1}, 'disease': {35856: 1, 84138: 1, 27740: 1, 37681: 1, 39220: 1, 98647: 1, 39776: 0, 98624: 1, 33197: 1}}
Cop confidences with context: {'effect/phenotype': {85299: 0.31696808338165283, 88545: 0.24463500082492828, 90318: 0.31784698367118835}, 'disease': {35856: 0.3204360604286194, 84138: 0.3406601846218109, 27740: 0.3164478838443756, 37681: 0.29646182060

 61%|██████    | 108155/177004 [10:11:57<35:05:00,  1.83s/it]

Wrong response format. Question 108154 ignored


 61%|██████    | 108201/177004 [10:14:24<53:43:02,  2.81s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4277476966381073
Cop confidence without context: 0.4277476966381073
LLM is correct without context: True
Confidences with context: {'disease': {28468: 0.3455076217651367, 37824: 0.5489402413368225, 27713: 0.34224194288253784, 32421: 0.42681145668029785, 97187: 0.5958066582679749}}
Accuracies with context: {'disease': {28468: 1, 37824: 1, 27713: 1, 32421: 0, 97187: 0}}
Cop confidences with context: {'disease': {28468: 0.3455076217651367, 37824: 0.5489402413368225, 27713: 0.34224194288253784, 32421: 0.10961425304412842, 97187: 0.14600870013237}}


 61%|██████    | 108247/177004 [10:16:49<68:32:28,  3.59s/it]

Wrong response format. Node 97147 ignored
Wrong response format. Node 38096 ignored
Wrong response format. Node 18392 ignored
Wrong response format. Node 37843 ignored
Wrong response format. Node 96670 ignored
Wrong response format. Node 33346 ignored
Wrong response format. Node 21365 ignored
Wrong response format. Node 34243 ignored
Wrong response format. Node 13976 ignored
Wrong response format. Node 11174 ignored


 61%|██████    | 108257/177004 [10:17:17<50:58:55,  2.67s/it]

Wrong response format. Node 91661 ignored


 61%|██████    | 108264/177004 [10:17:41<58:03:09,  3.04s/it]

Wrong response format. Question 108263 ignored


 61%|██████    | 108301/177004 [10:19:41<72:22:03,  3.79s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.319609135389328
Cop confidence without context: 0.23751334846019745
LLM is correct without context: False
Confidences with context: {'anatomy': {68240: 0.35164132714271545, 68241: 0.3403264284133911}, 'disease': {39776: 0.3019692003726959, 33635: 0.32482820749282837, 83833: 0.29201698303222656, 95556: 0.32418927550315857, 95931: 0.3295172154903412, 99637: 0.3450848162174225, 99636: 0.3307942748069763, 95518: 0.321513295173645}, 'effect/phenotype': {85458: 0.35493767261505127, 90493: 0.3394603133201599}, 'gene/protein': {34973: 0.321594774723053, 264: 0.30687862634658813, 667: 0.3027118146419525, 2131: 0.3061290383338928}}
Accuracies with context: {'anatomy': {68240: 0, 68241: 0}, 'disease': {39776: 0, 33635: 0, 83833: 0, 95556: 0, 95931: 0, 99637: 0, 99636: 0, 95518: 0}, 'effect/phenotype': {85458: 0, 90493: 0}, 'gene/protein': {34973: 0, 264: 0, 667: 0, 2131: 0}}
Cop confidences with context: {'anatomy': {682

 61%|██████    | 108328/177004 [10:21:12<84:56:45,  4.45s/it] 

Wrong response format. Question 108327 ignored


 61%|██████    | 108401/177004 [10:24:59<77:27:33,  4.06s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3248400092124939
Cop confidence without context: 0.17523804306983948
LLM is correct without context: False
Confidences with context: {'disease': {97059: 0.40875059366226196, 84231: 0.3440553545951843, 83951: 0.34167009592056274}, 'drug': {14701: 0.4041455090045929, 14989: 0.3256595730781555, 14206: 0.3163059651851654, 15808: 0.8914089798927307, 20287: 0.503713071346283, 21505: 0.3691737949848175, 21382: 0.29260799288749695}, 'gene/protein': {34459: 0.6902827024459839}}
Accuracies with context: {'disease': {97059: 0, 84231: 0, 83951: 0}, 'drug': {14701: 1, 14989: 1, 14206: 0, 15808: 0, 20287: 1, 21505: 1, 21382: 0}, 'gene/protein': {34459: 0}}
Cop confidences with context: {'disease': {97059: 0.17997044324874878, 84231: 0.23832017183303833, 83951: 0.22760137915611267}, 'drug': {14701: 0.4041455090045929, 14989: 0.3256595730781555, 14206: 0.17882317304611206, 15808: 0.040725965052843094, 20287: 0.503713071346283

 61%|██████▏   | 108475/177004 [10:28:55<67:37:25,  3.55s/it]

Wrong response format. Node 12686 ignored


 61%|██████▏   | 108501/177004 [10:30:19<79:05:28,  4.16s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3193250298500061
Cop confidence without context: 0.3193250298500061
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {92529: 0.30608484148979187}, 'anatomy': {64807: 0.3375921845436096, 63717: 0.4606291651725769, 68718: 0.44134455919265747}}
Accuracies with context: {'effect/phenotype': {92529: 0}, 'anatomy': {64807: 0, 63717: 0, 68718: 0}}
Cop confidences with context: {'effect/phenotype': {92529: 0.24787506461143494}, 'anatomy': {64807: 0.2356773167848587, 63717: 0.15672118961811066, 68718: 0.15015994012355804}}


 61%|██████▏   | 108528/177004 [10:31:42<98:01:25,  5.15s/it] 

Wrong response format. Node 92291 ignored
Wrong response format. Node 31918 ignored
Wrong response format. Node 24558 ignored


 61%|██████▏   | 108537/177004 [10:32:12<75:09:14,  3.95s/it]

Wrong response format. Node 14211 ignored


 61%|██████▏   | 108601/177004 [10:35:47<67:30:40,  3.55s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.5371363162994385
Cop confidence without context: 0.04443664103746414
LLM is correct without context: False
Confidences with context: {'disease': {32617: 0.5166596174240112}, 'gene/protein': {5631: 0.30385154485702515, 4784: 0.32807087898254395, 728: 0.39366063475608826, 482: 0.30099931359291077, 12105: 0.30906346440315247, 2490: 0.308441698551178, 34103: 0.3163689076900482, 883: 0.3077935576438904, 3609: 0.30713579058647156, 7487: 0.308086633682251}}
Accuracies with context: {'disease': {32617: 0}, 'gene/protein': {5631: 0, 4784: 0, 728: 0, 482: 0, 12105: 0, 2490: 0, 34103: 0, 883: 0, 3609: 0, 7487: 0}}
Cop confidences with context: {'disease': {32617: 0.06620107591152191}, 'gene/protein': {5631: 0.0912327691912651, 4784: 0.06510765850543976, 728: 0.06894449144601822, 482: 0.08228863030672073, 12105: 0.07814348489046097, 2490: 0.07737938314676285, 34103: 0.07455942779779434, 883: 0.07425865530967712, 3609: 0.0

 61%|██████▏   | 108664/177004 [10:39:45<71:21:38,  3.76s/it]

Wrong response format. Node 97155 ignored


 61%|██████▏   | 108701/177004 [10:41:50<67:35:21,  3.56s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33461475372314453
Cop confidence without context: 0.11746083199977875
LLM is correct without context: False
Confidences with context: {'drug': {14943: 0.3855625092983246, 14676: 0.27333611249923706, 21449: 0.24933874607086182, 21455: 0.25754448771476746, 21451: 0.25698626041412354, 21447: 0.2488158643245697, 21452: 0.2805570065975189, 21453: 0.2607240080833435, 21456: 0.26099780201911926, 21448: 0.24720710515975952, 16104: 0.2504675090312958}, 'effect/phenotype': {25855: 0.26148542761802673, 88431: 0.2593221068382263, 93684: 0.30043715238571167}}
Accuracies with context: {'drug': {14943: 0, 14676: 0, 21449: 0, 21455: 0, 21451: 0, 21447: 0, 21452: 0, 21453: 0, 21456: 0, 21448: 0, 16104: 0}, 'effect/phenotype': {25855: 0, 88431: 0, 93684: 0}}
Cop confidences with context: {'drug': {14943: 0.15336617827415466, 14676: 0.2670042812824249, 21449: 0.1852928102016449, 21455: 0.19592952728271484, 21451: 0.1836598068475

 61%|██████▏   | 108801/177004 [10:47:27<58:44:29,  3.10s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3264934718608856
Cop confidence without context: 0.18748943507671356
LLM is correct without context: False
Confidences with context: {'drug': {14940: 0.28175705671310425, 14856: 0.27950653433799744, 16522: 0.2797885835170746, 16527: 0.28736111521720886, 14214: 0.28465649485588074, 17172: 0.2656732499599457, 21900: 0.273488312959671, 21402: 0.2873018980026245, 17207: 0.2847910225391388, 16521: 0.2806512117385864, 21034: 0.2909802198410034, 21393: 0.2817527651786804}}
Accuracies with context: {'drug': {14940: 0, 14856: 1, 16522: 1, 16527: 1, 14214: 1, 17172: 0, 21900: 1, 21402: 1, 17207: 1, 16521: 1, 21034: 1, 21393: 1}}
Cop confidences with context: {'drug': {14940: 0.2525653839111328, 14856: 0.27950653433799744, 16522: 0.2797885835170746, 16527: 0.28736111521720886, 14214: 0.28465649485588074, 17172: 0.2636057436466217, 21900: 0.273488312959671, 21402: 0.2873018980026245, 17207: 0.2847910225391388, 16521: 0.28

 62%|██████▏   | 108901/177004 [10:52:45<47:22:57,  2.50s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3811887502670288
Cop confidence without context: 0.07162324339151382
LLM is correct without context: False
Confidences with context: {'drug': {19349: 0.3464304506778717, 21203: 0.3709973990917206}, 'disease': {84079: 0.3937373459339142, 83912: 0.35313060879707336}, 'anatomy': {71403: 0.33984050154685974, 66063: 0.30650365352630615, 64893: 0.36547496914863586, 64875: 0.3653407096862793}, 'effect/phenotype': {27077: 0.3639698326587677}}
Accuracies with context: {'drug': {19349: 0, 21203: 0}, 'disease': {84079: 0, 83912: 0}, 'anatomy': {71403: 0, 66063: 0, 64893: 0, 64875: 0}, 'effect/phenotype': {27077: 0}}
Cop confidences with context: {'drug': {19349: 0.09695476293563843, 21203: 0.09162989258766174}, 'disease': {84079: 0.07875242084264755, 83912: 0.10520391166210175}, 'anatomy': {71403: 0.10445839911699295, 66063: 0.11100827157497406, 64893: 0.09913774579763412, 64875: 0.09987859427928925}, 'effect/phenotype':

 62%|██████▏   | 109001/177004 [10:58:14<88:13:55,  4.67s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3253427743911743
Cop confidence without context: 0.20200993120670319
LLM is correct without context: False
Confidences with context: {'disease': {97174: 0.35282737016677856, 33521: 0.35281896591186523, 36830: 0.413690447807312, 96322: 0.5295218825340271, 33653: 0.7198449969291687, 33707: 0.4264732599258423, 96054: 0.7651157379150391, 96059: 0.7151538133621216, 96543: 0.31149888038635254, 96048: 0.7253934144973755, 96058: 0.6834007501602173, 98090: 0.43161889910697937, 96696: 0.4502927362918854, 96115: 0.3259028494358063, 98021: 0.2886713743209839, 96051: 0.7414618730545044}}
Accuracies with context: {'disease': {97174: 0, 33521: 0, 36830: 0, 96322: 1, 33653: 0, 33707: 0, 96054: 0, 96059: 0, 96543: 0, 96048: 0, 96058: 0, 98090: 0, 96696: 0, 96115: 0, 98021: 0, 96051: 0}}
Cop confidences with context: {'disease': {97174: 0.22602969408035278, 33521: 0.23872926831245422, 36830: 0.20639783143997192, 96322: 0.529521

 62%|██████▏   | 109032/177004 [10:59:54<67:19:07,  3.57s/it]

Wrong response format. Node 14683 ignored


 62%|██████▏   | 109041/177004 [11:00:21<61:04:14,  3.23s/it]

Wrong response format. Node 31891 ignored
Wrong response format. Node 30867 ignored


 62%|██████▏   | 109101/177004 [11:03:52<47:29:11,  2.52s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.2899937033653259
Cop confidence without context: 0.21719524264335632
LLM is correct without context: False
Confidences with context: {'disease': {84135: 0.3019939363002777, 32481: 0.3210694491863251, 98466: 0.3551981449127197, 33124: 0.3419665992259979, 98467: 0.34057971835136414}}
Accuracies with context: {'disease': {84135: 0, 32481: 0, 98466: 0, 33124: 0, 98467: 0}}
Cop confidences with context: {'disease': {84135: 0.18032880127429962, 32481: 0.2259005755186081, 98466: 0.21209849417209625, 33124: 0.22079022228717804, 98467: 0.2164856344461441}}


 62%|██████▏   | 109146/177004 [11:06:08<50:53:19,  2.70s/it]

Wrong response format. Node 91791 ignored
Wrong response format. Node 84142 ignored


 62%|██████▏   | 109201/177004 [11:09:11<55:03:56,  2.92s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3253220319747925
Cop confidence without context: 0.3253220319747925
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89507: 0.3534048795700073}, 'anatomy': {72085: 0.3653343915939331, 64096: 0.31450724601745605}, 'disease': {32052: 0.41795244812965393}}
Accuracies with context: {'effect/phenotype': {89507: 1}, 'anatomy': {72085: 1, 64096: 0}, 'disease': {32052: 1}}
Cop confidences with context: {'effect/phenotype': {89507: 0.3534048795700073}, 'anatomy': {72085: 0.3653343915939331, 64096: 0.27972856163978577}, 'disease': {32052: 0.41795244812965393}}


 62%|██████▏   | 109223/177004 [11:10:13<72:44:07,  3.86s/it]

Wrong response format. Node 37919 ignored
Wrong response format. Node 14784 ignored
Wrong response format. Node 99907 ignored
Wrong response format. Node 97255 ignored
Wrong response format. Node 97338 ignored


 62%|██████▏   | 109232/177004 [11:10:45<71:51:45,  3.82s/it]

Wrong response format. Node 30094 ignored


 62%|██████▏   | 109279/177004 [11:13:39<94:37:32,  5.03s/it]

Wrong response format. Node 94682 ignored


 62%|██████▏   | 109301/177004 [11:15:03<82:32:30,  4.39s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3188415467739105
Cop confidence without context: 0.17200210690498352
LLM is correct without context: False
Confidences with context: {'disease': {37574: 0.34569835662841797, 96898: 0.33918488025665283, 84014: 0.3339506983757019, 99610: 0.26741841435432434}, 'gene/protein': {5682: 0.27068617939949036, 22082: 0.2846944034099579}, 'drug': {14291: 0.7464306354522705, 15419: 0.39335742592811584, 14545: 0.42348429560661316, 20862: 0.3803292214870453, 14309: 0.9650592803955078, 21454: 0.2900632321834564, 21451: 0.3087858259677887, 20269: 0.40484291315078735, 21449: 0.3146876394748688, 21457: 0.30642473697662354}}
Accuracies with context: {'disease': {37574: 0, 96898: 0, 84014: 0, 99610: 0}, 'gene/protein': {5682: 0, 22082: 0}, 'drug': {14291: 0, 15419: 0, 14545: 1, 20862: 0, 14309: 0, 21454: 0, 21451: 0, 20269: 0, 21449: 0, 21457: 0}}
Cop confidences with context: {'disease': {37574: 0.15827225148677826, 96898: 0.164

 62%|██████▏   | 109373/177004 [11:19:09<54:30:11,  2.90s/it]

Wrong response format. Node 99645 ignored
Wrong response format. Node 95177 ignored


 62%|██████▏   | 109393/177004 [11:20:10<49:29:22,  2.64s/it]

Wrong response format. Question 109392 ignored


 62%|██████▏   | 109401/177004 [11:20:33<38:55:25,  2.07s/it]

Wrong response format. Question 109400 ignored


 62%|██████▏   | 109452/177004 [11:23:39<54:29:12,  2.90s/it]

Wrong response format. Node 95805 ignored


 62%|██████▏   | 109501/177004 [11:26:56<80:07:49,  4.27s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.34794488549232483
Cop confidence without context: 0.1508229523897171
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {92482: 0.3247051239013672, 22722: 0.4551525413990021, 24597: 0.48920685052871704, 87553: 0.4745357036590576}, 'disease': {32358: 0.4198493957519531, 36034: 0.37766870856285095, 29046: 0.36722332239151, 31106: 0.38733983039855957, 33340: 0.44554078578948975, 31727: 0.46450427174568176, 27533: 0.431469202041626, 38475: 0.3566600978374481, 28797: 0.46824556589126587, 31473: 0.5640418529510498, 31933: 0.4920782744884491}}
Accuracies with context: {'effect/phenotype': {92482: 0, 22722: 0, 24597: 0, 87553: 0}, 'disease': {32358: 0, 36034: 0, 29046: 0, 31106: 0, 33340: 0, 31727: 0, 27533: 0, 38475: 0, 28797: 0, 31473: 0, 31933: 0}}
Cop confidences with context: {'effect/phenotype': {92482: 0.15701714158058167, 22722: 0.11966487765312195, 24597: 0.10016791522502899,

 62%|██████▏   | 109519/177004 [11:28:00<51:34:20,  2.75s/it]

Wrong response format. Question 109518 ignored


 62%|██████▏   | 109526/177004 [11:28:18<40:24:41,  2.16s/it]

Wrong response format. Question 109525 ignored


 62%|██████▏   | 109577/177004 [11:31:05<66:56:25,  3.57s/it]

Wrong response format. Node 92752 ignored


 62%|██████▏   | 109578/177004 [11:31:05<49:54:25,  2.66s/it]

Wrong response format. Question 109577 ignored


 62%|██████▏   | 109601/177004 [11:32:18<55:54:19,  2.99s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.28065699338912964
Cop confidence without context: 0.18120577931404114
LLM is correct without context: False
Confidences with context: {'disease': {98537: 0.33523333072662354, 99854: 0.321824312210083, 38091: 0.32738789916038513, 30637: 0.34185582399368286}, 'effect/phenotype': {85326: 0.29827287793159485, 86628: 0.29992520809173584}}
Accuracies with context: {'disease': {98537: 0, 99854: 0, 38091: 0, 30637: 0}, 'effect/phenotype': {85326: 0, 86628: 0}}
Cop confidences with context: {'disease': {98537: 0.1725633293390274, 99854: 0.19982527196407318, 38091: 0.17939376831054688, 30637: 0.18014536798000336}, 'effect/phenotype': {85326: 0.18665440380573273, 86628: 0.18477855622768402}}


 62%|██████▏   | 109701/177004 [11:37:32<72:33:17,  3.88s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3441349267959595
Cop confidence without context: 0.19762006402015686
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {22952: 0.35230162739753723, 88956: 0.36412671208381653, 91848: 0.3655441999435425, 86004: 0.3246089816093445, 92528: 0.3458367884159088}, 'disease': {32932: 0.3393772542476654, 98566: 0.35539257526397705, 33080: 0.337077796459198, 30357: 0.3384009301662445, 95783: 0.3599056303501129, 95433: 0.3454311490058899, 99623: 0.34583038091659546, 33473: 0.3516709804534912, 32413: 0.36686980724334717, 30409: 0.3573407530784607, 32464: 0.34681063890457153, 27731: 0.3307541608810425, 33079: 0.34041088819503784}}
Accuracies with context: {'effect/phenotype': {22952: 0, 88956: 0, 91848: 0, 86004: 0, 92528: 0}, 'disease': {32932: 0, 98566: 0, 33080: 0, 30357: 0, 95783: 0, 95433: 0, 99623: 0, 33473: 0, 32413: 0, 30409: 0, 32464: 0, 27731: 0, 33079: 0}}
Cop confidences with 

 62%|██████▏   | 109773/177004 [11:41:45<34:21:01,  1.84s/it]

Wrong response format. Question 109772 ignored


 62%|██████▏   | 109774/177004 [11:41:48<38:40:27,  2.07s/it]

Wrong response format. Node 14054 ignored


 62%|██████▏   | 109788/177004 [11:42:37<56:52:09,  3.05s/it]

Wrong response format. Node 9521 ignored


 62%|██████▏   | 109801/177004 [11:43:25<69:22:05,  3.72s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2771373689174652
Cop confidence without context: 0.24457286298274994
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {89336: 0.36368298530578613, 88330: 0.5288742184638977, 92409: 0.6211793422698975}, 'disease': {33591: 0.7281243801116943, 33575: 0.7181097865104675, 95431: 0.9042908549308777, 94640: 0.43998071551322937, 98538: 0.6202593445777893, 32983: 0.5135622024536133, 33086: 0.3473767638206482, 32642: 0.3692932724952698, 84173: 0.3459114134311676, 99143: 0.6610827445983887, 33489: 0.591566264629364, 98053: 0.3412695527076721}}
Accuracies with context: {'effect/phenotype': {89336: 1, 88330: 0, 92409: 0}, 'disease': {33591: 0, 33575: 0, 95431: 0, 94640: 1, 98538: 0, 32983: 0, 33086: 1, 32642: 1, 84173: 1, 99143: 0, 33489: 0, 98053: 0}}
Cop confidences with context: {'effect/phenotype': {89336: 0.36368298530578613, 88330: 0.23286016285419464, 92409: 0.20009803771972656}, 

 62%|██████▏   | 109861/177004 [11:46:31<40:26:25,  2.17s/it]

Wrong response format. Question 109860 ignored


 62%|██████▏   | 109901/177004 [11:48:50<77:52:05,  4.18s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.4098037779331207
Cop confidence without context: 0.27946189045906067
LLM is correct without context: False
Confidences with context: {'disease': {31913: 0.41490429639816284, 36645: 0.3673585057258606, 38020: 0.4029983878135681, 35563: 0.38748225569725037, 83922: 0.5581239461898804, 95111: 0.6915084719657898, 97454: 0.30731555819511414, 39417: 0.33609429001808167, 28184: 0.3665129542350769, 37611: 0.5048354268074036, 97157: 0.32597970962524414, 97155: 0.29483553767204285}, 'effect/phenotype': {91750: 0.5170649290084839, 24513: 0.3408483564853668, 93661: 0.3806028366088867, 88349: 0.3001260757446289, 88350: 0.3700384497642517}}
Accuracies with context: {'disease': {31913: 0, 36645: 0, 38020: 0, 35563: 0, 83922: 1, 95111: 0, 97454: 1, 39417: 0, 28184: 0, 37611: 1, 97157: 0, 97155: 0}, 'effect/phenotype': {91750: 0, 24513: 0, 93661: 0, 88349: 0, 88350: 0}}
Cop confidences with context: {'disease': {31913: 0.282940

 62%|██████▏   | 109940/177004 [11:50:56<34:32:16,  1.85s/it]

Wrong response format. Question 109939 ignored


 62%|██████▏   | 109977/177004 [11:53:11<50:50:02,  2.73s/it]

Wrong response format. Node 31974 ignored


 62%|██████▏   | 110001/177004 [11:54:29<60:33:48,  3.25s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3160052001476288
Cop confidence without context: 0.24228960275650024
LLM is correct without context: False
Confidences with context: {'disease': {28313: 0.3270955979824066, 99810: 0.34105440974235535, 32965: 0.33755314350128174, 84294: 0.3392925560474396, 97059: 0.3742172420024872}, 'effect/phenotype': {93824: 0.3977343440055847}, 'gene/protein': {12989: 0.33091890811920166, 6181: 0.3349292278289795, 13284: 0.33388784527778625, 33814: 0.33208656311035156, 8572: 0.3308391273021698, 3746: 0.3201993703842163, 608: 0.3324406147003174}}
Accuracies with context: {'disease': {28313: 0, 99810: 0, 32965: 0, 84294: 0, 97059: 0}, 'effect/phenotype': {93824: 0}, 'gene/protein': {12989: 0, 6181: 0, 13284: 0, 33814: 0, 8572: 0, 3746: 0, 608: 0}}
Cop confidences with context: {'disease': {28313: 0.20469120144844055, 99810: 0.20525017380714417, 32965: 0.19689300656318665, 84294: 0.20102418959140778, 97059: 0.19414134323596954

 62%|██████▏   | 110096/177004 [11:59:46<65:01:30,  3.50s/it]

Wrong response format. Question 110095 ignored


 62%|██████▏   | 110101/177004 [12:00:06<72:23:13,  3.90s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3136744201183319
Cop confidence without context: 0.12772992253303528
LLM is correct without context: False
Confidences with context: {'disease': {84072: 0.3025335669517517, 32589: 0.3340659439563751, 35956: 0.3487418591976166, 30075: 0.33400556445121765, 33077: 0.325952410697937, 97036: 0.28864115476608276, 94801: 0.28517410159111023, 97389: 0.365993857383728, 97274: 0.3524678349494934, 97397: 0.27197355031967163, 97700: 0.27522289752960205, 97686: 0.2770731449127197, 28815: 0.39557868242263794, 95349: 0.314432293176651}, 'effect/phenotype': {90237: 0.3745749592781067}, 'gene/protein': {5039: 0.3214539885520935}}
Accuracies with context: {'disease': {84072: 0, 32589: 0, 35956: 0, 30075: 0, 33077: 0, 97036: 0, 94801: 0, 97389: 0, 97274: 0, 97397: 0, 97700: 0, 97686: 0, 28815: 0, 95349: 0}, 'effect/phenotype': {90237: 0}, 'gene/protein': {5039: 0}}
Cop confidences with context: {'disease': {84072: 0.109570391476

 62%|██████▏   | 110157/177004 [12:03:10<71:39:30,  3.86s/it]

Wrong response format. Node 14054 ignored


 62%|██████▏   | 110201/177004 [12:05:45<60:46:17,  3.27s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.7903444766998291
Cop confidence without context: 0.7903444766998291
LLM is correct without context: True
Confidences with context: {'disease': {33038: 0.7142581939697266, 38559: 0.6746845841407776}, 'drug': {17606: 0.8607472777366638, 15525: 0.515087902545929, 14273: 0.6065386533737183, 15781: 0.4484366178512573, 21652: 0.5886825323104858, 14069: 0.5163788795471191, 19246: 0.5809786319732666, 17423: 0.5145502686500549, 16195: 0.6229913830757141, 16756: 0.48977118730545044, 21461: 0.48368287086486816, 21470: 0.6265513896942139}, 'effect/phenotype': {94382: 0.42551836371421814}}
Accuracies with context: {'disease': {33038: 1, 38559: 1}, 'drug': {17606: 1, 15525: 1, 14273: 0, 15781: 1, 21652: 1, 14069: 1, 19246: 1, 17423: 1, 16195: 1, 16756: 1, 21461: 1, 21470: 1}, 'effect/phenotype': {94382: 1}}
Cop confidences with context: {'disease': {33038: 0.7142581939697266, 38559: 0.6746845841407776}, 'drug': {17606: 0.86

 62%|██████▏   | 110270/177004 [12:09:38<50:50:03,  2.74s/it]

Wrong response format. Question 110269 ignored


 62%|██████▏   | 110301/177004 [12:11:20<53:28:06,  2.89s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.38259464502334595
Cop confidence without context: 0.12518449127674103
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {86885: 0.35258740186691284}, 'disease': {37788: 0.4845077395439148}, 'anatomy': {64617: 0.3824473023414612, 66119: 0.751608669757843, 71555: 0.49738165736198425, 71563: 0.47424036264419556, 71544: 0.49638831615448}}
Accuracies with context: {'effect/phenotype': {86885: 0}, 'disease': {37788: 0}, 'anatomy': {64617: 0, 66119: 0, 71555: 0, 71563: 0, 71544: 0}}
Cop confidences with context: {'effect/phenotype': {86885: 0.1516459584236145}, 'disease': {37788: 0.11966496706008911}, 'anatomy': {64617: 0.17104101181030273, 66119: 0.04409199580550194, 71555: 0.11630693078041077, 71563: 0.12179499864578247, 71544: 0.11338578909635544}}


 62%|██████▏   | 110361/177004 [12:14:49<68:15:10,  3.69s/it]

Wrong response format. Node 21447 ignored
Wrong response format. Node 21449 ignored
Wrong response format. Node 21453 ignored
Wrong response format. Node 14046 ignored


 62%|██████▏   | 110401/177004 [12:17:05<69:56:09,  3.78s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.30786481499671936
Cop confidence without context: 0.23238855600357056
LLM is correct without context: False
Confidences with context: {'disease': {95030: 0.39569297432899475, 37819: 0.3768809139728546, 32231: 0.3859889507293701, 30516: 0.35487785935401917, 28767: 0.39278438687324524, 27695: 0.365897536277771, 97434: 0.37421488761901855, 38243: 0.3759586215019226}, 'anatomy': {76314: 0.3546069860458374}, 'effect/phenotype': {23400: 0.3480086028575897, 90746: 0.32459238171577454, 93888: 0.3726772367954254, 93884: 0.3666199743747711}}
Accuracies with context: {'disease': {95030: 0, 37819: 0, 32231: 0, 30516: 0, 28767: 0, 27695: 0, 97434: 0, 38243: 0}, 'anatomy': {76314: 0}, 'effect/phenotype': {23400: 0, 90746: 0, 93888: 0, 93884: 0}}
Cop confidences with context: {'disease': {95030: 0.21346034109592438, 37819: 0.22504587471485138, 32231: 0.22165480256080627, 30516: 0.2327345907688141, 28767: 0.21024233102798462,

 62%|██████▏   | 110451/177004 [12:19:49<61:40:18,  3.34s/it]

Wrong response format. Node 97543 ignored
Wrong response format. Node 98899 ignored
Wrong response format. Node 39607 ignored


 62%|██████▏   | 110501/177004 [12:22:36<48:32:46,  2.63s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.34109583497047424
Cop confidence without context: 0.1367426961660385
LLM is correct without context: False
Confidences with context: {'disease': {99428: 0.28169360756874084, 97063: 0.349407821893692, 35832: 0.2948566675186157, 39823: 0.3367083668708801, 36717: 0.8563553094863892, 83762: 0.28361037373542786, 99835: 0.3312288820743561}, 'anatomy': {67348: 0.3966648280620575}}
Accuracies with context: {'disease': {99428: 0, 97063: 0, 35832: 0, 39823: 0, 36717: 0, 83762: 0, 99835: 0}, 'anatomy': {67348: 0}}
Cop confidences with context: {'disease': {99428: 0.1966538280248642, 97063: 0.15027843415737152, 35832: 0.1553785502910614, 39823: 0.14941343665122986, 36717: 0.010366950184106827, 83762: 0.17472727596759796, 99835: 0.16017183661460876}, 'anatomy': {67348: 0.36116743087768555}}


 62%|██████▏   | 110536/177004 [12:24:23<35:36:49,  1.93s/it]

Wrong response format. Question 110535 ignored


 62%|██████▏   | 110596/177004 [12:27:42<50:58:36,  2.76s/it]

Wrong response format. Question 110595 ignored


 62%|██████▏   | 110601/177004 [12:28:01<63:30:51,  3.44s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2674328684806824
Cop confidence without context: 0.2397252768278122
LLM is correct without context: False
Confidences with context: {'disease': {32700: 0.3046117424964905, 28665: 0.2992282509803772}, 'effect/phenotype': {23399: 0.29074928164482117}, 'drug': {16180: 0.27177175879478455}, 'anatomy': {71081: 0.27908164262771606}, 'gene/protein': {8248: 0.2790991961956024}}
Accuracies with context: {'disease': {32700: 0, 28665: 0}, 'effect/phenotype': {23399: 0}, 'drug': {16180: 0}, 'anatomy': {71081: 0}, 'gene/protein': {8248: 0}}
Cop confidences with context: {'disease': {32700: 0.1997692584991455, 28665: 0.17728687822818756}, 'effect/phenotype': {23399: 0.1936807930469513}, 'drug': {16180: 0.1882508099079132}, 'anatomy': {71081: 0.20578190684318542}, 'gene/protein': {8248: 0.2173626720905304}}


 63%|██████▎   | 110701/177004 [12:33:58<52:05:26,  2.83s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.38164255023002625
Cop confidence without context: 0.2643560767173767
LLM is correct without context: False
Confidences with context: {'anatomy': {63540: 0.31556954979896545, 64731: 0.3968222737312317, 71407: 0.3637154698371887, 71406: 0.32726576924324036}, 'effect/phenotype': {23477: 0.4150179624557495, 24634: 0.3414651155471802, 89475: 0.535085141658783}, 'disease': {32854: 0.4229185879230499}}
Accuracies with context: {'anatomy': {63540: 0, 64731: 0, 71407: 0, 71406: 0}, 'effect/phenotype': {23477: 0, 24634: 0, 89475: 0}, 'disease': {32854: 0}}
Cop confidences with context: {'anatomy': {63540: 0.24769335985183716, 64731: 0.18167848885059357, 71407: 0.2276073396205902, 71406: 0.2357207089662552}, 'effect/phenotype': {23477: 0.15148843824863434, 24634: 0.1885749250650406, 89475: 0.15092769265174866}, 'disease': {32854: 0.20291855931282043}}


 63%|██████▎   | 110801/177004 [12:39:31<73:31:52,  4.00s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.36902567744255066
Cop confidence without context: 0.36902567744255066
LLM is correct without context: True
Confidences with context: {'disease': {31502: 0.32796019315719604, 35969: 0.3434639573097229, 38787: 0.3519361615180969, 36650: 0.37010496854782104, 38717: 0.33345887064933777}, 'anatomy': {63907: 0.3532663583755493}, 'gene/protein': {5769: 0.4018361270427704, 5732: 0.3592045307159424, 5163: 0.3636740446090698, 13851: 0.33767765760421753}, 'effect/phenotype': {93662: 0.3571998178958893, 93628: 0.34589651226997375}}
Accuracies with context: {'disease': {31502: 0, 35969: 0, 38787: 0, 36650: 0, 38717: 0}, 'anatomy': {63907: 0}, 'gene/protein': {5769: 0, 5732: 0, 5163: 0, 13851: 0}, 'effect/phenotype': {93662: 0, 93628: 0}}
Cop confidences with context: {'disease': {31502: 0.3105064928531647, 35969: 0.29149410128593445, 38787: 0.25748199224472046, 36650: 0.23709814250469208, 38717: 0.28745952248573303}, 'anat

 63%|██████▎   | 110901/177004 [12:45:27<81:04:12,  4.42s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.31242406368255615
Cop confidence without context: 0.31242406368255615
LLM is correct without context: True
Confidences with context: {'drug': {14593: 0.262002557516098, 16086: 0.30053621530532837, 14871: 0.7643582224845886, 17225: 0.5907178521156311, 16589: 0.34958967566490173, 16195: 0.26976191997528076, 21203: 0.29508841037750244, 21516: 0.2684963345527649}, 'disease': {32563: 0.3916269540786743, 83907: 0.2768951654434204}}
Accuracies with context: {'drug': {14593: 1, 16086: 1, 14871: 0, 17225: 0, 16589: 0, 16195: 0, 21203: 0, 21516: 1}, 'disease': {32563: 1, 83907: 0}}
Cop confidences with context: {'drug': {14593: 0.262002557516098, 16086: 0.30053621530532837, 14871: 0.10344462841749191, 17225: 0.12190151959657669, 16589: 0.22395555675029755, 16195: 0.24181301891803741, 21203: 0.22625213861465454, 21516: 0.2684963345527649}, 'disease': {32563: 0.3916269540786743, 83907: 0.2462756633758545}}


 63%|██████▎   | 110942/177004 [12:47:40<47:33:17,  2.59s/it]

Wrong response format. Node 91661 ignored


 63%|██████▎   | 110945/177004 [12:47:45<35:43:50,  1.95s/it]

Wrong response format. Question 110944 ignored


 63%|██████▎   | 111001/177004 [12:51:10<65:04:04,  3.55s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.41134604811668396
Cop confidence without context: 0.41134604811668396
LLM is correct without context: True
Confidences with context: {'drug': {21537: 0.413005530834198, 20647: 0.45661741495132446, 20607: 0.38008546829223633, 21539: 0.48669472336769104}, 'disease': {31464: 0.460406631231308}, 'effect/phenotype': {22476: 0.5123836994171143, 86462: 0.411002516746521, 85418: 0.4584980010986328, 90960: 0.5021864771842957, 92165: 0.4706941843032837, 89721: 0.4857404828071594, 92343: 0.4661724865436554, 85571: 0.44413694739341736, 93869: 0.6524744033813477}, 'gene/protein': {7461: 0.4062288999557495}}
Accuracies with context: {'drug': {21537: 1, 20647: 1, 20607: 1, 21539: 1}, 'disease': {31464: 1}, 'effect/phenotype': {22476: 1, 86462: 1, 85418: 1, 90960: 1, 92165: 1, 89721: 1, 92343: 1, 85571: 1, 93869: 1}, 'gene/protein': {7461: 1}}
Cop confidences with context: {'drug': {21537: 0.413005530834198, 20647: 0.45661741

 63%|██████▎   | 111101/177004 [12:56:40<69:49:11,  3.81s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.8863089680671692
Cop confidence without context: 0.024560222402215004
LLM is correct without context: False
Confidences with context: {'disease': {38941: 0.8031737804412842, 84118: 0.3839966058731079, 37552: 0.4525721073150635, 38099: 0.5701082348823547, 37553: 0.5844495296478271, 37562: 0.5279871821403503, 96880: 0.5751921534538269}, 'gene/protein': {1592: 0.6405513882637024, 13169: 0.5811240077018738, 9524: 0.41709163784980774, 1460: 0.3724519610404968, 9538: 0.4497019946575165}}
Accuracies with context: {'disease': {38941: 0, 84118: 0, 37552: 0, 38099: 0, 37553: 0, 37562: 0, 96880: 0}, 'gene/protein': {1592: 0, 13169: 1, 9524: 0, 1460: 0, 9538: 0}}
Cop confidences with context: {'disease': {38941: 0.052153754979372025, 84118: 0.17039746046066284, 37552: 0.10418801754713058, 38099: 0.09602189809083939, 37553: 0.0969112291932106, 37562: 0.11598338931798935, 96880: 0.1023237332701683}, 'gene/protein': {1592: 0

 63%|██████▎   | 111201/177004 [13:02:09<73:57:03,  4.05s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2909417450428009
Cop confidence without context: 0.25277456641197205
LLM is correct without context: False
Confidences with context: {'anatomy': {68655: 0.28541383147239685}, 'effect/phenotype': {84715: 0.24399930238723755, 84687: 0.2608714997768402, 26604: 0.24286498129367828, 92533: 0.2389388531446457}, 'disease': {98916: 0.26167982816696167, 99091: 0.25815847516059875, 29056: 0.26121047139167786, 32179: 0.2503120005130768, 32052: 0.24348017573356628, 31904: 0.26354455947875977}}
Accuracies with context: {'anatomy': {68655: 0}, 'effect/phenotype': {84715: 0, 84687: 0, 26604: 0, 92533: 0}, 'disease': {98916: 1, 99091: 0, 29056: 0, 32179: 0, 32052: 1, 31904: 0}}
Cop confidences with context: {'anatomy': {68655: 0.25584331154823303}, 'effect/phenotype': {84715: 0.2153286188840866, 84687: 0.23752620816230774, 26604: 0.18476194143295288, 92533: 0.21926261484622955}, 'disease': {98916: 0.26167982816696167, 99091: 

 63%|██████▎   | 111219/177004 [13:03:10<38:41:49,  2.12s/it]

Wrong response format. Question 111218 ignored


 63%|██████▎   | 111270/177004 [13:05:49<62:17:00,  3.41s/it]

Wrong response format. Node 25944 ignored


 63%|██████▎   | 111301/177004 [13:07:48<62:36:04,  3.43s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3454551100730896
Cop confidence without context: 0.18204206228256226
LLM is correct without context: False
Confidences with context: {'disease': {97048: 0.28169742226600647}}
Accuracies with context: {'disease': {97048: 0}}
Cop confidences with context: {'disease': {97048: 0.19512610137462616}}


 63%|██████▎   | 111379/177004 [13:12:30<85:40:35,  4.70s/it]

Wrong response format. Node 31661 ignored
Wrong response format. Node 94925 ignored
Wrong response format. Node 94924 ignored


 63%|██████▎   | 111401/177004 [13:14:13<93:09:54,  5.11s/it] 

Example Feedback:

Response without context: C
Confidence without context: 0.7177563905715942
Cop confidence without context: 0.7177563905715942
LLM is correct without context: True
Confidences with context: {'gene/protein': {5628: 0.4066484570503235}, 'drug': {15834: 0.5690093040466309, 21231: 0.8395487070083618, 14244: 0.9636591672897339, 14312: 0.9034988284111023, 14676: 0.3649301528930664, 15545: 0.3515179753303528, 14214: 0.4763806462287903, 21634: 0.4414743185043335, 17172: 0.3894551694393158, 14749: 0.4831490218639374, 21411: 0.41642823815345764, 17207: 0.38307517766952515, 21635: 0.5772679448127747}}
Accuracies with context: {'gene/protein': {5628: 1}, 'drug': {15834: 0, 21231: 0, 14244: 1, 14312: 0, 14676: 0, 15545: 0, 14214: 1, 21634: 1, 17172: 0, 14749: 1, 21411: 0, 17207: 1, 21635: 1}}
Cop confidences with context: {'gene/protein': {5628: 0.4066484570503235}, 'drug': {15834: 0.0758131593465805, 21231: 0.0414733923971653, 14244: 0.9636591672897339, 14312: 0.01720736175775528

 63%|██████▎   | 111501/177004 [13:19:33<75:58:05,  4.18s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4142346680164337
Cop confidence without context: 0.1700015515089035
LLM is correct without context: False
Confidences with context: {'disease': {30430: 0.38394713401794434, 95016: 0.5802505612373352}, 'effect/phenotype': {91261: 0.3630738854408264, 85395: 0.43330472707748413, 85340: 0.4241516888141632}, 'anatomy': {63818: 0.3312566578388214, 63825: 0.44903308153152466}, 'gene/protein': {6723: 0.4090382158756256, 35119: 0.37969884276390076, 1185: 0.40495938062667847, 2370: 0.4007790684700012, 7291: 0.41038766503334045, 35118: 0.38585519790649414, 4058: 0.3880400061607361, 11096: 0.4092189073562622, 12927: 0.424328476190567, 13188: 0.4009757339954376}}
Accuracies with context: {'disease': {30430: 0, 95016: 0}, 'effect/phenotype': {91261: 0, 85395: 0, 85340: 0}, 'anatomy': {63818: 0, 63825: 0}, 'gene/protein': {6723: 0, 35119: 0, 1185: 0, 2370: 0, 7291: 0, 35118: 0, 4058: 0, 11096: 0, 12927: 0, 13188: 0}}
Cop con

 63%|██████▎   | 111601/177004 [13:24:58<58:02:04,  3.19s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.31202828884124756
Cop confidence without context: 0.31202828884124756
LLM is correct without context: True
Confidences with context: {'disease': {96048: 0.3620143532752991}, 'anatomy': {70398: 0.3580701947212219, 69543: 0.40518617630004883, 76357: 0.3420831561088562, 76359: 0.33832848072052}}
Accuracies with context: {'disease': {96048: 1}, 'anatomy': {70398: 1, 69543: 1, 76357: 1, 76359: 1}}
Cop confidences with context: {'disease': {96048: 0.3620143532752991}, 'anatomy': {70398: 0.3580701947212219, 69543: 0.40518617630004883, 76357: 0.3420831561088562, 76359: 0.33832848072052}}


 63%|██████▎   | 111694/177004 [13:29:36<40:25:03,  2.23s/it]

Wrong response format. Question 111693 ignored


 63%|██████▎   | 111701/177004 [13:29:57<55:40:00,  3.07s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.5983771085739136
Cop confidence without context: 0.5983771085739136
LLM is correct without context: True
Confidences with context: {'disease': {33572: 0.4828193187713623, 100001: 0.3526701033115387, 84208: 0.3962244689464569, 97074: 0.32060369849205017, 96890: 0.34506720304489136}, 'anatomy': {74319: 0.4874192476272583}}
Accuracies with context: {'disease': {33572: 1, 100001: 1, 84208: 1, 97074: 1, 96890: 1}, 'anatomy': {74319: 1}}
Cop confidences with context: {'disease': {33572: 0.4828193187713623, 100001: 0.3526701033115387, 84208: 0.3962244689464569, 97074: 0.32060369849205017, 96890: 0.34506720304489136}, 'anatomy': {74319: 0.4874192476272583}}


 63%|██████▎   | 111739/177004 [13:32:01<66:33:09,  3.67s/it]

Wrong response format. Node 91834 ignored


 63%|██████▎   | 111744/177004 [13:32:17<49:43:18,  2.74s/it]

Wrong response format. Question 111743 ignored


 63%|██████▎   | 111749/177004 [13:32:37<63:11:44,  3.49s/it]

Wrong response format. Node 85076 ignored
Wrong response format. Node 32843 ignored


 63%|██████▎   | 111801/177004 [13:35:23<59:00:18,  3.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.34886378049850464
Cop confidence without context: 0.17270009219646454
LLM is correct without context: False
Confidences with context: {'disease': {95396: 0.4011981785297394, 94806: 0.42007386684417725}, 'anatomy': {68669: 0.40111783146858215}, 'gene/protein': {3073: 0.3851260244846344, 3754: 0.40042582154273987, 6457: 0.3927476406097412}}
Accuracies with context: {'disease': {95396: 0, 94806: 0}, 'anatomy': {68669: 0}, 'gene/protein': {3073: 0, 3754: 0, 6457: 0}}
Cop confidences with context: {'disease': {95396: 0.14193838834762573, 94806: 0.14292293787002563}, 'anatomy': {68669: 0.14302298426628113}, 'gene/protein': {3073: 0.15439406037330627, 3754: 0.15080174803733826, 6457: 0.14561697840690613}}


 63%|██████▎   | 111821/177004 [13:36:29<60:06:52,  3.32s/it]

Wrong response format. Node 99927 ignored
Wrong response format. Node 87330 ignored


 63%|██████▎   | 111901/177004 [13:40:45<42:23:29,  2.34s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.30562111735343933
Cop confidence without context: 0.24176622927188873
LLM is correct without context: False
Confidences with context: {'disease': {99610: 0.29586076736450195, 83846: 0.3239215612411499, 39733: 0.35645511746406555, 36130: 0.2946590483188629, 33596: 0.4353410303592682, 96898: 0.3628688454627991, 38087: 0.36062759160995483, 99538: 0.3251935541629791}, 'effect/phenotype': {94442: 0.5396350622177124}}
Accuracies with context: {'disease': {99610: 0, 83846: 0, 39733: 1, 36130: 0, 33596: 0, 96898: 0, 38087: 0, 99538: 1}, 'effect/phenotype': {94442: 0}}
Cop confidences with context: {'disease': {99610: 0.2672879099845886, 83846: 0.19191718101501465, 39733: 0.35645511746406555, 36130: 0.1932421326637268, 33596: 0.17866359651088715, 96898: 0.20355096459388733, 38087: 0.18855850398540497, 99538: 0.3251935541629791}, 'effect/phenotype': {94442: 0.15340487658977509}}


 63%|██████▎   | 111912/177004 [13:41:18<50:12:13,  2.78s/it]

Wrong response format. Node 17606 ignored


 63%|██████▎   | 111942/177004 [13:42:52<52:06:46,  2.88s/it]

Wrong response format. Node 92615 ignored


 63%|██████▎   | 111985/177004 [13:45:22<67:18:10,  3.73s/it]

Wrong response format. Node 14312 ignored
Wrong response format. Node 14681 ignored
Wrong response format. Node 14749 ignored


 63%|██████▎   | 112001/177004 [13:46:03<35:50:46,  1.99s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3174962103366852
Cop confidence without context: 0.23965871334075928
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {85387: 0.3822003901004791, 84702: 0.34469491243362427}, 'disease': {84046: 0.35912832617759705}}
Accuracies with context: {'effect/phenotype': {85387: 0, 84702: 0}, 'disease': {84046: 0}}
Cop confidences with context: {'effect/phenotype': {85387: 0.23917490243911743, 84702: 0.26018938422203064}, 'disease': {84046: 0.25071197748184204}}


 63%|██████▎   | 112101/177004 [13:51:35<60:47:27,  3.37s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2970283329486847
Cop confidence without context: 0.2725685238838196
LLM is correct without context: False
Confidences with context: {'disease': {39751: 0.2674868404865265, 29173: 0.30376550555229187, 96241: 0.301235556602478}, 'gene/protein': {1054: 0.24745608866214752, 34744: 0.24977825582027435, 5269: 0.26877880096435547, 3339: 0.27060094475746155, 6058: 0.25477463006973267, 4062: 0.2637163996696472, 10004: 0.2519550621509552, 1492: 0.252788782119751, 219: 0.26907896995544434}}
Accuracies with context: {'disease': {39751: 0, 29173: 0, 96241: 0}, 'gene/protein': {1054: 0, 34744: 0, 5269: 0, 3339: 0, 6058: 0, 4062: 0, 10004: 0, 1492: 0, 219: 0}}
Cop confidences with context: {'disease': {39751: 0.19569754600524902, 29173: 0.23109260201454163, 96241: 0.21697184443473816}, 'gene/protein': {1054: 0.21166041493415833, 34744: 0.2103343904018402, 5269: 0.21096687018871307, 3339: 0.23694610595703125, 6058: 0.21792031

 63%|██████▎   | 112201/177004 [13:57:17<62:29:19,  3.47s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.358805775642395
Cop confidence without context: 0.25245141983032227
LLM is correct without context: False
Confidences with context: {'anatomy': {74023: 0.36690443754196167}, 'disease': {33591: 0.6151197552680969, 31661: 0.29044970870018005, 33128: 0.32316088676452637, 30061: 0.78508061170578, 98538: 0.5202227830886841, 83768: 0.40319955348968506, 84322: 0.4480619728565216, 95774: 0.39759954810142517, 99599: 0.401455283164978, 94925: 0.35714849829673767, 83951: 0.4152888357639313, 97672: 0.36996224522590637, 97867: 0.43810510635375977, 94924: 0.3391082286834717}}
Accuracies with context: {'anatomy': {74023: 0}, 'disease': {33591: 1, 31661: 0, 33128: 1, 30061: 0, 98538: 1, 83768: 0, 84322: 0, 95774: 0, 99599: 1, 94925: 1, 83951: 0, 97672: 0, 97867: 0, 94924: 1}}
Cop confidences with context: {'anatomy': {74023: 0.23689131438732147}, 'disease': {33591: 0.6151197552680969, 31661: 0.28594669699668884, 33128: 0.3231

 63%|██████▎   | 112301/177004 [14:02:26<55:31:43,  3.09s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.40050604939460754
Cop confidence without context: 0.27099597454071045
LLM is correct without context: False
Confidences with context: {'drug': {14646: 0.33197325468063354, 20269: 0.2827712893486023, 20276: 0.29141098260879517}, 'disease': {30272: 0.3739789128303528, 97516: 0.2876097857952118, 38659: 0.39002737402915955}, 'effect/phenotype': {92290: 0.2931319773197174}, 'anatomy': {70420: 0.28312674164772034}}
Accuracies with context: {'drug': {14646: 0, 20269: 0, 20276: 0}, 'disease': {30272: 0, 97516: 0, 38659: 0}, 'effect/phenotype': {92290: 1}, 'anatomy': {70420: 0}}
Cop confidences with context: {'drug': {14646: 0.30943310260772705, 20269: 0.2635718286037445, 20276: 0.25318223237991333}, 'disease': {30272: 0.15835264325141907, 97516: 0.24409127235412598, 38659: 0.14125855267047882}, 'effect/phenotype': {92290: 0.2931319773197174}, 'anatomy': {70420: 0.24598479270935059}}


 63%|██████▎   | 112309/177004 [14:02:41<36:19:49,  2.02s/it]

Wrong response format. Question 112308 ignored


 64%|██████▎   | 112401/177004 [14:07:53<52:13:00,  2.91s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.34952792525291443
Cop confidence without context: 0.2517555356025696
LLM is correct without context: False
Confidences with context: {'drug': {14546: 0.2906014025211334, 16076: 0.2807002663612366, 21402: 0.29308658838272095, 21398: 0.2751918137073517, 17172: 0.2774360179901123, 14666: 0.28436583280563354, 17336: 0.27331697940826416, 21544: 0.28004270792007446, 20092: 0.3041055500507355, 21408: 0.2981707751750946, 21850: 0.29791510105133057, 14214: 0.2905876338481903}, 'disease': {83949: 0.310932457447052}}
Accuracies with context: {'drug': {14546: 1, 16076: 1, 21402: 1, 21398: 1, 17172: 0, 14666: 1, 17336: 1, 21544: 1, 20092: 1, 21408: 1, 21850: 1, 14214: 1}, 'disease': {83949: 0}}
Cop confidences with context: {'drug': {14546: 0.2906014025211334, 16076: 0.2807002663612366, 21402: 0.29308658838272095, 21398: 0.2751918137073517, 17172: 0.25458961725234985, 14666: 0.28436583280563354, 17336: 0.27331697940826416,

 64%|██████▎   | 112411/177004 [14:08:25<58:43:09,  3.27s/it]

Wrong response format. Node 88442 ignored


 64%|██████▎   | 112501/177004 [14:13:50<60:27:57,  3.37s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.832240879535675
Cop confidence without context: 0.832240879535675
LLM is correct without context: True
Confidences with context: {'disease': {98867: 0.7497968077659607, 32419: 0.6254907846450806, 32426: 0.8664109110832214, 98534: 0.8641967177391052, 32970: 0.9048147201538086}, 'effect/phenotype': {26120: 0.846128523349762, 92860: 0.8685114979743958}}
Accuracies with context: {'disease': {98867: 1, 32419: 1, 32426: 1, 98534: 0, 32970: 1}, 'effect/phenotype': {26120: 1, 92860: 1}}
Cop confidences with context: {'disease': {98867: 0.7497968077659607, 32419: 0.6254907846450806, 32426: 0.8664109110832214, 98534: 0.07610492408275604, 32970: 0.9048147201538086}, 'effect/phenotype': {26120: 0.846128523349762, 92860: 0.8685114979743958}}


 64%|██████▎   | 112560/177004 [14:17:06<56:12:50,  3.14s/it]

Wrong response format. Node 26330 ignored
Wrong response format. Node 31138 ignored
Wrong response format. Node 21935 ignored
Wrong response format. Node 91576 ignored
Wrong response format. Node 91577 ignored
Wrong response format. Node 91580 ignored


 64%|██████▎   | 112579/177004 [14:18:00<51:55:37,  2.90s/it]

Wrong response format. Node 91661 ignored


 64%|██████▎   | 112601/177004 [14:19:03<46:47:22,  2.62s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.34170833230018616
Cop confidence without context: 0.34170833230018616
LLM is correct without context: True
Confidences with context: {'anatomy': {66071: 0.2833452820777893, 70274: 0.30590906739234924, 64828: 0.2806735634803772, 70933: 0.2690567672252655, 70067: 0.271589994430542}, 'gene/protein': {1430: 0.26326245069503784}, 'effect/phenotype': {90146: 0.25056079030036926}}
Accuracies with context: {'anatomy': {66071: 0, 70274: 0, 64828: 0, 70933: 0, 70067: 0}, 'gene/protein': {1430: 0}, 'effect/phenotype': {90146: 0}}
Cop confidences with context: {'anatomy': {66071: 0.23860065639019012, 70274: 0.21024803817272186, 64828: 0.16759783029556274, 70933: 0.20791095495224, 70067: 0.23049543797969818}, 'gene/protein': {1430: 0.2378378063440323}, 'effect/phenotype': {90146: 0.2193983644247055}}


 64%|██████▎   | 112662/177004 [14:22:33<47:08:53,  2.64s/it]

Wrong response format. Question 112661 ignored


 64%|██████▎   | 112701/177004 [14:24:41<47:28:25,  2.66s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.36736613512039185
Cop confidence without context: 0.056337423622608185
LLM is correct without context: False
Confidences with context: {'disease': {39846: 0.3651594817638397, 99917: 0.34594622254371643}, 'effect/phenotype': {26143: 0.3415669798851013, 91124: 0.34174811840057373, 90904: 0.3624529540538788}}
Accuracies with context: {'disease': {39846: 0, 99917: 0}, 'effect/phenotype': {26143: 0, 91124: 0, 90904: 0}}
Cop confidences with context: {'disease': {39846: 0.08809883147478104, 99917: 0.1344202309846878}, 'effect/phenotype': {26143: 0.10832179337739944, 91124: 0.06272508949041367, 90904: 0.12428559362888336}}


 64%|██████▎   | 112750/177004 [14:27:12<70:09:34,  3.93s/it]

Wrong response format. Node 28158 ignored
Wrong response format. Node 84061 ignored
Wrong response format. Node 33765 ignored


 64%|██████▎   | 112762/177004 [14:27:53<71:42:32,  4.02s/it]

Wrong response format. Node 31493 ignored


 64%|██████▎   | 112767/177004 [14:28:06<46:06:39,  2.58s/it]

Wrong response format. Node 94850 ignored
Wrong response format. Node 96486 ignored


 64%|██████▎   | 112801/177004 [14:29:45<41:57:09,  2.35s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.30421531200408936
Cop confidence without context: 0.2090839147567749
LLM is correct without context: False
Confidences with context: {'anatomy': {68181: 0.3608156144618988}}
Accuracies with context: {'anatomy': {68181: 0}}
Cop confidences with context: {'anatomy': {68181: 0.20239920914173126}}


 64%|██████▍   | 112889/177004 [14:34:23<70:59:44,  3.99s/it]

Wrong response format. Node 25730 ignored
Wrong response format. Node 92615 ignored


 64%|██████▍   | 112901/177004 [14:35:05<64:13:15,  3.61s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3415285050868988
Cop confidence without context: 0.3415285050868988
LLM is correct without context: True
Confidences with context: {'disease': {84226: 0.43217459321022034}, 'drug': {14735: 0.8817490935325623, 18410: 0.7305915951728821, 17318: 0.9696858525276184, 18403: 0.29084959626197815, 16461: 0.2940995395183563, 21637: 0.2635112404823303, 15932: 0.2761482894420624, 21203: 0.28225046396255493, 21641: 0.2906230390071869, 16195: 0.3063383102416992, 15528: 0.29965755343437195, 21634: 0.28666815161705017, 21636: 0.33310845494270325}}
Accuracies with context: {'disease': {84226: 1}, 'drug': {14735: 1, 18410: 0, 17318: 0, 18403: 0, 16461: 0, 21637: 0, 15932: 0, 21203: 0, 21641: 0, 16195: 0, 15528: 1, 21634: 0, 21636: 0}}
Cop confidences with context: {'disease': {84226: 0.43217459321022034}, 'drug': {14735: 0.8817490935325623, 18410: 0.014023431576788425, 17318: 0.008455226197838783, 18403: 0.2127901017665863, 16

 64%|██████▍   | 112955/177004 [14:38:07<68:21:44,  3.84s/it]

Wrong response format. Node 20879 ignored
Wrong response format. Node 14742 ignored


 64%|██████▍   | 113001/177004 [14:40:33<54:15:16,  3.05s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.46159064769744873
Cop confidence without context: 0.46159064769744873
LLM is correct without context: True
Confidences with context: {'drug': {15547: 0.45350417494773865, 14186: 0.938224196434021, 15893: 0.9598420858383179, 14275: 0.9620311856269836, 14955: 0.9858576059341431, 15545: 0.3941083550453186, 17693: 0.422328382730484, 21238: 0.3791029453277588}, 'disease': {32924: 0.3058395981788635, 38692: 0.3214893043041229, 96564: 0.32864245772361755, 98514: 0.3346112072467804, 94843: 0.323341429233551}}
Accuracies with context: {'drug': {15547: 0, 14186: 1, 15893: 0, 14275: 0, 14955: 0, 15545: 0, 17693: 0, 21238: 0}, 'disease': {32924: 1, 38692: 0, 96564: 1, 98514: 1, 94843: 1}}
Cop confidences with context: {'drug': {15547: 0.2371201515197754, 14186: 0.938224196434021, 15893: 0.015636082738637924, 14275: 0.012397371232509613, 14955: 0.004424963146448135, 15545: 0.18616360425949097, 17693: 0.2813313603401184, 21

 64%|██████▍   | 113038/177004 [14:42:19<51:53:54,  2.92s/it]

Wrong response format. Node 99991 ignored
Wrong response format. Node 64652 ignored
Wrong response format. Node 70658 ignored
Wrong response format. Node 70183 ignored


 64%|██████▍   | 113047/177004 [14:42:46<55:09:58,  3.11s/it]

Wrong response format. Node 83788 ignored


 64%|██████▍   | 113072/177004 [14:43:58<34:19:39,  1.93s/it]

Wrong response format. Question 113071 ignored


 64%|██████▍   | 113101/177004 [14:45:34<44:34:55,  2.51s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3034810721874237
Cop confidence without context: 0.2806740999221802
LLM is correct without context: False
Confidences with context: {'disease': {39410: 0.42803284525871277, 35933: 0.360889196395874, 98639: 0.322757363319397, 37167: 0.3417946994304657, 84266: 0.3883715271949768}, 'effect/phenotype': {84702: 0.3267486095428467}}
Accuracies with context: {'disease': {39410: 0, 35933: 0, 98639: 0, 37167: 0, 84266: 0}, 'effect/phenotype': {84702: 0}}
Cop confidences with context: {'disease': {39410: 0.19750471413135529, 35933: 0.23483505845069885, 98639: 0.25934284925460815, 37167: 0.2404826134443283, 84266: 0.21447910368442535}, 'effect/phenotype': {84702: 0.21096476912498474}}


 64%|██████▍   | 113187/177004 [14:50:39<42:04:51,  2.37s/it]

Wrong response format. Question 113186 ignored


 64%|██████▍   | 113201/177004 [14:51:27<68:28:56,  3.86s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.27589142322540283
Cop confidence without context: 0.24730746448040009
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {91849: 0.2638382911682129, 87370: 0.2839439809322357, 22945: 0.28129997849464417}, 'disease': {38243: 0.2984859347343445, 32670: 0.327624648809433, 38549: 0.2573275864124298}, 'gene/protein': {2249: 0.2755526006221771}}
Accuracies with context: {'effect/phenotype': {91849: 1, 87370: 0, 22945: 0}, 'disease': {38243: 0, 32670: 0, 38549: 0}, 'gene/protein': {2249: 0}}
Cop confidences with context: {'effect/phenotype': {91849: 0.2638382911682129, 87370: 0.23910482227802277, 22945: 0.2705235481262207}, 'disease': {38243: 0.21837696433067322, 32670: 0.20502229034900665, 38549: 0.24173688888549805}, 'gene/protein': {2249: 0.24700376391410828}}


 64%|██████▍   | 113264/177004 [14:55:12<65:17:51,  3.69s/it]

Wrong response format. Node 32886 ignored
Wrong response format. Node 38695 ignored
Wrong response format. Node 38937 ignored
Wrong response format. Node 97808 ignored
Wrong response format. Node 34590 ignored
Wrong response format. Node 11808 ignored
Wrong response format. Node 1203 ignored
Wrong response format. Node 437 ignored


 64%|██████▍   | 113301/177004 [14:57:09<49:40:11,  2.81s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.29653164744377136
Cop confidence without context: 0.22735869884490967
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {92424: 0.29377636313438416, 91139: 0.30460065603256226}, 'disease': {30438: 0.3052062690258026, 28188: 0.34380465745925903}}
Accuracies with context: {'effect/phenotype': {92424: 1, 91139: 1}, 'disease': {30438: 1, 28188: 1}}
Cop confidences with context: {'effect/phenotype': {92424: 0.29377636313438416, 91139: 0.30460065603256226}, 'disease': {30438: 0.3052062690258026, 28188: 0.34380465745925903}}


 64%|██████▍   | 113342/177004 [14:59:26<61:36:08,  3.48s/it]

Wrong response format. Node 14260 ignored
Wrong response format. Node 17651 ignored
Wrong response format. Node 21544 ignored
Wrong response format. Node 17172 ignored
Wrong response format. Node 14666 ignored
Wrong response format. Node 17336 ignored
Wrong response format. Node 21402 ignored
Wrong response format. Node 17196 ignored
Wrong response format. Node 14214 ignored


 64%|██████▍   | 113401/177004 [15:02:52<79:10:11,  4.48s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.37024423480033875
Cop confidence without context: 0.1818578541278839
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {25867: 0.3977956771850586}, 'disease': {94878: 0.3315994441509247, 36041: 0.3024516701698303, 95399: 0.3173139691352844, 96809: 0.3170776963233948, 37524: 0.3127526044845581, 99429: 0.31679612398147583}, 'drug': {20602: 0.41432955861091614}}
Accuracies with context: {'effect/phenotype': {25867: 0}, 'disease': {94878: 0, 36041: 0, 95399: 0, 96809: 0, 37524: 0, 99429: 0}, 'drug': {20602: 0}}
Cop confidences with context: {'effect/phenotype': {25867: 0.16843730211257935}, 'disease': {94878: 0.19493725895881653, 36041: 0.18201863765716553, 95399: 0.16721272468566895, 96809: 0.18351049721240997, 37524: 0.18100732564926147, 99429: 0.1906512826681137}, 'drug': {20602: 0.17407290637493134}}


 64%|██████▍   | 113501/177004 [15:08:24<72:48:45,  4.13s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.6076477766036987
Cop confidence without context: 0.09614384919404984
LLM is correct without context: False
Confidences with context: {'disease': {97105: 0.6135163903236389, 37622: 0.5618484020233154, 36187: 0.5812158584594727, 35766: 0.6352811455726624, 94742: 0.942356526851654, 84320: 0.6029737591743469, 95248: 0.5325618982315063, 99455: 0.8803328275680542, 97067: 0.5645261406898499, 95491: 0.6057661771774292, 37744: 0.5775235295295715, 97690: 0.5613846778869629, 99699: 0.5858809947967529, 96964: 0.59982830286026}, 'effect/phenotype': {23075: 0.5707392692565918, 23917: 0.5611674785614014, 23084: 0.5411677360534668}, 'drug': {14770: 0.5850668549537659, 16589: 0.5311246514320374}, 'anatomy': {66568: 0.48605436086654663}}
Accuracies with context: {'disease': {97105: 0, 37622: 0, 36187: 0, 35766: 0, 94742: 0, 84320: 0, 95248: 0, 99455: 0, 97067: 0, 95491: 0, 37744: 0, 97690: 0, 99699: 0, 96964: 0}, 'effect/phenot

 64%|██████▍   | 113523/177004 [15:09:37<53:41:43,  3.05s/it]

Wrong response format. Node 66830 ignored
Wrong response format. Node 37540 ignored


 64%|██████▍   | 113554/177004 [15:11:41<48:55:47,  2.78s/it]

Wrong response format. Node 97524 ignored


 64%|██████▍   | 113601/177004 [15:14:06<48:12:48,  2.74s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.40022793412208557
Cop confidence without context: 0.2624759078025818
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {84795: 0.3410033881664276}, 'disease': {36299: 0.3142669200897217, 32595: 0.36758649349212646, 33547: 0.33054521679878235}}
Accuracies with context: {'effect/phenotype': {84795: 0}, 'disease': {36299: 0, 32595: 0, 33547: 0}}
Cop confidences with context: {'effect/phenotype': {84795: 0.1927858293056488}, 'disease': {36299: 0.29522645473480225, 32595: 0.20142076909542084, 33547: 0.20684991776943207}}


 64%|██████▍   | 113701/177004 [15:19:39<51:47:04,  2.94s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5272660851478577
Cop confidence without context: 0.5272660851478577
LLM is correct without context: True
Confidences with context: {'disease': {36210: 0.47024086117744446, 97057: 0.4039625823497772, 84071: 0.40814778208732605, 95534: 0.4322967529296875}, 'drug': {20394: 0.43091881275177}}
Accuracies with context: {'disease': {36210: 1, 97057: 1, 84071: 0, 95534: 1}, 'drug': {20394: 1}}
Cop confidences with context: {'disease': {36210: 0.47024086117744446, 97057: 0.4039625823497772, 84071: 0.36018913984298706, 95534: 0.4322967529296875}, 'drug': {20394: 0.43091881275177}}


 64%|██████▍   | 113801/177004 [15:25:01<51:26:02,  2.93s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.5615081191062927
Cop confidence without context: 0.5615081191062927
LLM is correct without context: True
Confidences with context: {'disease': {27439: 0.5072761178016663, 32751: 0.36534205079078674, 32535: 0.48983943462371826, 39064: 0.34494537115097046, 39456: 0.33742138743400574, 38210: 0.41788193583488464}, 'anatomy': {63272: 0.43080055713653564}, 'effect/phenotype': {90216: 0.3653659224510193}}
Accuracies with context: {'disease': {27439: 1, 32751: 1, 32535: 1, 39064: 1, 39456: 1, 38210: 1}, 'anatomy': {63272: 1}, 'effect/phenotype': {90216: 1}}
Cop confidences with context: {'disease': {27439: 0.5072761178016663, 32751: 0.36534205079078674, 32535: 0.48983943462371826, 39064: 0.34494537115097046, 39456: 0.33742138743400574, 38210: 0.41788193583488464}, 'anatomy': {63272: 0.43080055713653564}, 'effect/phenotype': {90216: 0.3653659224510193}}


 64%|██████▍   | 113901/177004 [15:30:28<50:57:30,  2.91s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3713918626308441
Cop confidence without context: 0.3713918626308441
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89947: 0.3343676030635834}, 'anatomy': {71059: 0.4184940755367279, 72552: 0.2752450704574585}}
Accuracies with context: {'effect/phenotype': {89947: 1}, 'anatomy': {71059: 1, 72552: 1}}
Cop confidences with context: {'effect/phenotype': {89947: 0.3343676030635834}, 'anatomy': {71059: 0.4184940755367279, 72552: 0.2752450704574585}}


 64%|██████▍   | 113957/177004 [15:33:41<45:34:33,  2.60s/it]

Wrong response format. Question 113956 ignored


 64%|██████▍   | 113990/177004 [15:35:44<67:14:30,  3.84s/it]

Wrong response format. Node 92337 ignored


 64%|██████▍   | 114001/177004 [15:36:14<37:56:57,  2.17s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5178936719894409
Cop confidence without context: 0.5178936719894409
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {91234: 0.43054088950157166, 24293: 0.4135309159755707, 92515: 0.3985329568386078}, 'drug': {20328: 0.40928807854652405, 21375: 0.47645193338394165}}
Accuracies with context: {'effect/phenotype': {91234: 1, 24293: 1, 92515: 1}, 'drug': {20328: 1, 21375: 1}}
Cop confidences with context: {'effect/phenotype': {91234: 0.43054088950157166, 24293: 0.4135309159755707, 92515: 0.3985329568386078}, 'drug': {20328: 0.40928807854652405, 21375: 0.47645193338394165}}


 64%|██████▍   | 114101/177004 [15:41:35<56:59:04,  3.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3288128972053528
Cop confidence without context: 0.18301193416118622
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {90079: 0.3070085942745209, 94237: 0.2946348488330841, 94376: 0.28099504113197327, 94272: 0.3312172591686249}, 'disease': {84079: 0.7829604744911194, 83913: 0.8792445063591003, 33567: 0.6837172508239746, 97106: 0.30023881793022156, 84227: 0.30725884437561035, 95372: 0.5332636833190918, 37429: 0.3372374475002289, 95369: 0.34241020679473877, 95373: 0.36691832542419434, 84089: 0.37765493988990784, 96994: 0.3692212998867035}}
Accuracies with context: {'effect/phenotype': {90079: 0, 94237: 0, 94376: 0, 94272: 0}, 'disease': {84079: 0, 83913: 1, 33567: 0, 97106: 0, 84227: 0, 95372: 0, 37429: 0, 95369: 0, 95373: 0, 84089: 0, 96994: 0}}
Cop confidences with context: {'effect/phenotype': {90079: 0.2077324241399765, 94237: 0.20092350244522095, 94376: 0.1931248903274536

 65%|██████▍   | 114201/177004 [15:46:38<69:56:58,  4.01s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.5951570272445679
Cop confidence without context: 0.5951570272445679
LLM is correct without context: True
Confidences with context: {'drug': {16220: 0.43470731377601624, 20269: 0.41174980998039246, 20276: 0.3734714090824127}, 'anatomy': {77022: 0.38564085960388184}, 'gene/protein': {6540: 0.3969486951828003}, 'disease': {35669: 0.2964470386505127, 95477: 0.2786143720149994, 99299: 0.34606844186782837, 94980: 0.3003289997577667, 99834: 0.3597685992717743}, 'effect/phenotype': {25275: 0.33766821026802063, 92296: 0.34523633122444153, 88448: 0.2957892119884491, 90190: 0.33473265171051025, 88447: 0.2981255054473877, 88446: 0.2792089581489563}}
Accuracies with context: {'drug': {16220: 1, 20269: 1, 20276: 1}, 'anatomy': {77022: 1}, 'gene/protein': {6540: 1}, 'disease': {35669: 0, 95477: 0, 99299: 1, 94980: 0, 99834: 0}, 'effect/phenotype': {25275: 0, 92296: 1, 88448: 0, 90190: 1, 88447: 0, 88446: 1}}
Cop confidences 

 65%|██████▍   | 114239/177004 [15:48:57<68:42:35,  3.94s/it]

Wrong response format. Node 38352 ignored


 65%|██████▍   | 114301/177004 [15:52:39<63:03:32,  3.62s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.575138509273529
Cop confidence without context: 0.575138509273529
LLM is correct without context: True
Confidences with context: {'disease': {35811: 0.48976439237594604, 95740: 0.5409084558486938, 33216: 0.45442700386047363, 37274: 0.5469475984573364, 32249: 0.8034639358520508, 97183: 0.43834564089775085, 36409: 0.5054652094841003, 99069: 0.44801729917526245, 84141: 0.5880646109580994, 95276: 0.5269014835357666, 95188: 0.5251194834709167, 98510: 0.5408464074134827}, 'effect/phenotype': {89668: 0.39664021134376526, 89704: 0.5112709403038025, 91617: 0.48294195532798767}}
Accuracies with context: {'disease': {35811: 1, 95740: 1, 33216: 1, 37274: 1, 32249: 1, 97183: 1, 36409: 1, 99069: 0, 84141: 1, 95276: 1, 95188: 1, 98510: 1}, 'effect/phenotype': {89668: 0, 89704: 1, 91617: 1}}
Cop confidences with context: {'disease': {35811: 0.48976439237594604, 95740: 0.5409084558486938, 33216: 0.45442700386047363, 37274: 0.5

 65%|██████▍   | 114349/177004 [15:55:11<52:31:55,  3.02s/it]

Wrong response format. Node 91661 ignored


 65%|██████▍   | 114401/177004 [15:58:01<57:04:45,  3.28s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.39195147156715393
Cop confidence without context: 0.14306879043579102
LLM is correct without context: False
Confidences with context: {'disease': {28466: 0.3867427110671997, 35798: 0.40957507491111755, 95251: 0.37047111988067627, 99523: 0.3783000409603119}, 'effect/phenotype': {23299: 0.34250012040138245, 89394: 0.3146064281463623, 91737: 0.3741473853588104, 87382: 0.3641636371612549}, 'drug': {20950: 0.39761248230934143, 21365: 0.4045613408088684}, 'gene/protein': {1651: 0.39873820543289185}}
Accuracies with context: {'disease': {28466: 0, 35798: 0, 95251: 0, 99523: 0}, 'effect/phenotype': {23299: 0, 89394: 0, 91737: 0, 87382: 0}, 'drug': {20950: 0, 21365: 0}, 'gene/protein': {1651: 0}}
Cop confidences with context: {'disease': {28466: 0.1676405668258667, 35798: 0.20117652416229248, 95251: 0.21108807623386383, 99523: 0.20091362297534943}, 'effect/phenotype': {23299: 0.1763039529323578, 89394: 0.14291641116142

 65%|██████▍   | 114415/177004 [15:58:51<50:37:51,  2.91s/it]

Wrong response format. Question 114414 ignored


 65%|██████▍   | 114416/177004 [15:58:55<54:30:27,  3.14s/it]

Wrong response format. Node 97222 ignored


 65%|██████▍   | 114501/177004 [16:03:23<58:44:29,  3.38s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.46310099959373474
Cop confidence without context: 0.16641883552074432
LLM is correct without context: False
Confidences with context: {'disease': {37343: 0.3456503450870514, 37119: 0.3317456543445587, 37410: 0.5733462572097778, 96915: 0.31304582953453064, 96847: 0.3837614953517914, 96491: 0.3839226961135864, 96597: 0.40145012736320496, 96611: 0.44015857577323914, 97529: 0.32466691732406616, 37518: 0.2540951669216156, 97309: 0.283220112323761, 36885: 0.2650566101074219, 96266: 0.28067895770072937}, 'anatomy': {76157: 0.28523361682891846}, 'gene/protein': {752: 0.3184491991996765}, 'effect/phenotype': {26069: 0.279816597700119, 90638: 0.28451892733573914}}
Accuracies with context: {'disease': {37343: 0, 37119: 0, 37410: 0, 96915: 0, 96847: 0, 96491: 0, 96597: 0, 96611: 0, 97529: 0, 37518: 0, 97309: 0, 36885: 0, 96266: 0}, 'anatomy': {76157: 0}, 'gene/protein': {752: 0}, 'effect/phenotype': {26069: 0, 90638: 0}}


 65%|██████▍   | 114601/177004 [16:09:05<59:37:55,  3.44s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.9570284485816956
Cop confidence without context: 0.9570284485816956
LLM is correct without context: True
Confidences with context: {'drug': {17777: 0.6322234869003296, 17629: 0.9275768399238586, 14493: 0.5716583728790283, 14030: 0.8004443645477295, 14688: 0.9786158800125122, 14019: 0.9648643732070923, 21653: 0.8008931279182434, 21495: 0.8749191761016846, 21614: 0.7117713689804077, 15512: 0.8275886178016663}}
Accuracies with context: {'drug': {17777: 1, 17629: 1, 14493: 1, 14030: 0, 14688: 1, 14019: 0, 21653: 1, 21495: 1, 21614: 1, 15512: 1}}
Cop confidences with context: {'drug': {17777: 0.6322234869003296, 17629: 0.9275768399238586, 14493: 0.5716583728790283, 14030: 0.05576266348361969, 14688: 0.9786158800125122, 14019: 0.009683515876531601, 21653: 0.8008931279182434, 21495: 0.8749191761016846, 21614: 0.7117713689804077, 15512: 0.8275886178016663}}


 65%|██████▍   | 114618/177004 [16:10:01<50:08:49,  2.89s/it]

Wrong response format. Node 14054 ignored


 65%|██████▍   | 114666/177004 [16:12:55<86:51:43,  5.02s/it]

Wrong response format. Node 15290 ignored


 65%|██████▍   | 114701/177004 [16:14:56<44:32:23,  2.57s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.31356483697891235
Cop confidence without context: 0.2121685892343521
LLM is correct without context: False
Confidences with context: {'anatomy': {76439: 0.32937008142471313}, 'gene/protein': {34355: 0.3095358908176422}, 'disease': {97017: 0.35614213347435}}
Accuracies with context: {'anatomy': {76439: 0}, 'gene/protein': {34355: 0}, 'disease': {97017: 0}}
Cop confidences with context: {'anatomy': {76439: 0.16954518854618073}, 'gene/protein': {34355: 0.16829173266887665}, 'disease': {97017: 0.13517630100250244}}


 65%|██████▍   | 114706/177004 [16:15:08<31:23:10,  1.81s/it]

Wrong response format. Question 114705 ignored


 65%|██████▍   | 114801/177004 [16:20:13<64:47:52,  3.75s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5583038330078125
Cop confidence without context: 0.5583038330078125
LLM is correct without context: True
Confidences with context: {'disease': {94672: 0.38286110758781433, 30725: 0.4145558476448059, 33222: 0.47221699357032776, 32263: 0.30387988686561584, 29637: 0.4784995913505554, 95104: 0.4346974492073059, 95857: 0.4783833622932434}, 'effect/phenotype': {93116: 0.45729097723960876}, 'gene/protein': {8465: 0.42712804675102234, 3154: 0.46365198493003845, 34367: 0.4423258900642395, 34972: 0.4229840636253357, 1013: 0.44418665766716003, 13434: 0.4522128999233246, 8745: 0.4530241787433624, 11768: 0.42455658316612244}}
Accuracies with context: {'disease': {94672: 1, 30725: 1, 33222: 1, 32263: 1, 29637: 1, 95104: 1, 95857: 1}, 'effect/phenotype': {93116: 1}, 'gene/protein': {8465: 1, 3154: 1, 34367: 1, 34972: 1, 1013: 1, 13434: 1, 8745: 1, 11768: 1}}
Cop confidences with context: {'disease': {94672: 0.382861107587814

 65%|██████▍   | 114840/177004 [16:22:34<36:42:19,  2.13s/it]

Wrong response format. Question 114839 ignored


 65%|██████▍   | 114855/177004 [16:23:28<44:45:48,  2.59s/it]

Wrong response format. Question 114854 ignored


 65%|██████▍   | 114900/177004 [16:26:32<55:06:21,  3.19s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.26941734552383423
Cop confidence without context: 0.2673207223415375
LLM is correct without context: False
Confidences with context: {'disease': {36909: 0.2946659028530121, 36863: 0.2842651605606079, 96044: 0.30068981647491455, 39462: 0.2884851098060608, 96770: 0.3221862316131592, 96753: 0.3110258877277374, 96010: 0.2852356731891632, 97355: 0.33278343081474304, 96012: 0.336103230714798, 96013: 0.31194597482681274}, 'drug': {20820: 0.2929670512676239}, 'effect/phenotype': {88327: 0.29897764325141907}, 'anatomy': {73961: 0.2958873510360718, 76542: 0.2827950716018677, 73350: 0.2881067991256714, 73348: 0.28353214263916016}}
Accuracies with context: {'disease': {36909: 1, 36863: 1, 96044: 1, 39462: 1, 96770: 1, 96753: 1, 96010: 1, 97355: 1, 96012: 1, 96013: 1}, 'drug': {20820: 1}, 'effect/phenotype': {88327: 1}, 'anatomy': {73961: 1, 76542: 1, 73350: 1, 73348: 1}}
Cop confidences with context: {'disease': {36909: 0

 65%|██████▍   | 114946/177004 [16:29:05<36:16:38,  2.10s/it]

Wrong response format. Question 114945 ignored


 65%|██████▍   | 114957/177004 [16:29:40<33:43:40,  1.96s/it]

Wrong response format. Question 114956 ignored


 65%|██████▍   | 115001/177004 [16:32:01<56:12:50,  3.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.30683690309524536
Cop confidence without context: 0.1530866175889969
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {92682: 0.27446261048316956}, 'drug': {14736: 0.48744767904281616, 20267: 0.43382322788238525, 17693: 0.3329688012599945, 15932: 0.3225758671760559, 20327: 0.32393354177474976, 21510: 0.36227986216545105, 14496: 0.3196534216403961}, 'disease': {96566: 0.3681160807609558}}
Accuracies with context: {'effect/phenotype': {92682: 0}, 'drug': {14736: 0, 20267: 0, 17693: 0, 15932: 0, 20327: 0, 21510: 0, 14496: 0}, 'disease': {96566: 0}}
Cop confidences with context: {'effect/phenotype': {92682: 0.18571069836616516}, 'drug': {14736: 0.1281556487083435, 20267: 0.1214132010936737, 17693: 0.1810319870710373, 15932: 0.17538145184516907, 20327: 0.15911081433296204, 21510: 0.18503445386886597, 14496: 0.1632630079984665}, 'disease': {96566: 0.17662402987480164}}


 65%|██████▌   | 115101/177004 [16:38:01<55:35:54,  3.23s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4021645486354828
Cop confidence without context: 0.30595067143440247
LLM is correct without context: False
Confidences with context: {'disease': {84071: 0.3371977210044861, 99049: 0.3822057843208313, 100023: 0.377539724111557, 29761: 0.38800084590911865}, 'drug': {15902: 0.3872796893119812, 20394: 0.38186419010162354}, 'effect/phenotype': {26349: 0.3468552529811859}}
Accuracies with context: {'disease': {84071: 0, 99049: 0, 100023: 1, 29761: 0}, 'drug': {15902: 0, 20394: 0}, 'effect/phenotype': {26349: 0}}
Cop confidences with context: {'disease': {84071: 0.3319699764251709, 99049: 0.2907668352127075, 100023: 0.377539724111557, 29761: 0.3069339692592621}, 'drug': {15902: 0.278947114944458, 20394: 0.3165765404701233}, 'effect/phenotype': {26349: 0.33882033824920654}}


 65%|██████▌   | 115194/177004 [16:43:08<43:16:30,  2.52s/it]

Wrong response format. Question 115193 ignored


 65%|██████▌   | 115201/177004 [16:43:33<70:55:29,  4.13s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.534524142742157
Cop confidence without context: 0.11741936951875687
LLM is correct without context: False
Confidences with context: {'disease': {98669: 0.3516097664833069, 38821: 0.4001716077327728, 96162: 0.4354904592037201, 30438: 0.480827271938324, 36988: 0.30286163091659546, 29852: 0.47433704137802124, 32939: 0.9568459391593933, 33709: 0.9733238220214844, 36931: 0.9424967765808105, 94771: 0.9463262557983398, 94770: 0.9363642334938049, 37108: 0.9555533528327942, 33298: 0.44210320711135864}, 'anatomy': {63724: 0.3730585277080536, 68240: 0.43339312076568604}, 'effect/phenotype': {26485: 0.49562209844589233, 90111: 0.45094820857048035, 92065: 0.47189757227897644, 92101: 0.48351889848709106}, 'gene/protein': {34682: 0.3994394540786743}}
Accuracies with context: {'disease': {98669: 0, 38821: 0, 96162: 0, 30438: 0, 36988: 0, 29852: 0, 32939: 0, 33709: 0, 36931: 0, 94771: 0, 94770: 0, 37108: 0, 33298: 0}, 'anatomy

 65%|██████▌   | 115225/177004 [16:44:44<45:14:31,  2.64s/it]

Wrong response format. Question 115224 ignored


 65%|██████▌   | 115249/177004 [16:46:04<64:58:22,  3.79s/it]

Wrong response format. Question 115248 ignored


 65%|██████▌   | 115267/177004 [16:47:12<63:16:43,  3.69s/it]

Wrong response format. Node 17736 ignored


 65%|██████▌   | 115301/177004 [16:49:11<80:23:19,  4.69s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.33654797077178955
Cop confidence without context: 0.33654797077178955
LLM is correct without context: True
Confidences with context: {'disease': {32797: 0.31050917506217957, 99237: 0.32560303807258606, 83976: 0.31171202659606934, 36253: 0.3283696174621582, 97063: 0.3207816779613495, 35856: 0.30103936791419983, 98647: 0.3069627285003662, 37681: 0.3122548758983612, 39776: 0.3274311125278473, 39220: 0.32646653056144714, 98624: 0.31499379873275757}, 'effect/phenotype': {85356: 0.29688310623168945}, 'anatomy': {63767: 0.30750682950019836}, 'drug': {20802: 0.3114846348762512, 20808: 0.300590455532074, 17105: 0.2940162420272827, 39890: 0.29396113753318787}}
Accuracies with context: {'disease': {32797: 1, 99237: 1, 83976: 1, 36253: 1, 97063: 1, 35856: 1, 98647: 1, 37681: 1, 39776: 1, 39220: 1, 98624: 1}, 'effect/phenotype': {85356: 1}, 'anatomy': {63767: 1}, 'drug': {20802: 1, 20808: 1, 17105: 1, 39890: 1}}
Cop confid

 65%|██████▌   | 115401/177004 [16:54:55<51:28:40,  3.01s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33182182908058167
Cop confidence without context: 0.09581418335437775
LLM is correct without context: False
Confidences with context: {'anatomy': {66463: 0.301925390958786}, 'disease': {96527: 0.5823196172714233, 29749: 0.8939635753631592, 94599: 0.26940903067588806, 38406: 0.7908419370651245}}
Accuracies with context: {'anatomy': {66463: 0}, 'disease': {96527: 0, 29749: 0, 94599: 0, 38406: 0}}
Cop confidences with context: {'anatomy': {66463: 0.12488177418708801}, 'disease': {96527: 0.06089840456843376, 29749: 0.02273237332701683, 94599: 0.09166181087493896, 38406: 0.014039125293493271}}


 65%|██████▌   | 115501/177004 [17:00:38<62:51:39,  3.68s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.28459179401397705
Cop confidence without context: 0.28017961978912354
LLM is correct without context: False
Confidences with context: {'drug': {20409: 0.32728099822998047, 20543: 0.30847740173339844}, 'anatomy': {75703: 0.33928200602531433, 74671: 0.3260292112827301}, 'disease': {35931: 0.3390858471393585, 84183: 0.30214276909828186, 84185: 0.3222872018814087, 94923: 0.32140764594078064, 84179: 0.3140942454338074, 83971: 0.2994799017906189, 83973: 0.3394256830215454, 99830: 0.31629523634910583, 99859: 0.295041561126709}}
Accuracies with context: {'drug': {20409: 0, 20543: 0}, 'anatomy': {75703: 0, 74671: 0}, 'disease': {35931: 0, 84183: 0, 84185: 0, 94923: 0, 84179: 0, 83971: 0, 83973: 0, 99830: 0, 99859: 0}}
Cop confidences with context: {'drug': {20409: 0.2032134085893631, 20543: 0.22568687796592712}, 'anatomy': {75703: 0.2295697033405304, 74671: 0.2366718202829361}, 'disease': {35931: 0.2223779261112213, 84

 65%|██████▌   | 115536/177004 [17:02:29<32:11:49,  1.89s/it]

Wrong response format. Question 115535 ignored


 65%|██████▌   | 115599/177004 [17:05:46<55:13:53,  3.24s/it]

Wrong response format. Node 94865 ignored
Wrong response format. Node 94866 ignored


 65%|██████▌   | 115601/177004 [17:05:53<60:31:54,  3.55s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3090929388999939
Cop confidence without context: 0.1874743402004242
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {84636: 0.31603533029556274, 94382: 0.3755221366882324}, 'disease': {29256: 0.3815079629421234, 99571: 0.2978116273880005, 32076: 0.6372964978218079, 30697: 0.8020244836807251, 29628: 0.37913477420806885, 32956: 0.4180857837200165, 32637: 0.2985912263393402, 36672: 0.2883296608924866}, 'gene/protein': {71: 0.32356464862823486, 7543: 0.30485260486602783, 10132: 0.2859523296356201, 2374: 0.31572869420051575, 34031: 0.2868943512439728, 35102: 0.31754812598228455}}
Accuracies with context: {'effect/phenotype': {84636: 0, 94382: 0}, 'disease': {29256: 0, 99571: 0, 32076: 1, 30697: 0, 29628: 0, 32956: 0, 32637: 0, 36672: 0}, 'gene/protein': {71: 0, 7543: 0, 10132: 0, 2374: 0, 34031: 0, 35102: 0}}
Cop confidences with context: {'effect/phenotype': {84636: 0.262002497

 65%|██████▌   | 115674/177004 [17:09:54<68:06:01,  4.00s/it]

Wrong response format. Node 31918 ignored


 65%|██████▌   | 115700/177004 [17:11:12<55:53:44,  3.28s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.4309452474117279
Cop confidence without context: 0.3832906484603882
LLM is correct without context: False
Confidences with context: {'disease': {96867: 0.4813593029975891, 38242: 0.4243491291999817, 38957: 0.4726751744747162, 83760: 0.5611630082130432, 28249: 0.48288607597351074}, 'drug': {20387: 0.40698811411857605}, 'gene/protein': {3996: 0.41413024067878723, 10905: 0.40869054198265076, 33835: 0.4077611267566681, 33913: 0.43619340658187866, 13771: 0.44858303666114807}}
Accuracies with context: {'disease': {96867: 1, 38242: 1, 38957: 1, 83760: 0, 28249: 1}, 'drug': {20387: 1}, 'gene/protein': {3996: 1, 10905: 1, 33835: 1, 33913: 1, 13771: 1}}
Cop confidences with context: {'disease': {96867: 0.4813593029975891, 38242: 0.4243491291999817, 38957: 0.4726751744747162, 83760: 0.3172527849674225, 28249: 0.48288607597351074}, 'drug': {20387: 0.40698811411857605}, 'gene/protein': {3996: 0.41413024067878723, 10905: 0.

 65%|██████▌   | 115800/177004 [17:16:52<65:40:47,  3.86s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3230673372745514
Cop confidence without context: 0.28068575263023376
LLM is correct without context: False
Confidences with context: {'disease': {31177: 0.34625115990638733, 84290: 0.3456611633300781, 94784: 0.34317514300346375, 98940: 0.352605938911438}}
Accuracies with context: {'disease': {31177: 0, 84290: 1, 94784: 0, 98940: 0}}
Cop confidences with context: {'disease': {31177: 0.3079621493816376, 84290: 0.3456611633300781, 94784: 0.2958354651927948, 98940: 0.3015998601913452}}


 65%|██████▌   | 115901/177004 [17:22:23<47:45:40,  2.81s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3494049310684204
Cop confidence without context: 0.16764631867408752
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {91332: 0.3715966045856476}, 'anatomy': {64930: 0.37201547622680664}, 'disease': {98805: 0.36424052715301514, 39452: 0.38540807366371155}}
Accuracies with context: {'effect/phenotype': {91332: 0}, 'anatomy': {64930: 0}, 'disease': {98805: 0, 39452: 0}}
Cop confidences with context: {'effect/phenotype': {91332: 0.16880536079406738}, 'anatomy': {64930: 0.17989493906497955}, 'disease': {98805: 0.19958734512329102, 39452: 0.19839107990264893}}


 65%|██████▌   | 115910/177004 [17:23:02<55:49:48,  3.29s/it]

Wrong response format. Question 115909 ignored


 65%|██████▌   | 115923/177004 [17:23:52<45:45:24,  2.70s/it]

Wrong response format. Question 115922 ignored


 66%|██████▌   | 115954/177004 [17:25:31<52:56:21,  3.12s/it]

Wrong response format. Node 99863 ignored


 66%|██████▌   | 116001/177004 [17:28:15<64:22:11,  3.80s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.36741700768470764
Cop confidence without context: 0.1669066995382309
LLM is correct without context: False
Confidences with context: {'disease': {83868: 0.4201110601425171, 84185: 0.5947861671447754, 30438: 0.3171461224555969, 83788: 0.8849612474441528}, 'gene/protein': {6240: 0.2909279763698578, 3049: 0.27362099289894104, 1929: 0.27796342968940735, 199: 0.2736208438873291, 34976: 0.28054308891296387, 9221: 0.2589510381221771, 6725: 0.29513290524482727, 8457: 0.2940354645252228, 34975: 0.279182106256485, 833: 0.2688123881816864}}
Accuracies with context: {'disease': {83868: 0, 84185: 1, 30438: 1, 83788: 0}, 'gene/protein': {6240: 0, 3049: 0, 1929: 0, 199: 0, 34976: 0, 9221: 0, 6725: 0, 8457: 0, 34975: 1, 833: 0}}
Cop confidences with context: {'disease': {83868: 0.2628988027572632, 84185: 0.5947861671447754, 30438: 0.3171461224555969, 83788: 0.017253998667001724}, 'gene/protein': {6240: 0.2393108308315277, 304

 66%|██████▌   | 116101/177004 [17:34:11<57:57:13,  3.43s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4255898892879486
Cop confidence without context: 0.4255898892879486
LLM is correct without context: True
Confidences with context: {'disease': {96586: 0.35279151797294617, 29527: 0.401077538728714, 33222: 0.37490323185920715, 30867: 0.43931350111961365, 28867: 0.3492056131362915}, 'effect/phenotype': {22757: 0.30272752046585083, 91916: 0.34099504351615906}}
Accuracies with context: {'disease': {96586: 1, 29527: 1, 33222: 1, 30867: 1, 28867: 1}, 'effect/phenotype': {22757: 1, 91916: 1}}
Cop confidences with context: {'disease': {96586: 0.35279151797294617, 29527: 0.401077538728714, 33222: 0.37490323185920715, 30867: 0.43931350111961365, 28867: 0.3492056131362915}, 'effect/phenotype': {22757: 0.30272752046585083, 91916: 0.34099504351615906}}


 66%|██████▌   | 116153/177004 [17:37:09<35:49:11,  2.12s/it]

Wrong response format. Question 116152 ignored


 66%|██████▌   | 116200/177004 [17:40:08<71:51:00,  4.25s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.38305234909057617
Cop confidence without context: 0.2453925460577011
LLM is correct without context: False
Confidences with context: {'disease': {27534: 0.2831311821937561, 84011: 0.4005792737007141, 84019: 0.3161107897758484, 39653: 0.33142051100730896, 33373: 0.3323569595813751, 95527: 0.35479292273521423, 38295: 0.2982035279273987}}
Accuracies with context: {'disease': {27534: 1, 84011: 1, 84019: 0, 39653: 0, 33373: 0, 95527: 0, 38295: 0}}
Cop confidences with context: {'disease': {27534: 0.2831311821937561, 84011: 0.4005792737007141, 84019: 0.2641204595565796, 39653: 0.2601349353790283, 33373: 0.26915088295936584, 95527: 0.2272888720035553, 38295: 0.26730790734291077}}


 66%|██████▌   | 116248/177004 [17:43:04<58:49:35,  3.49s/it]

Wrong response format. Node 37802 ignored
Wrong response format. Node 37625 ignored


 66%|██████▌   | 116300/177004 [17:46:15<71:47:19,  4.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3884238004684448
Cop confidence without context: 0.20468497276306152
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {87633: 0.48080143332481384}, 'anatomy': {68434: 0.4869394898414612}}
Accuracies with context: {'effect/phenotype': {87633: 0}, 'anatomy': {68434: 0}}
Cop confidences with context: {'effect/phenotype': {87633: 0.14212462306022644}, 'anatomy': {68434: 0.13627874851226807}}


 66%|██████▌   | 116302/177004 [17:46:17<43:13:07,  2.56s/it]

Wrong response format. Question 116301 ignored


 66%|██████▌   | 116365/177004 [17:49:58<71:35:41,  4.25s/it]

Wrong response format. Node 91661 ignored


 66%|██████▌   | 116400/177004 [17:52:09<44:14:21,  2.63s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4509357511997223
Cop confidence without context: 0.4509357511997223
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {84364: 0.4771515130996704, 23284: 0.45298606157302856, 84795: 0.5065695643424988}, 'disease': {38244: 0.5067366361618042, 36806: 0.47663193941116333, 27985: 0.485553115606308, 30252: 0.4702216386795044, 37175: 0.4748855531215668, 95532: 0.5045791864395142, 29933: 0.9565783739089966, 99453: 0.48844271898269653, 29939: 0.44433656334877014, 35435: 0.5036908984184265, 39881: 0.4886292517185211, 37964: 0.44293907284736633, 30648: 0.43113985657691956, 36019: 0.4948885142803192}, 'anatomy': {64632: 0.4880580008029938, 65383: 0.45532962679862976, 64634: 0.9760323762893677}}
Accuracies with context: {'effect/phenotype': {84364: 1, 23284: 1, 84795: 1}, 'disease': {38244: 1, 36806: 1, 27985: 1, 30252: 1, 37175: 1, 95532: 1, 29933: 1, 99453: 1, 29939: 1, 35435: 1, 39881: 

 66%|██████▌   | 116500/177004 [17:58:36<63:52:31,  3.80s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.2996801435947418
Cop confidence without context: 0.20119574666023254
LLM is correct without context: False
Confidences with context: {'disease': {35538: 0.315131813287735, 94838: 0.7074244618415833, 97207: 0.3338150084018707, 97088: 0.3512254059314728, 39793: 0.34818270802497864, 30876: 0.2861437499523163}, 'anatomy': {70938: 0.3313049376010895, 69280: 0.31792813539505005, 72890: 0.2882028818130493, 72930: 0.311549574136734, 72929: 0.3475381135940552, 72927: 0.33251845836639404, 72932: 0.3143140971660614}, 'drug': {17948: 0.3271769881248474}, 'effect/phenotype': {84890: 0.29392367601394653, 85567: 0.2599741816520691}, 'gene/protein': {13028: 0.5498612523078918, 6524: 0.29655081033706665, 13202: 0.32026025652885437}}
Accuracies with context: {'disease': {35538: 0, 94838: 0, 97207: 0, 97088: 0, 39793: 0, 30876: 0}, 'anatomy': {70938: 0, 69280: 0, 72890: 0, 72930: 0, 72929: 0, 72927: 0, 72932: 0}, 'drug': {17948:

 66%|██████▌   | 116542/177004 [18:01:21<80:55:55,  4.82s/it]

Wrong response format. Node 91661 ignored


 66%|██████▌   | 116584/177004 [18:03:59<75:47:24,  4.52s/it]

Wrong response format. Node 91116 ignored
Wrong response format. Node 87341 ignored
Wrong response format. Node 33512 ignored
Wrong response format. Node 39827 ignored
Wrong response format. Node 28631 ignored
Wrong response format. Node 38092 ignored
Wrong response format. Node 36368 ignored
Wrong response format. Node 97556 ignored
Wrong response format. Node 99843 ignored
Wrong response format. Node 99842 ignored
Wrong response format. Node 84102 ignored
Wrong response format. Node 95682 ignored
Wrong response format. Node 38899 ignored
Wrong response format. Node 99862 ignored
Wrong response format. Node 30786 ignored
Wrong response format. Node 99863 ignored


 66%|██████▌   | 116600/177004 [18:04:59<64:45:21,  3.86s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.36471331119537354
Cop confidence without context: 0.36471331119537354
LLM is correct without context: True
Confidences with context: {'disease': {32807: 0.3305108845233917, 38442: 0.33439692854881287, 83974: 0.29860609769821167, 83788: 0.3117063045501709, 99520: 0.3331238925457001, 97228: 0.2986413538455963}}
Accuracies with context: {'disease': {32807: 1, 38442: 1, 83974: 1, 83788: 0, 99520: 1, 97228: 1}}
Cop confidences with context: {'disease': {32807: 0.3305108845233917, 38442: 0.33439692854881287, 83974: 0.29860609769821167, 83788: 0.25440725684165955, 99520: 0.3331238925457001, 97228: 0.2986413538455963}}


 66%|██████▌   | 116658/177004 [18:08:22<57:29:27,  3.43s/it]

Wrong response format. Node 98781 ignored


 66%|██████▌   | 116700/177004 [18:10:35<62:55:23,  3.76s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.32647427916526794
Cop confidence without context: 0.2027125060558319
LLM is correct without context: False
Confidences with context: {'disease': {36496: 0.3084505498409271, 32009: 0.39273637533187866, 39751: 0.31024274230003357, 37682: 0.29393908381462097, 99571: 0.3086710572242737, 37498: 0.29119613766670227, 96762: 0.3077244162559509}, 'anatomy': {65153: 0.29398876428604126, 68847: 0.28492486476898193, 68848: 0.28812646865844727}, 'effect/phenotype': {92524: 0.2738761603832245}, 'drug': {16715: 0.5662849545478821}, 'gene/protein': {35254: 0.3067469894886017}}
Accuracies with context: {'disease': {36496: 0, 32009: 0, 39751: 0, 37682: 0, 99571: 0, 37498: 0, 96762: 0}, 'anatomy': {65153: 0, 68847: 0, 68848: 0}, 'effect/phenotype': {92524: 0}, 'drug': {16715: 0}, 'gene/protein': {35254: 0}}
Cop confidences with context: {'disease': {36496: 0.21199475228786469, 32009: 0.17564256489276886, 39751: 0.211567163467407

 66%|██████▌   | 116710/177004 [18:11:17<64:09:36,  3.83s/it]

Wrong response format. Node 37573 ignored


 66%|██████▌   | 116800/177004 [18:17:07<103:32:48,  6.19s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3245748281478882
Cop confidence without context: 0.2819955050945282
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {85218: 0.3157607614994049}, 'gene/protein': {630: 0.28274479508399963}}
Accuracies with context: {'effect/phenotype': {85218: 0}, 'gene/protein': {630: 0}}
Cop confidences with context: {'effect/phenotype': {85218: 0.25771644711494446}, 'gene/protein': {630: 0.26769739389419556}}


 66%|██████▌   | 116900/177004 [18:23:03<56:34:16,  3.39s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2815857231616974
Cop confidence without context: 0.22449971735477448
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {85395: 0.3144094944000244, 88772: 0.30561360716819763, 25654: 0.2831568717956543, 88317: 0.2947196662425995, 88744: 0.290099561214447, 87010: 0.32480379939079285, 89510: 0.2813059985637665}, 'drug': {14978: 0.3117043673992157, 20172: 0.32617250084877014}, 'disease': {31260: 0.33870628476142883, 33531: 0.30616456270217896, 27494: 0.30971822142601013, 94590: 0.2945306599140167}}
Accuracies with context: {'effect/phenotype': {85395: 0, 88772: 0, 25654: 0, 88317: 0, 88744: 0, 87010: 0, 89510: 0}, 'drug': {14978: 0, 20172: 0}, 'disease': {31260: 0, 33531: 0, 27494: 0, 94590: 0}}
Cop confidences with context: {'effect/phenotype': {85395: 0.23364928364753723, 88772: 0.20358256995677948, 25654: 0.2460109442472458, 88317: 0.2089875191450119, 88744: 0.2259297519922256

 66%|██████▌   | 117000/177004 [18:29:06<54:23:18,  3.26s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.33399710059165955
Cop confidence without context: 0.22955264151096344
LLM is correct without context: False
Confidences with context: {'anatomy': {74018: 0.3713012933731079, 75607: 0.3843320906162262}, 'disease': {83790: 0.37710142135620117, 94655: 0.3864724040031433, 28608: 0.34420478343963623}, 'gene/protein': {7637: 0.33228620886802673, 7034: 0.33600544929504395}}
Accuracies with context: {'anatomy': {74018: 0, 75607: 0}, 'disease': {83790: 0, 94655: 0, 28608: 0}, 'gene/protein': {7637: 0, 7034: 0}}
Cop confidences with context: {'anatomy': {74018: 0.22875209152698517, 75607: 0.225937157869339}, 'disease': {83790: 0.22517752647399902, 94655: 0.22020533680915833, 28608: 0.23472726345062256}, 'gene/protein': {7637: 0.2356261909008026, 7034: 0.23093295097351074}}


 66%|██████▌   | 117100/177004 [18:35:08<64:34:39,  3.88s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.533286452293396
Cop confidence without context: 0.06469499319791794
LLM is correct without context: False
Confidences with context: {'disease': {32220: 0.45188233256340027, 30626: 0.3751501441001892, 38469: 0.3946596682071686, 99562: 0.3457443416118622}, 'effect/phenotype': {25297: 0.3650178611278534, 91709: 0.3885533809661865}, 'gene/protein': {7183: 0.40158382058143616}}
Accuracies with context: {'disease': {32220: 0, 30626: 0, 38469: 0, 99562: 0}, 'effect/phenotype': {25297: 0, 91709: 0}, 'gene/protein': {7183: 0}}
Cop confidences with context: {'disease': {32220: 0.0835898295044899, 30626: 0.11089417338371277, 38469: 0.11757615953683853, 99562: 0.13225899636745453}, 'effect/phenotype': {25297: 0.10376552492380142, 91709: 0.11132241040468216}, 'gene/protein': {7183: 0.10394416004419327}}


 66%|██████▌   | 117167/177004 [18:39:05<68:22:54,  4.11s/it]

Wrong response format. Node 14317 ignored


 66%|██████▌   | 117170/177004 [18:39:15<55:41:15,  3.35s/it]

Wrong response format. Node 15854 ignored


 66%|██████▌   | 117200/177004 [18:40:55<74:18:55,  4.47s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2638700306415558
Cop confidence without context: 0.21705356240272522
LLM is correct without context: False
Confidences with context: {'drug': {14510: 0.28687646985054016, 21274: 0.3630884289741516, 16521: 0.3114321827888489, 20548: 0.29655250906944275, 17736: 0.29025375843048096, 21614: 0.29284602403640747, 14513: 0.3007863759994507, 21513: 0.3130543828010559, 17693: 0.3083578944206238}, 'effect/phenotype': {89820: 0.3325725197792053, 84553: 0.43927788734436035}, 'disease': {83776: 0.3659060597419739, 38469: 0.31550082564353943, 99562: 0.2632772624492645, 33283: 0.26570412516593933}, 'gene/protein': {527: 0.33824971318244934}}
Accuracies with context: {'drug': {14510: 1, 21274: 0, 16521: 0, 20548: 0, 17736: 0, 21614: 0, 14513: 0, 21513: 0, 17693: 0}, 'effect/phenotype': {89820: 0, 84553: 0}, 'disease': {83776: 0, 38469: 1, 99562: 1, 33283: 0}, 'gene/protein': {527: 0}}
Cop confidences with context: {'drug': {1

 66%|██████▋   | 117273/177004 [18:45:38<60:38:52,  3.66s/it]

Wrong response format. Node 37844 ignored


 66%|██████▋   | 117300/177004 [18:47:12<47:52:54,  2.89s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5965611338615417
Cop confidence without context: 0.1643698811531067
LLM is correct without context: False
Confidences with context: {'anatomy': {73767: 0.5776591897010803}}
Accuracies with context: {'anatomy': {73767: 0}}
Cop confidences with context: {'anatomy': {73767: 0.1817685216665268}}


 66%|██████▋   | 117358/177004 [18:50:35<70:14:15,  4.24s/it]

Wrong response format. Node 27403 ignored
Wrong response format. Node 31318 ignored
Wrong response format. Node 90794 ignored
Wrong response format. Node 22615 ignored
Wrong response format. Node 94382 ignored
Wrong response format. Node 38353 ignored
Wrong response format. Node 31444 ignored
Wrong response format. Node 26272 ignored
Wrong response format. Node 31445 ignored
Wrong response format. Node 30685 ignored
Wrong response format. Node 27916 ignored


 66%|██████▋   | 117400/177004 [18:53:18<65:41:01,  3.97s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2832737863063812
Cop confidence without context: 0.2832737863063812
LLM is correct without context: True
Confidences with context: {'anatomy': {75762: 0.28061312437057495}, 'effect/phenotype': {91143: 0.30525630712509155}, 'disease': {33405: 0.3213176727294922}}
Accuracies with context: {'anatomy': {75762: 0}, 'effect/phenotype': {91143: 0}, 'disease': {33405: 0}}
Cop confidences with context: {'anatomy': {75762: 0.2326364517211914}, 'effect/phenotype': {91143: 0.2097993940114975}, 'disease': {33405: 0.1802428811788559}}


 66%|██████▋   | 117427/177004 [18:54:56<67:45:27,  4.09s/it]

Wrong response format. Node 37574 ignored


 66%|██████▋   | 117489/177004 [18:58:50<56:55:06,  3.44s/it]

Wrong response format. Question 117488 ignored


 66%|██████▋   | 117500/177004 [18:59:25<64:19:45,  3.89s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.25714364647865295
Cop confidence without context: 0.20987452566623688
LLM is correct without context: False
Confidences with context: {'disease': {37919: 0.22956085205078125, 38487: 0.2488826960325241, 39639: 0.24746061861515045, 97255: 0.22921881079673767, 97338: 0.23850128054618835, 99907: 0.22844858467578888, 98974: 0.23654663562774658, 99908: 0.24587462842464447, 39638: 0.24110954999923706}, 'gene/protein': {3409: 0.23736770451068878, 3866: 0.24418489634990692, 13181: 0.24750547111034393, 5794: 0.24255378544330597}}
Accuracies with context: {'disease': {37919: 0, 38487: 0, 39639: 0, 97255: 0, 97338: 0, 99907: 0, 98974: 0, 99908: 0, 39638: 0}, 'gene/protein': {3409: 0, 3866: 0, 13181: 0, 5794: 0}}
Cop confidences with context: {'disease': {37919: 0.19031263887882233, 38487: 0.196882426738739, 39639: 0.21838322281837463, 97255: 0.194535493850708, 97338: 0.18144220113754272, 99907: 0.19237299263477325, 98974:

 66%|██████▋   | 117533/177004 [19:01:24<56:21:39,  3.41s/it]

Wrong response format. Node 38011 ignored


 66%|██████▋   | 117600/177004 [19:05:07<66:49:00,  4.05s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.30086392164230347
Cop confidence without context: 0.2455579936504364
LLM is correct without context: False
Confidences with context: {'disease': {98408: 0.3217637240886688, 97027: 0.3716931641101837, 83956: 0.36370033025741577, 84072: 0.3057536780834198}}
Accuracies with context: {'disease': {98408: 0, 97027: 0, 83956: 0, 84072: 0}}
Cop confidences with context: {'disease': {98408: 0.22995416820049286, 97027: 0.22721149027347565, 83956: 0.13379789888858795, 84072: 0.22369414567947388}}


 66%|██████▋   | 117693/177004 [19:10:38<46:37:26,  2.83s/it]

Wrong response format. Question 117692 ignored


 66%|██████▋   | 117700/177004 [19:11:06<55:13:13,  3.35s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2748698592185974
Cop confidence without context: 0.2748698592185974
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {91619: 0.31806039810180664, 87633: 0.3803858458995819, 85395: 0.3128938376903534}, 'anatomy': {63818: 0.9014509916305542}}
Accuracies with context: {'effect/phenotype': {91619: 1, 87633: 1, 85395: 1}, 'anatomy': {63818: 1}}
Cop confidences with context: {'effect/phenotype': {91619: 0.31806039810180664, 87633: 0.3803858458995819, 85395: 0.3128938376903534}, 'anatomy': {63818: 0.9014509916305542}}


 67%|██████▋   | 117769/177004 [19:15:07<59:23:54,  3.61s/it]

Wrong response format. Node 14054 ignored


 67%|██████▋   | 117800/177004 [19:17:04<50:00:34,  3.04s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.39964577555656433
Cop confidence without context: 0.39964577555656433
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {92951: 0.42984116077423096, 85108: 0.43401530385017395, 22991: 0.36669933795928955, 85046: 0.39161601662635803, 94382: 0.3778161406517029}, 'disease': {30894: 0.6701549887657166, 32076: 0.4081697463989258, 30697: 0.8040198087692261, 27967: 0.9493088126182556, 32561: 0.6116930842399597, 32121: 0.5354440808296204, 33330: 0.46753889322280884, 98561: 0.9155240654945374, 30136: 0.4063892364501953, 32769: 0.3483378291130066, 33004: 0.3049352765083313, 29643: 0.693229615688324}}
Accuracies with context: {'effect/phenotype': {92951: 1, 85108: 1, 22991: 1, 85046: 1, 94382: 1}, 'disease': {30894: 0, 32076: 1, 30697: 0, 27967: 1, 32561: 1, 32121: 1, 33330: 1, 98561: 1, 30136: 1, 32769: 1, 33004: 1, 29643: 1}}
Cop confidences with context: {'effect/phenotype': {92951: 0

 67%|██████▋   | 117900/177004 [19:23:37<67:09:52,  4.09s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.3547227084636688
Cop confidence without context: 0.1662551909685135
LLM is correct without context: False
Confidences with context: {'drug': {14216: 0.34099650382995605, 14510: 0.518194317817688, 21573: 0.35497796535491943, 21601: 0.36461880803108215, 16224: 0.35144975781440735, 21574: 0.3786676228046417, 21604: 0.3498511016368866, 21455: 0.35958296060562134, 21594: 0.37419459223747253, 21605: 0.35001423954963684, 21578: 0.34995490312576294}, 'effect/phenotype': {92905: 0.358561247587204, 91149: 0.3667055368423462}, 'gene/protein': {1676: 0.3626939356327057, 363: 0.3763887286186218}, 'disease': {84064: 0.40295830368995667}}
Accuracies with context: {'drug': {14216: 0, 14510: 0, 21573: 0, 21601: 0, 16224: 0, 21574: 0, 21604: 0, 21455: 0, 21594: 0, 21605: 0, 21578: 0}, 'effect/phenotype': {92905: 0, 91149: 0}, 'gene/protein': {1676: 0, 363: 0}, 'disease': {84064: 0}}
Cop confidences with context: {'drug': {14216

 67%|██████▋   | 117922/177004 [19:25:05<64:14:53,  3.91s/it]

Wrong response format. Node 89585 ignored
Wrong response format. Node 30438 ignored
Wrong response format. Node 24522 ignored
Wrong response format. Node 28016 ignored
Wrong response format. Node 87391 ignored
Wrong response format. Node 31925 ignored
Wrong response format. Node 84716 ignored


 67%|██████▋   | 118000/177004 [19:29:47<60:00:47,  3.66s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.33471226692199707
Cop confidence without context: 0.23920807242393494
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {86386: 0.3653850555419922, 25046: 0.374230295419693, 90679: 0.2815600037574768, 25168: 0.3782585859298706, 26892: 0.3786587119102478, 88177: 0.37981605529785156, 26893: 0.38003450632095337, 26894: 0.3787456452846527, 88220: 0.387727826833725, 93381: 0.37768134474754333, 88214: 0.37724757194519043, 93378: 0.3898918032646179, 88226: 0.3800804018974304}, 'gene/protein': {10687: 0.38040590286254883}, 'anatomy': {72490: 0.3990638256072998}, 'drug': {19591: 0.40245696902275085}}
Accuracies with context: {'effect/phenotype': {86386: 0, 25046: 0, 90679: 0, 25168: 0, 26892: 0, 88177: 0, 26893: 0, 26894: 0, 88220: 0, 93381: 0, 88214: 0, 93378: 0, 88226: 0}, 'gene/protein': {10687: 0}, 'anatomy': {72490: 0}, 'drug': {19591: 0}}
Cop confidences with context: {'effect/p

 67%|██████▋   | 118041/177004 [19:32:12<61:02:23,  3.73s/it]

Wrong response format. Node 38011 ignored


 67%|██████▋   | 118100/177004 [19:35:42<42:01:01,  2.57s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2985362708568573
Cop confidence without context: 0.1623113453388214
LLM is correct without context: False
Confidences with context: {'disease': {29618: 0.3546680808067322, 94910: 0.319806843996048, 38424: 0.33304494619369507}}
Accuracies with context: {'disease': {29618: 0, 94910: 0, 38424: 0}}
Cop confidences with context: {'disease': {29618: 0.15135376155376434, 94910: 0.16984812915325165, 38424: 0.16486918926239014}}


 67%|██████▋   | 118200/177004 [19:41:40<68:34:41,  4.20s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.43858602643013
Cop confidence without context: 0.10018250346183777
LLM is correct without context: False
Confidences with context: {'disease': {33188: 0.45114484429359436, 83861: 0.4562498927116394, 32516: 0.3651033043861389, 96034: 0.4599079191684723, 97099: 0.43975383043289185, 37665: 0.4453558921813965, 37902: 0.4497978389263153, 36030: 0.4454958736896515, 37515: 0.44805803894996643, 96035: 0.4643336832523346, 84253: 0.4472256600856781, 95395: 0.43679654598236084}, 'gene/protein': {3557: 0.40801623463630676, 2090: 0.3927185535430908}, 'effect/phenotype': {26768: 0.43793559074401855}}
Accuracies with context: {'disease': {33188: 0, 83861: 0, 32516: 0, 96034: 0, 97099: 0, 37665: 0, 37902: 0, 36030: 0, 37515: 0, 96035: 0, 84253: 0, 95395: 0}, 'gene/protein': {3557: 0, 2090: 0}, 'effect/phenotype': {26768: 0}}
Cop confidences with context: {'disease': {33188: 0.11055780202150345, 83861: 0.13806550204753876, 325

 67%|██████▋   | 118300/177004 [19:48:04<68:38:43,  4.21s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.29103630781173706
Cop confidence without context: 0.1835542917251587
LLM is correct without context: False
Confidences with context: {'anatomy': {64397: 0.30519771575927734, 68094: 0.2792447507381439, 66710: 0.30936700105667114, 65967: 0.309349924325943, 65872: 0.304277241230011, 71104: 0.2881757616996765, 68096: 0.302295058965683, 71106: 0.30176833271980286, 71105: 0.30236732959747314, 70308: 0.335259348154068, 71103: 0.3179242014884949}, 'drug': {14312: 0.3029128611087799, 15545: 0.29643023014068604, 21513: 0.34651070833206177}}
Accuracies with context: {'anatomy': {64397: 0, 68094: 0, 66710: 0, 65967: 0, 65872: 0, 71104: 0, 68096: 0, 71106: 0, 71105: 0, 70308: 0, 71103: 0}, 'drug': {14312: 0, 15545: 0, 21513: 0}}
Cop confidences with context: {'anatomy': {64397: 0.19705045223236084, 68094: 0.19042836129665375, 66710: 0.1832938939332962, 65967: 0.18328377604484558, 65872: 0.19492730498313904, 71104: 0.207565

 67%|██████▋   | 118400/177004 [19:54:07<54:55:15,  3.37s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2708447277545929
Cop confidence without context: 0.22986295819282532
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {86587: 0.3021729290485382, 86575: 0.30386561155319214, 92888: 0.3033111095428467}, 'anatomy': {67894: 0.2994135022163391, 72109: 0.30270853638648987}, 'gene/protein': {8139: 0.30483099818229675}, 'disease': {37710: 0.2859953045845032, 36328: 0.3018331825733185, 95799: 0.2910075783729553, 95638: 0.31102484464645386, 95793: 0.3037000000476837, 37429: 0.29832154512405396, 99851: 0.3020963668823242, 33283: 0.29817765951156616}}
Accuracies with context: {'effect/phenotype': {86587: 0, 86575: 0, 92888: 0}, 'anatomy': {67894: 0, 72109: 0}, 'gene/protein': {8139: 0}, 'disease': {37710: 0, 36328: 0, 95799: 0, 95638: 0, 95793: 0, 37429: 0, 99851: 0, 33283: 0}}
Cop confidences with context: {'effect/phenotype': {86587: 0.22988103330135345, 86575: 0.2205827832221985, 92

 67%|██████▋   | 118500/177004 [20:00:07<73:28:57,  4.52s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.45841798186302185
Cop confidence without context: 0.22167646884918213
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {92633: 0.5071669816970825}, 'anatomy': {63120: 0.6246824860572815}}
Accuracies with context: {'effect/phenotype': {92633: 0}, 'anatomy': {63120: 0}}
Cop confidences with context: {'effect/phenotype': {92633: 0.14644542336463928}, 'anatomy': {63120: 0.06900082528591156}}


 67%|██████▋   | 118536/177004 [20:02:17<80:10:26,  4.94s/it]

Wrong response format. Node 83790 ignored


 67%|██████▋   | 118600/177004 [20:05:57<58:11:49,  3.59s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2787415087223053
Cop confidence without context: 0.20553110539913177
LLM is correct without context: False
Confidences with context: {'disease': {30047: 0.4976325035095215, 38478: 0.38489818572998047, 83833: 0.3908851444721222, 95149: 0.7504812479019165, 83960: 0.33928436040878296, 31777: 0.523152232170105, 32854: 0.4393586218357086, 29487: 0.33207613229751587, 33545: 0.38275811076164246, 31546: 0.4183276891708374, 32277: 0.5152623057365417, 33095: 0.48028868436813354, 28024: 0.3619025647640228}, 'effect/phenotype': {85074: 0.30060815811157227, 23118: 0.3433040380477905, 85184: 0.6869676113128662, 24304: 0.5172688961029053}}
Accuracies with context: {'disease': {30047: 0, 38478: 1, 83833: 0, 95149: 0, 83960: 0, 31777: 0, 32854: 0, 29487: 0, 33545: 0, 31546: 0, 32277: 0, 33095: 0, 28024: 0}, 'effect/phenotype': {85074: 1, 23118: 1, 85184: 1, 24304: 1}}
Cop confidences with context: {'disease': {30047: 0.1641017

 67%|██████▋   | 118700/177004 [20:11:58<61:08:33,  3.78s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3199884593486786
Cop confidence without context: 0.2082204520702362
LLM is correct without context: False
Confidences with context: {'disease': {84139: 0.36027514934539795, 36752: 0.3329418897628784, 95955: 0.33360618352890015, 98497: 0.3419443368911743}, 'effect/phenotype': {93459: 0.3593176007270813, 93298: 0.2837463915348053}}
Accuracies with context: {'disease': {84139: 0, 36752: 0, 95955: 0, 98497: 0}, 'effect/phenotype': {93459: 0, 93298: 0}}
Cop confidences with context: {'disease': {84139: 0.16754351556301117, 36752: 0.1882282793521881, 95955: 0.18423481285572052, 98497: 0.18591190874576569}, 'effect/phenotype': {93459: 0.16579784452915192, 93298: 0.16943269968032837}}


 67%|██████▋   | 118716/177004 [20:13:02<83:29:19,  5.16s/it]

Wrong response format. Node 97867 ignored


 67%|██████▋   | 118800/177004 [20:18:16<46:24:21,  2.87s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2653333246707916
Cop confidence without context: 0.2434835433959961
LLM is correct without context: False
Confidences with context: {'disease': {84122: 0.277470201253891, 98508: 0.29228875041007996, 94933: 0.2888834774494171, 32914: 0.3146471679210663, 33304: 0.29296064376831055}}
Accuracies with context: {'disease': {84122: 1, 98508: 1, 94933: 1, 32914: 1, 33304: 1}}
Cop confidences with context: {'disease': {84122: 0.277470201253891, 98508: 0.29228875041007996, 94933: 0.2888834774494171, 32914: 0.3146471679210663, 33304: 0.29296064376831055}}


 67%|██████▋   | 118900/177004 [20:23:51<73:16:28,  4.54s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4499327838420868
Cop confidence without context: 0.21420012414455414
LLM is correct without context: False
Confidences with context: {'disease': {97048: 0.5017517805099487, 33268: 0.48385462164878845}, 'effect/phenotype': {22720: 0.35367223620414734, 85218: 0.3748380243778229, 24111: 0.3485577404499054, 23309: 0.48843345046043396, 24939: 0.4681032598018646, 24093: 0.4785236716270447, 23623: 0.45712074637413025}, 'anatomy': {67894: 0.5856974720954895}}
Accuracies with context: {'disease': {97048: 0, 33268: 0}, 'effect/phenotype': {22720: 0, 85218: 0, 24111: 0, 23309: 0, 24939: 0, 24093: 0, 23623: 0}, 'anatomy': {67894: 0}}
Cop confidences with context: {'disease': {97048: 0.1803082823753357, 33268: 0.17661495506763458}, 'effect/phenotype': {22720: 0.28197208046913147, 85218: 0.25165435671806335, 24111: 0.29813724756240845, 23309: 0.1912732869386673, 24939: 0.1966649442911148, 24093: 0.19792599976062775, 23623: 

 67%|██████▋   | 118948/177004 [20:26:44<70:56:42,  4.40s/it]

Wrong response format. Node 83788 ignored


 67%|██████▋   | 118965/177004 [20:27:44<60:38:54,  3.76s/it]

Wrong response format. Node 14504 ignored


 67%|██████▋   | 119000/177004 [20:29:39<58:56:49,  3.66s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.277280330657959
Cop confidence without context: 0.22987347841262817
LLM is correct without context: False
Confidences with context: {'gene/protein': {1450: 0.2655560374259949}, 'anatomy': {74009: 0.2975700795650482}, 'effect/phenotype': {91900: 0.2583780884742737}, 'disease': {31494: 0.30873388051986694, 95293: 0.2998252213001251, 32626: 0.3288401663303375, 83801: 0.31651684641838074}}
Accuracies with context: {'gene/protein': {1450: 0}, 'anatomy': {74009: 0}, 'effect/phenotype': {91900: 0}, 'disease': {31494: 0, 95293: 0, 32626: 0, 83801: 0}}
Cop confidences with context: {'gene/protein': {1450: 0.2271421253681183}, 'anatomy': {74009: 0.21601273119449615}, 'effect/phenotype': {91900: 0.2352559119462967}, 'disease': {31494: 0.2040606439113617, 95293: 0.20606665313243866, 32626: 0.18590937554836273, 83801: 0.1980711966753006}}


 67%|██████▋   | 119012/177004 [20:30:20<39:51:12,  2.47s/it]

Wrong response format. Node 20231 ignored
Wrong response format. Node 84014 ignored


 67%|██████▋   | 119052/177004 [20:32:48<59:17:18,  3.68s/it]

Wrong response format. Node 94180 ignored


 67%|██████▋   | 119100/177004 [20:35:42<51:20:22,  3.19s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.334530234336853
Cop confidence without context: 0.2883830964565277
LLM is correct without context: False
Confidences with context: {'disease': {84284: 0.3252798020839691, 97048: 0.3054422438144684, 38360: 0.31712356209754944, 95969: 0.27340996265411377, 28599: 0.5564083456993103, 95018: 0.2749301493167877, 28774: 0.2767033874988556, 36201: 0.303437203168869, 97444: 0.2548940181732178, 29193: 0.31393298506736755}, 'anatomy': {76455: 0.32550492882728577, 64607: 0.31837964057922363, 69098: 0.34519249200820923, 67894: 0.3124070465564728, 69091: 0.3382410407066345}, 'effect/phenotype': {24319: 0.28479743003845215, 26435: 0.320778489112854, 87187: 0.2881490886211395, 24421: 0.27765578031539917, 86964: 0.26783278584480286}}
Accuracies with context: {'disease': {84284: 0, 97048: 0, 38360: 0, 95969: 0, 28599: 0, 95018: 0, 28774: 1, 36201: 1, 97444: 0, 29193: 0}, 'anatomy': {76455: 0, 64607: 0, 69098: 1, 67894: 0, 69091

 67%|██████▋   | 119107/177004 [20:36:13<63:05:23,  3.92s/it]

Wrong response format. Node 92610 ignored
Wrong response format. Node 34114 ignored


 67%|██████▋   | 119143/177004 [20:38:19<28:41:20,  1.78s/it]

Wrong response format. Question 119142 ignored


 67%|██████▋   | 119171/177004 [20:39:48<45:23:17,  2.83s/it]

Wrong response format. Question 119170 ignored


 67%|██████▋   | 119200/177004 [20:41:29<52:49:48,  3.29s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.514490008354187
Cop confidence without context: 0.16189144551753998
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {23854: 0.5259877443313599}, 'anatomy': {76542: 0.5370728969573975, 73349: 0.5230104327201843}, 'disease': {37721: 0.47867631912231445, 98416: 0.5502095818519592, 83925: 0.4822048246860504, 39169: 0.5388006567955017, 39365: 0.5172079801559448, 28017: 0.47331222891807556, 97510: 0.640254557132721, 99403: 0.4976661503314972, 33346: 0.5664697289466858, 30177: 0.4567718505859375}, 'drug': {15531: 0.4243098795413971, 14783: 0.5227097272872925, 20327: 0.4889696538448334}}
Accuracies with context: {'effect/phenotype': {23854: 0}, 'anatomy': {76542: 0, 73349: 0}, 'disease': {37721: 0, 98416: 0, 83925: 0, 39169: 0, 39365: 0, 28017: 0, 97510: 0, 99403: 0, 33346: 0, 30177: 0}, 'drug': {15531: 0, 14783: 0, 20327: 0}}
Cop confidences with context: {'effect/phenotype': {238

 67%|██████▋   | 119300/177004 [20:47:34<45:44:26,  2.85s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3617001175880432
Cop confidence without context: 0.1951225847005844
LLM is correct without context: False
Confidences with context: {'anatomy': {68756: 0.46725761890411377}, 'disease': {84104: 0.3768381178379059, 27275: 0.2934158742427826, 36469: 0.47094792127609253, 84257: 0.6149753332138062, 32017: 0.3563954830169678, 38144: 0.4220375418663025, 31565: 0.4212509095668793, 27978: 0.35700684785842896, 33169: 0.4361966550350189, 32530: 0.4231632947921753}, 'effect/phenotype': {22726: 0.5037310719490051, 87984: 0.46625474095344543, 84776: 0.4350854158401489, 23391: 0.47178468108177185}}
Accuracies with context: {'anatomy': {68756: 0}, 'disease': {84104: 0, 27275: 0, 36469: 1, 84257: 0, 32017: 0, 38144: 0, 31565: 0, 27978: 0, 33169: 0, 32530: 0}, 'effect/phenotype': {22726: 0, 87984: 0, 84776: 0, 23391: 0}}
Cop confidences with context: {'anatomy': {68756: 0.1505158394575119}, 'disease': {84104: 0.1808089017868042

 67%|██████▋   | 119387/177004 [20:52:28<48:12:51,  3.01s/it]

Wrong response format. Node 14054 ignored


 67%|██████▋   | 119400/177004 [20:53:03<39:16:42,  2.45s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.49789655208587646
Cop confidence without context: 0.15667009353637695
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {89276: 0.2888166010379791}, 'anatomy': {75593: 0.3339904546737671}, 'disease': {33043: 0.31907472014427185}}
Accuracies with context: {'effect/phenotype': {89276: 0}, 'anatomy': {75593: 0}, 'disease': {33043: 0}}
Cop confidences with context: {'effect/phenotype': {89276: 0.2451154738664627}, 'anatomy': {75593: 0.24245108664035797}, 'disease': {33043: 0.22275005280971527}}


 68%|██████▊   | 119500/177004 [20:59:02<35:01:34,  2.19s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3011530041694641
Cop confidence without context: 0.24198324978351593
LLM is correct without context: False
Confidences with context: {'disease': {27963: 0.31218448281288147, 32421: 0.28290534019470215, 97187: 0.279691219329834, 84072: 0.34241023659706116, 32880: 0.30046841502189636, 38486: 0.29292529821395874}}
Accuracies with context: {'disease': {27963: 1, 32421: 0, 97187: 1, 84072: 0, 32880: 0, 38486: 1}}
Cop confidences with context: {'disease': {27963: 0.31218448281288147, 32421: 0.21188631653785706, 97187: 0.279691219329834, 84072: 0.2905998229980469, 32880: 0.2164193093776703, 38486: 0.29292529821395874}}


 68%|██████▊   | 119600/177004 [21:04:56<72:06:31,  4.52s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.37060025334358215
Cop confidence without context: 0.1352752447128296
LLM is correct without context: False
Confidences with context: {'gene/protein': {9288: 0.2702195942401886}, 'effect/phenotype': {84609: 0.556567907333374}, 'disease': {29056: 0.35251644253730774, 31038: 0.3095869719982147, 32168: 0.295166552066803, 29136: 0.30432796478271484, 27596: 0.31933727860450745, 30116: 0.33868902921676636, 29968: 0.37683212757110596, 27956: 0.3164750635623932, 29196: 0.2751716673374176, 32402: 0.31534281373023987}}
Accuracies with context: {'gene/protein': {9288: 0}, 'effect/phenotype': {84609: 0}, 'disease': {29056: 0, 31038: 0, 32168: 0, 29136: 0, 27596: 0, 30116: 0, 29968: 0, 27956: 0, 29196: 0, 32402: 0}}
Cop confidences with context: {'gene/protein': {9288: 0.19769686460494995}, 'effect/phenotype': {84609: 0.0808100774884224}, 'disease': {29056: 0.16393537819385529, 31038: 0.15689104795455933, 32168: 0.165573418

 68%|██████▊   | 119613/177004 [21:05:39<65:43:49,  4.12s/it]

Wrong response format. Node 92615 ignored


 68%|██████▊   | 119700/177004 [21:10:57<58:24:34,  3.67s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.30423879623413086
Cop confidence without context: 0.22085368633270264
LLM is correct without context: False
Confidences with context: {'drug': {14581: 0.3396921455860138, 14127: 0.29406628012657166, 14595: 0.3622296154499054, 16523: 0.3822892904281616, 21408: 0.3485028147697449, 17195: 0.33088773488998413, 21402: 0.32145342230796814, 21418: 0.37378746271133423, 14214: 0.35724738240242004, 21400: 0.37145495414733887, 21034: 0.3485439717769623, 17172: 0.3602796494960785, 17196: 0.34447088837623596, 21393: 0.354554146528244}, 'disease': {97059: 0.3305678069591522, 84034: 0.37835946679115295}, 'gene/protein': {5626: 0.3071558475494385}}
Accuracies with context: {'drug': {14581: 0, 14127: 0, 14595: 0, 16523: 0, 21408: 0, 17195: 0, 21402: 0, 21418: 0, 14214: 0, 21400: 0, 21034: 0, 17172: 0, 17196: 0, 21393: 0}, 'disease': {97059: 0, 84034: 0}, 'gene/protein': {5626: 0}}
Cop confidences with context: {'drug': {14581:

 68%|██████▊   | 119771/177004 [21:14:50<52:55:24,  3.33s/it]

Wrong response format. Node 92413 ignored


 68%|██████▊   | 119800/177004 [21:16:44<70:30:07,  4.44s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.3158267140388489
Cop confidence without context: 0.2597920000553131
LLM is correct without context: False
Confidences with context: {'drug': {16014: 0.3177681863307953, 14952: 0.32096096873283386, 18592: 0.3355034291744232, 15223: 0.3450033664703369, 14161: 0.3159435987472534, 15098: 0.28218474984169006, 17336: 0.3336831331253052, 21850: 0.3330320119857788, 21492: 0.3758975863456726, 21402: 0.31061506271362305}}
Accuracies with context: {'drug': {16014: 1, 14952: 0, 18592: 1, 15223: 1, 14161: 0, 15098: 0, 17336: 1, 21850: 1, 21492: 1, 21402: 1}}
Cop confidences with context: {'drug': {16014: 0.3177681863307953, 14952: 0.29916849732398987, 18592: 0.3355034291744232, 15223: 0.3450033664703369, 14161: 0.3086247742176056, 15098: 0.2650880515575409, 17336: 0.3336831331253052, 21850: 0.3330320119857788, 21492: 0.3758975863456726, 21402: 0.31061506271362305}}


 68%|██████▊   | 119818/177004 [21:17:59<71:12:30,  4.48s/it]

Wrong response format. Node 21505 ignored


 68%|██████▊   | 119900/177004 [21:22:57<54:39:48,  3.45s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4014832079410553
Cop confidence without context: 0.28692713379859924
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {24317: 0.3458426594734192, 85145: 0.3480905592441559, 90154: 0.3975958228111267, 86958: 0.3461555242538452}, 'disease': {94774: 0.34850043058395386, 35834: 0.38923588395118713, 94989: 0.41374361515045166, 32521: 0.36239778995513916, 35460: 0.35498225688934326, 30356: 0.3936997652053833, 35514: 0.38202422857284546}, 'drug': {21377: 0.3736584782600403}, 'anatomy': {69901: 0.3862496614456177, 67114: 0.38865184783935547, 75900: 0.3927186131477356, 76320: 0.3906174302101135}, 'gene/protein': {2422: 0.38289836049079895, 5352: 0.38160523772239685, 6004: 0.36666402220726013}}
Accuracies with context: {'effect/phenotype': {24317: 0, 85145: 0, 90154: 0, 86958: 0}, 'disease': {94774: 0, 35834: 0, 94989: 0, 32521: 0, 35460: 0, 30356: 0, 35514: 0}, 'drug': {21377: 0}, 'a

 68%|██████▊   | 120000/177004 [21:28:38<73:37:36,  4.65s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.4616127014160156
Cop confidence without context: 0.13968834280967712
LLM is correct without context: False
Confidences with context: {'disease': {98408: 0.32427626848220825, 39774: 0.3264292776584625, 37394: 0.42703720927238464}, 'drug': {20381: 0.3562329113483429, 20380: 0.32421085238456726}}
Accuracies with context: {'disease': {98408: 0, 39774: 0, 37394: 0}, 'drug': {20381: 0, 20380: 0}}
Cop confidences with context: {'disease': {98408: 0.2061225324869156, 39774: 0.18311022222042084, 37394: 0.16986367106437683}, 'drug': {20381: 0.18053004145622253, 20380: 0.1832921952009201}}


 68%|██████▊   | 120070/177004 [21:32:20<43:48:43,  2.77s/it]

Wrong response format. Question 120069 ignored


 68%|██████▊   | 120100/177004 [21:33:52<40:45:43,  2.58s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.34768983721733093
Cop confidence without context: 0.07637732475996017
LLM is correct without context: False
Confidences with context: {'drug': {20503: 0.9646089673042297, 21517: 0.3750758767127991, 21527: 0.3150551915168762}, 'effect/phenotype': {91234: 0.4186151921749115, 85442: 0.36715254187583923}, 'disease': {98546: 0.5593435168266296, 27830: 0.9393743276596069, 97548: 0.9720444083213806, 32060: 0.9811621904373169, 38351: 0.9718347191810608, 30095: 0.2889672517776489, 27444: 0.8613379597663879, 28711: 0.29676178097724915, 98063: 0.9753678441047668, 33061: 0.3015596568584442, 31518: 0.2890441119670868, 38778: 0.335430771112442}}
Accuracies with context: {'drug': {20503: 0, 21517: 0, 21527: 0}, 'effect/phenotype': {91234: 0, 85442: 0}, 'disease': {98546: 0, 27830: 0, 97548: 1, 32060: 0, 38351: 0, 30095: 0, 27444: 1, 28711: 0, 98063: 0, 33061: 0, 31518: 0, 38778: 0}}
Cop confidences with context: {'drug': {20

 68%|██████▊   | 120200/177004 [21:39:49<63:09:20,  4.00s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.42274177074432373
Cop confidence without context: 0.42274177074432373
LLM is correct without context: True
Confidences with context: {'disease': {37788: 0.3753329813480377, 83932: 0.3597625494003296, 36322: 0.6461317539215088, 36381: 0.429116427898407, 97223: 0.3682185709476471, 83762: 0.38967978954315186, 96664: 0.4078815281391144, 96831: 0.39334261417388916, 97204: 0.40778255462646484}, 'drug': {20856: 0.3664528727531433, 20422: 0.38765278458595276, 20482: 0.3558897078037262, 20484: 0.35633981227874756}, 'anatomy': {70999: 0.46276113390922546, 73560: 0.411451518535614}, 'effect/phenotype': {85556: 0.3760546147823334}}
Accuracies with context: {'disease': {37788: 1, 83932: 1, 36322: 0, 36381: 0, 97223: 1, 83762: 0, 96664: 0, 96831: 0, 97204: 0}, 'drug': {20856: 1, 20422: 1, 20482: 1, 20484: 1}, 'anatomy': {70999: 1, 73560: 1}, 'effect/phenotype': {85556: 1}}
Cop confidences with context: {'disease': {37788: 0

 68%|██████▊   | 120202/177004 [21:40:01<83:01:24,  5.26s/it]

Wrong response format. Node 85707 ignored
Wrong response format. Node 94456 ignored


 68%|██████▊   | 120238/177004 [21:42:27<48:34:27,  3.08s/it]

Wrong response format. Node 92615 ignored
Wrong response format. Node 83956 ignored


 68%|██████▊   | 120264/177004 [21:44:08<50:36:03,  3.21s/it]

Wrong response format. Node 14286 ignored


 68%|██████▊   | 120268/177004 [21:44:14<29:59:09,  1.90s/it]

Wrong response format. Question 120267 ignored


 68%|██████▊   | 120300/177004 [21:46:03<49:32:41,  3.15s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.34100285172462463
Cop confidence without context: 0.19429755210876465
LLM is correct without context: False
Confidences with context: {'disease': {33667: 0.36688053607940674, 39815: 0.35561344027519226, 36066: 0.35486501455307007, 38926: 0.32328057289123535, 95021: 0.37133488059043884, 95023: 0.35874950885772705, 95022: 0.32428911328315735, 28235: 0.5505738854408264, 29511: 0.4827828109264374, 33088: 0.5014485716819763, 33277: 0.4307065010070801, 30369: 0.4585733115673065}, 'drug': {14187: 0.33392372727394104}, 'effect/phenotype': {84857: 0.3533117175102234, 22272: 0.31933534145355225}}
Accuracies with context: {'disease': {33667: 0, 39815: 0, 36066: 0, 38926: 0, 95021: 0, 95023: 0, 95022: 0, 28235: 0, 29511: 0, 33088: 0, 33277: 0, 30369: 0}, 'drug': {14187: 0}, 'effect/phenotype': {84857: 0, 22272: 0}}
Cop confidences with context: {'disease': {33667: 0.151748389005661, 39815: 0.1640886515378952, 36066: 0.170

 68%|██████▊   | 120400/177004 [21:52:01<51:36:35,  3.28s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.33369386196136475
Cop confidence without context: 0.13694769144058228
LLM is correct without context: False
Confidences with context: {'disease': {33249: 0.29684126377105713, 99396: 0.464932918548584, 96563: 0.6706057786941528, 36380: 0.29610785841941833, 32595: 0.2894740402698517, 33547: 0.27981770038604736, 28973: 0.2858412265777588, 32632: 0.2940472662448883, 32157: 0.301172137260437, 32947: 0.27074557542800903, 30269: 0.3546714782714844}, 'gene/protein': {8584: 0.26574504375457764, 9524: 0.2864842116832733}, 'effect/phenotype': {84795: 0.2571619153022766, 22157: 0.32777902483940125, 22152: 0.243175208568573}}
Accuracies with context: {'disease': {33249: 0, 99396: 0, 96563: 0, 36380: 0, 32595: 0, 33547: 0, 28973: 0, 32632: 0, 32157: 0, 32947: 0, 30269: 0}, 'gene/protein': {8584: 0, 9524: 0}, 'effect/phenotype': {84795: 0, 22157: 0, 22152: 0}}
Cop confidences with context: {'disease': {33249: 0.1516116857528

 68%|██████▊   | 120467/177004 [21:56:02<46:50:32,  2.98s/it]

Wrong response format. Question 120466 ignored


 68%|██████▊   | 120500/177004 [21:57:26<39:35:56,  2.52s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.32052212953567505
Cop confidence without context: 0.32052212953567505
LLM is correct without context: True
Confidences with context: {'disease': {98001: 0.33462288975715637, 37343: 0.6604224443435669, 36626: 0.5632168650627136, 37119: 0.8287829756736755, 97009: 0.2695535123348236, 37048: 0.32580241560935974, 96847: 0.5283749103546143, 97529: 0.7825589179992676, 36885: 0.7614912390708923, 96597: 0.5844244956970215, 37793: 0.3042101263999939, 97309: 0.7738059163093567, 37518: 0.806744396686554, 96491: 0.5870870351791382, 96266: 0.8007608652114868}}
Accuracies with context: {'disease': {98001: 1, 37343: 1, 36626: 0, 37119: 0, 97009: 0, 37048: 1, 96847: 1, 97529: 0, 36885: 0, 96597: 1, 37793: 1, 97309: 0, 37518: 0, 96491: 1, 96266: 0}}
Cop confidences with context: {'disease': {98001: 0.33462288975715637, 37343: 0.6604224443435669, 36626: 0.2311437427997589, 37119: 0.07019016891717911, 97009: 0.2269868403673172, 3

 68%|██████▊   | 120564/177004 [22:01:08<42:28:55,  2.71s/it]

Wrong response format. Node 74561 ignored
Wrong response format. Node 34574 ignored


 68%|██████▊   | 120600/177004 [22:03:04<73:15:33,  4.68s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.4128338396549225
Cop confidence without context: 0.17209474742412567
LLM is correct without context: False
Confidences with context: {'disease': {33670: 0.39533039927482605, 38047: 0.3865934908390045, 95248: 0.36508068442344666}, 'effect/phenotype': {94436: 0.3333964943885803}}
Accuracies with context: {'disease': {33670: 0, 38047: 0, 95248: 0}, 'effect/phenotype': {94436: 0}}
Cop confidences with context: {'disease': {33670: 0.18099544942378998, 38047: 0.18119272589683533, 95248: 0.20479170978069305}, 'effect/phenotype': {94436: 0.18126453459262848}}


 68%|██████▊   | 120648/177004 [22:06:01<62:19:52,  3.98s/it]

Wrong response format. Node 19350 ignored
Wrong response format. Node 16282 ignored


 68%|██████▊   | 120700/177004 [22:09:06<51:30:42,  3.29s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3829386532306671
Cop confidence without context: 0.2233661413192749
LLM is correct without context: False
Confidences with context: {'disease': {97485: 0.3969953954219818}, 'anatomy': {73329: 0.3971114754676819}, 'effect/phenotype': {89929: 0.3817066252231598, 25991: 0.35493505001068115, 90405: 0.30443358421325684, 90406: 0.27744799852371216}}
Accuracies with context: {'disease': {97485: 0}, 'anatomy': {73329: 0}, 'effect/phenotype': {89929: 0, 25991: 1, 90405: 1, 90406: 0}}
Cop confidences with context: {'disease': {97485: 0.20276542007923126}, 'anatomy': {73329: 0.1818108856678009}, 'effect/phenotype': {89929: 0.20114527642726898, 25991: 0.35493505001068115, 90405: 0.30443358421325684, 90406: 0.26474273204803467}}


 68%|██████▊   | 120800/177004 [22:14:51<66:08:42,  4.24s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3023788630962372
Cop confidence without context: 0.2336602658033371
LLM is correct without context: False
Confidences with context: {'gene/protein': {8844: 0.4060836732387543}}
Accuracies with context: {'gene/protein': {8844: 0}}
Cop confidences with context: {'gene/protein': {8844: 0.18884646892547607}}


 68%|██████▊   | 120822/177004 [22:16:04<57:26:23,  3.68s/it]

Wrong response format. Node 94180 ignored


 68%|██████▊   | 120900/177004 [22:20:08<41:18:46,  2.65s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3420743942260742
Cop confidence without context: 0.2542080879211426
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {89759: 0.4157397150993347}, 'disease': {94617: 0.4963315725326538, 98002: 0.31376850605010986, 32224: 0.30750107765197754, 98350: 0.4182177782058716, 39623: 0.4304333031177521, 36938: 0.38944196701049805, 94837: 0.4373261332511902}, 'gene/protein': {13586: 0.41645103693008423}}
Accuracies with context: {'effect/phenotype': {89759: 0}, 'disease': {94617: 0, 98002: 0, 32224: 1, 98350: 0, 39623: 0, 36938: 0, 94837: 0}, 'gene/protein': {13586: 0}}
Cop confidences with context: {'effect/phenotype': {89759: 0.19033952057361603}, 'disease': {94617: 0.15617765486240387, 98002: 0.2580989599227905, 32224: 0.30750107765197754, 98350: 0.18703855574131012, 39623: 0.18368639051914215, 36938: 0.2004675567150116, 94837: 0.172602578997612}, 'gene/protein': {13586: 0.187709197

 68%|██████▊   | 121000/177004 [22:25:58<51:09:23,  3.29s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.3430830240249634
Cop confidence without context: 0.3430830240249634
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {85395: 0.35526084899902344, 85451: 0.3108551502227783}, 'disease': {33461: 0.3562348783016205, 84300: 0.33935806155204773, 31963: 0.3344525694847107}, 'anatomy': {69414: 0.32228022813796997}, 'drug': {14741: 0.3435731828212738, 16180: 0.35199639201164246, 14581: 0.371751070022583, 14214: 0.35115760564804077, 17203: 0.3313937485218048, 21402: 0.3510812222957611, 14870: 0.34242120385169983, 21034: 0.3644597828388214, 21411: 0.3768923580646515, 14863: 0.3427654206752777, 21393: 0.38348621129989624, 21400: 0.35523903369903564}}
Accuracies with context: {'effect/phenotype': {85395: 0, 85451: 0}, 'disease': {33461: 0, 84300: 0, 31963: 0}, 'anatomy': {69414: 1}, 'drug': {14741: 0, 16180: 0, 14581: 0, 14214: 0, 17203: 0, 21402: 0, 14870: 0, 21034: 0, 21411: 0, 14863: 

 68%|██████▊   | 121004/177004 [22:26:15<57:06:28,  3.67s/it]

Wrong response format. Node 63288 ignored


 68%|██████▊   | 121100/177004 [22:31:36<49:50:34,  3.21s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.28648823499679565
Cop confidence without context: 0.20156966149806976
LLM is correct without context: False
Confidences with context: {'disease': {28473: 0.26961228251457214, 32595: 0.25807440280914307, 29106: 0.2602607309818268, 33073: 0.2608291804790497, 33074: 0.2644372582435608}, 'anatomy': {74025: 0.275848925113678}, 'effect/phenotype': {92363: 0.2561348080635071, 26199: 0.2746586501598358, 22452: 0.2628800570964813, 26105: 0.2577265202999115, 91891: 0.2574104070663452}, 'gene/protein': {1491: 0.2598586082458496, 8021: 0.2548222839832306}}
Accuracies with context: {'disease': {28473: 0, 32595: 0, 29106: 1, 33073: 0, 33074: 0}, 'anatomy': {74025: 0}, 'effect/phenotype': {92363: 0, 26199: 0, 22452: 1, 26105: 0, 91891: 0}, 'gene/protein': {1491: 0, 8021: 0}}
Cop confidences with context: {'disease': {28473: 0.2397981435060501, 32595: 0.25013428926467896, 29106: 0.2602607309818268, 33073: 0.2393503189086914, 

 68%|██████▊   | 121131/177004 [22:33:35<59:25:57,  3.83s/it]

Wrong response format. Question 121130 ignored


 68%|██████▊   | 121187/177004 [22:36:50<42:40:16,  2.75s/it]

Wrong response format. Question 121186 ignored


 68%|██████▊   | 121200/177004 [22:37:29<45:20:51,  2.93s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.7480243444442749
Cop confidence without context: 0.7480243444442749
LLM is correct without context: True
Confidences with context: {'disease': {31528: 0.337747722864151, 28038: 0.536617636680603, 38222: 0.6527292728424072, 35825: 0.7403199076652527}, 'effect/phenotype': {26958: 0.7172864079475403}, 'gene/protein': {3026: 0.55729740858078}}
Accuracies with context: {'disease': {31528: 1, 28038: 1, 38222: 1, 35825: 1}, 'effect/phenotype': {26958: 1}, 'gene/protein': {3026: 1}}
Cop confidences with context: {'disease': {31528: 0.337747722864151, 28038: 0.536617636680603, 38222: 0.6527292728424072, 35825: 0.7403199076652527}, 'effect/phenotype': {26958: 0.7172864079475403}, 'gene/protein': {3026: 0.55729740858078}}


 69%|██████▊   | 121264/177004 [22:41:24<55:54:48,  3.61s/it]

Wrong response format. Node 14245 ignored


 69%|██████▊   | 121300/177004 [22:43:21<48:57:02,  3.16s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.3108612298965454
Cop confidence without context: 0.280839741230011
LLM is correct without context: False
Confidences with context: {'disease': {84213: 0.3272109031677246, 98020: 0.32935628294944763}, 'anatomy': {68822: 0.30110809206962585, 64417: 0.2841068506240845}}
Accuracies with context: {'disease': {84213: 1, 98020: 1}, 'anatomy': {68822: 1, 64417: 1}}
Cop confidences with context: {'disease': {84213: 0.3272109031677246, 98020: 0.32935628294944763}, 'anatomy': {68822: 0.30110809206962585, 64417: 0.2841068506240845}}


 69%|██████▊   | 121400/177004 [22:49:12<50:24:55,  3.26s/it]

Example Feedback:

Response without context: C
Confidence without context: 0.27980905771255493
Cop confidence without context: 0.275471031665802
LLM is correct without context: False
Confidences with context: {'anatomy': {66037: 0.2678616940975189}, 'disease': {37787: 0.9303577542304993, 36117: 0.9711761474609375, 28110: 0.4224032461643219, 97214: 0.8625013828277588, 83973: 0.6367195844650269, 38087: 0.9537120461463928, 99043: 0.9557861089706421}}
Accuracies with context: {'anatomy': {66037: 0}, 'disease': {37787: 0, 36117: 1, 28110: 0, 97214: 0, 83973: 1, 38087: 1, 99043: 1}}
Cop confidences with context: {'anatomy': {66037: 0.26165667176246643}, 'disease': {37787: 0.028986182063817978, 36117: 0.9711761474609375, 28110: 0.1471235603094101, 97214: 0.05688813328742981, 83973: 0.6367195844650269, 38087: 0.9537120461463928, 99043: 0.9557861089706421}}


 69%|██████▊   | 121500/177004 [22:55:29<64:19:51,  4.17s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3230811059474945
Cop confidence without context: 0.17565613985061646
LLM is correct without context: False
Confidences with context: {'anatomy': {74329: 0.4964290261268616}}
Accuracies with context: {'anatomy': {74329: 0}}
Cop confidences with context: {'anatomy': {74329: 0.14905501902103424}}


 69%|██████▊   | 121549/177004 [22:58:32<61:07:50,  3.97s/it]

Wrong response format. Node 20890 ignored


 69%|██████▊   | 121553/177004 [22:58:47<59:07:19,  3.84s/it]

Wrong response format. Node 97502 ignored


 69%|██████▊   | 121592/177004 [23:01:20<70:29:20,  4.58s/it]

Wrong response format. Node 36175 ignored
Wrong response format. Node 14610 ignored
Wrong response format. Node 95498 ignored
Wrong response format. Node 17172 ignored
Wrong response format. Node 14214 ignored
Wrong response format. Node 95565 ignored
Wrong response format. Node 17195 ignored
Wrong response format. Node 89308 ignored
Wrong response format. Node 17196 ignored
Wrong response format. Node 21408 ignored
Wrong response format. Node 21879 ignored


 69%|██████▊   | 121600/177004 [23:01:46<42:36:42,  2.77s/it]

Example Feedback:

Response without context: A
Confidence without context: 0.43672794103622437
Cop confidence without context: 0.43672794103622437
LLM is correct without context: True
Confidences with context: {'disease': {37772: 0.3611522614955902}}
Accuracies with context: {'disease': {37772: 1}}
Cop confidences with context: {'disease': {37772: 0.3611522614955902}}


 69%|██████▊   | 121618/177004 [23:02:48<56:22:59,  3.66s/it]

Wrong response format. Node 30114 ignored


 69%|██████▉   | 121700/177004 [23:07:33<56:44:09,  3.69s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.37695544958114624
Cop confidence without context: 0.19556190073490143
LLM is correct without context: False
Confidences with context: {'disease': {98981: 0.4092933237552643}}
Accuracies with context: {'disease': {98981: 0}}
Cop confidences with context: {'disease': {98981: 0.17061883211135864}}


 69%|██████▉   | 121723/177004 [23:08:43<37:25:30,  2.44s/it]

Wrong response format. Question 121722 ignored


 69%|██████▉   | 121800/177004 [23:13:24<63:58:07,  4.17s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.29744768142700195
Cop confidence without context: 0.29744768142700195
LLM is correct without context: True
Confidences with context: {'disease': {39169: 0.31051960587501526, 36175: 0.3388303518295288, 95498: 0.34103384613990784, 95565: 0.3383057713508606}, 'drug': {16067: 0.35502704977989197, 17172: 0.3469933569431305, 14214: 0.3273353576660156, 21398: 0.35939648747444153, 21402: 0.33350497484207153, 21400: 0.3402848243713379, 21408: 0.3425525426864624, 14870: 0.3449820578098297, 17195: 0.3386712968349457}, 'effect/phenotype': {91344: 0.3223262131214142}}
Accuracies with context: {'disease': {39169: 0, 36175: 0, 95498: 0, 95565: 0}, 'drug': {16067: 0, 17172: 0, 14214: 0, 21398: 0, 21402: 0, 21400: 0, 21408: 0, 14870: 0, 17195: 0}, 'effect/phenotype': {91344: 0}}
Cop confidences with context: {'disease': {39169: 0.2761818766593933, 36175: 0.27013909816741943, 95498: 0.2676805257797241, 95565: 0.2761171162128448

 69%|██████▉   | 121854/177004 [23:16:35<57:36:16,  3.76s/it]

Wrong response format. Node 22365 ignored


 69%|██████▉   | 121900/177004 [23:18:48<41:12:04,  2.69s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.2829325497150421
Cop confidence without context: 0.27637842297554016
LLM is correct without context: False
Confidences with context: {'disease': {35631: 0.5942975878715515, 37480: 0.7163794040679932, 98522: 0.6142691969871521, 35686: 0.6196397542953491, 37144: 0.9214130640029907, 37931: 0.8321374654769897, 96282: 0.848896861076355, 96142: 0.416280061006546, 97015: 0.32219263911247253, 96713: 0.723552405834198, 96440: 0.4159186780452728, 96570: 0.409094899892807, 95234: 0.33165210485458374, 97335: 0.47312918305397034, 35883: 0.5204614400863647}}
Accuracies with context: {'disease': {35631: 0, 37480: 0, 98522: 0, 35686: 0, 37144: 0, 37931: 0, 96282: 0, 96142: 0, 97015: 0, 96713: 0, 96440: 0, 96570: 0, 95234: 0, 97335: 0, 35883: 0}}
Cop confidences with context: {'disease': {35631: 0.1624719202518463, 37480: 0.020967213436961174, 98522: 0.08577119559049606, 35686: 0.07877834886312485, 37144: 0.03985641896724701, 

 69%|██████▉   | 121923/177004 [23:20:19<51:11:59,  3.35s/it]

Wrong response format. Node 4392 ignored


 69%|██████▉   | 122000/177004 [23:24:38<50:49:11,  3.33s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.3073364198207855
Cop confidence without context: 0.19383463263511658
LLM is correct without context: False
Confidences with context: {'effect/phenotype': {91284: 0.3470843732357025}, 'drug': {15671: 0.3451094627380371, 20896: 0.34098461270332336}, 'disease': {84224: 0.32885900139808655, 84205: 0.3244893550872803, 32439: 0.35100090503692627}}
Accuracies with context: {'effect/phenotype': {91284: 0}, 'drug': {15671: 0, 20896: 0}, 'disease': {84224: 0, 84205: 0, 32439: 0}}
Cop confidences with context: {'effect/phenotype': {91284: 0.17727336287498474}, 'drug': {15671: 0.1642967313528061, 20896: 0.18539011478424072}, 'disease': {84224: 0.16796475648880005, 84205: 0.19225358963012695, 32439: 0.18935078382492065}}


 69%|██████▉   | 122100/177004 [23:30:06<54:51:27,  3.60s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.8642176985740662
Cop confidence without context: 0.8642176985740662
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {23211: 0.7363947629928589}, 'disease': {83826: 0.6087790727615356, 35701: 0.5937246680259705, 37428: 0.390749990940094, 99682: 0.5878783464431763, 39296: 0.920876145362854, 95231: 0.6176710724830627, 37107: 0.7068573236465454, 37925: 0.7089900374412537}, 'gene/protein': {3943: 0.7804604172706604, 13865: 0.8271308541297913, 13223: 0.7676923274993896, 1395: 0.8230323791503906}}
Accuracies with context: {'effect/phenotype': {23211: 1}, 'disease': {83826: 1, 35701: 1, 37428: 1, 99682: 1, 39296: 1, 95231: 1, 37107: 1, 37925: 1}, 'gene/protein': {3943: 1, 13865: 1, 13223: 1, 1395: 1}}
Cop confidences with context: {'effect/phenotype': {23211: 0.7363947629928589}, 'disease': {83826: 0.6087790727615356, 35701: 0.5937246680259705, 37428: 0.390749990940094, 99682: 0.587

 69%|██████▉   | 122161/177004 [23:33:44<52:17:48,  3.43s/it]

Wrong response format. Node 96507 ignored
Wrong response format. Node 90756 ignored
Wrong response format. Node 33661 ignored


 69%|██████▉   | 122200/177004 [23:36:05<48:34:12,  3.19s/it]

Example Feedback:

Response without context: D
Confidence without context: 0.5835819840431213
Cop confidence without context: 0.5835819840431213
LLM is correct without context: True
Confidences with context: {'effect/phenotype': {89276: 0.5136265158653259}, 'gene/protein': {3590: 0.5234652161598206}, 'disease': {33043: 0.5090462565422058}}
Accuracies with context: {'effect/phenotype': {89276: 1}, 'gene/protein': {3590: 1}, 'disease': {33043: 1}}
Cop confidences with context: {'effect/phenotype': {89276: 0.5136265158653259}, 'gene/protein': {3590: 0.5234652161598206}, 'disease': {33043: 0.5090462565422058}}


 69%|██████▉   | 122300/177004 [23:41:57<58:34:28,  3.85s/it]

Example Feedback:

Response without context: B
Confidence without context: 0.2873021960258484
Cop confidence without context: 0.2873021960258484
LLM is correct without context: True
Confidences with context: {'anatomy': {63521: 0.29909247159957886}, 'disease': {36999: 0.3304433822631836, 30152: 0.3106319010257721, 84271: 0.328849732875824, 37594: 0.341179221868515, 37508: 0.3155207335948944}, 'gene/protein': {11011: 0.29304003715515137, 9373: 0.2969866693019867, 10055: 0.29171180725097656, 7563: 0.2920510768890381, 33995: 0.30487263202667236, 4385: 0.2916332185268402, 34001: 0.293792188167572, 6047: 0.28024566173553467}}
Accuracies with context: {'anatomy': {63521: 0}, 'disease': {36999: 0, 30152: 0, 84271: 0, 37594: 0, 37508: 0}, 'gene/protein': {11011: 0, 9373: 0, 10055: 0, 7563: 0, 33995: 0, 4385: 0, 34001: 0, 6047: 0}}
Cop confidences with context: {'anatomy': {63521: 0.24990111589431763}, 'disease': {36999: 0.24365368485450745, 30152: 0.22373977303504944, 84271: 0.2225108742713928

 69%|██████▉   | 122384/177004 [23:46:57<56:38:43,  3.73s/it]