In [14]:
import collections
import json
import os
from io import StringIO

import pandas as pd
from intertext_graph.itgraph import IntertextDocument

import config
from histogram import histogram
from metadata_analyzer import parse_json_object


def mean(l):
    return sum(l) / len(l)

# Evidence Inference Metadata Exploration

In [15]:
# configuration
config.load_config_json_file("../path_config_local.json", include_in_hash=False)

AssertionError: The value of "path.EVIDENCE-INFERENCE-ITG" cannot be changed after it has been accessed!

In [27]:
# load the dataset
documents = []
path = os.path.join(config.get("path.EVIDENCE-INFERENCE-ITG"), "deep-train.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)
path = os.path.join(config.get("path.EVIDENCE-INFERENCE-ITG"), "deep-dev.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)
path = os.path.join(config.get("path.EVIDENCE-INFERENCE-ITG"), "deep-test.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)

print(f"Loaded {len(documents)} documents.")

Loaded 4454 documents.


## Document Metadata Schema

In [28]:
import metadata_analyzer


def _value_to_pattern(value, point):
    if isinstance(value, dict):
        if point.key in ["prompts", "annotations"]:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.OBJECT, keys={"XXX"}, point=point,
                                              data=[value])
        else:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.OBJECT, keys=set(value.keys()),
                                              point=point, data=[value])
    if isinstance(value, list):
        return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.ARRAY, keys=[], point=point, data=[value])
    if isinstance(value, str):
        if metadata_analyzer._DIFF_EMPTY_STRING:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.STRING,
                                              keys=["empty" if value == "" else "non-empty"], point=point,
                                              data=[value])
        else:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.STRING, keys=[], point=point,
                                              data=[value])
    if isinstance(value, int) or isinstance(value, float):
        return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.NUMBER, keys=[], point=point,
                                          data=[value])
    if isinstance(value, bool):
        if metadata_analyzer._DIFF_TRUE_FALSE:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.STRING,
                                              keys=["true" if value else "false"], point=point, data=[value])
        else:
            return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.BOOL, keys=[], point=point,
                                              data=[value])
    if value is None:
        return metadata_analyzer._Pattern(kind=metadata_analyzer._PatternKind.NULL, keys=[], point=point, data=[value])

    raise ValueError(f"Unknown JSON value! {type(value)}")


metadata_analyzer._value_to_pattern = _value_to_pattern


def _process_point(point):
    # cluster the values into patterns
    for value in point.data:
        new_pattern = metadata_analyzer._value_to_pattern(value, point)
        for pattern in point.patterns:
            if pattern == new_pattern:
                pattern.count += 1
                pattern.data.append(value)
                break
        else:
            point.patterns.append(new_pattern)

    # go deeper into objects and arrays
    for pattern in point.patterns:
        if pattern.kind == metadata_analyzer._PatternKind.ARRAY:
            data = [v for data in pattern.data for v in data]
            child_point = metadata_analyzer._Point(parent=point, key=None, data=data)
            pattern.children.append(child_point)
            _process_point(child_point)
        elif pattern.kind == metadata_analyzer._PatternKind.OBJECT:
            if pattern.keys == {"XXX"}:
                data = [v for data in pattern.data for v in data.values()]
                child_point = metadata_analyzer._Point(parent=point, key="XXX", data=data)
                pattern.children.append(child_point)
                _process_point(child_point)
            else:
                for key in pattern.keys:
                    data = [v for data in pattern.data for k, v in data.items() if k == key]
                    child_point = metadata_analyzer._Point(parent=point, key=key, data=data)
                    pattern.children.append(child_point)
                    _process_point(child_point)


metadata_analyzer._process_point = _process_point

metadata = [document.meta for document in documents]
print(parse_json_object(metadata))

  1/1x [
      4454/4454x {
        'pmc_id': 4454/4454x non-empty string
        'annotations': 
          4454/4454x {
            'XXX': 
              12577/12577x [
                  23222/23222x {
                    'valid_reasoning': 23222/23222x number
                    'evidence_start': 23222/23222x [24249/24249x number]
                    'label_code': 23222/23222x number
                    'prompt_id': 23222/23222x non-empty string
                    'evidence_end': 23222/23222x [24249/24249x number]
                    'valid_label': 23222/23222x number
                    'label': 23222/23222x non-empty string
                    'pmc_id': 23222/23222x non-empty string
                    'annotations': 
                      23222/23222x [
(1): 24242/24249x non-empty string || (2): 7/24249x empty string
                      ]
                    'in_abstract': 23222/23222x [24249/24249x number]
                    'evidence_text': 
                      23222/23222

## Node Metadata Schema

In [29]:
metadata = [node.meta for document in documents for node in document.nodes]
print(parse_json_object(metadata))

  1/1x [
      316870/316870x {
        'is_evidence_for': 
          316870/316870x [
              23943/23943x {
                'prompt_id': 23943/23943x non-empty string
                'evidence_ix': 23943/23943x number
                'user_id': 23943/23943x non-empty string
              }
          ]
      }
  ]



## Prompts and Ground Truth

In [30]:
# assert validity of all annotations
num_invalid_label = 0
num_invalid_reasoning = 0
for document in documents:
    for annotations in document.meta["annotations"].values():
        for annotation in annotations:
            if not annotation["valid_label"]:
                num_invalid_label += 1
            if not annotation["valid_reasoning"]:
                num_invalid_reasoning += 1
print("num invalid label:", num_invalid_label)
print("num invalid reasoning", num_invalid_reasoning)

num invalid label: 0
num invalid reasoning 0


### Number of prompts per document

In [31]:
num_prompts = collections.Counter()
for document in documents:
    num_prompts[len(document.meta["prompts"])] += 1

print("Total number of prompts:", sum(x * y for x, y in num_prompts.items()))

pd.DataFrame(data={
    "# prompts": [x for x, _ in num_prompts.most_common()],
    "# documents with this many prompts": [x for _, x in num_prompts.most_common()]
}).sort_values("# prompts")

Total number of prompts: 12730


Unnamed: 0,# prompts,# documents with this many prompts
0,0,1118
6,1,353
3,2,589
2,3,608
4,4,587
1,5,647
5,6,421
7,7,70
8,8,16
9,9,15


### Number of annotations per prompt

In [32]:
num_annotations = collections.Counter()
for document in documents:
    for prompt_id in document.meta["prompts"].keys():
        if prompt_id not in document.meta["annotations"].keys():
            num_annotations[0] += 1
        else:
            annotations = document.meta["annotations"][prompt_id]
            num_annotations[len(annotations)] += 1

print("Total number of annotations:", sum(x * y for x, y in num_annotations.items()))

pd.DataFrame(data={
    "# annotations": [x for x, _ in num_prompts.most_common()],
    "# prompts with this many annotations": [x for _, x in num_prompts.most_common()]
}).sort_values("# annotations")

Total number of annotations: 23030


Unnamed: 0,# annotations,# prompts with this many annotations
0,0,1118
6,1,353
3,2,589
2,3,608
4,4,587
1,5,647
5,6,421
7,7,70
8,8,16
9,9,15


### Number of annotations per document

In [39]:
num_annotations = collections.Counter()
for document in documents:
    num_annotations[sum(len(annotations) for annotations in document.meta["annotations"].values())] += 1

print("Total number of annotations:", sum(x * y for x, y in num_annotations.items()))

pd.DataFrame(data={
    "# annotations": [x for x, _ in num_prompts.most_common()],
    "# documents with this many annotations": [x for _, x in num_prompts.most_common()]
}).sort_values("# annotations")

Total number of annotations: 23222


Unnamed: 0,# annotations,# documents with this many annotations
0,0,1118
6,1,353
3,2,589
2,3,608
4,4,587
1,5,647
5,6,421
7,7,70
8,8,16
9,9,15


### Inference Label Distribution

In [34]:
num_labels = collections.Counter()
for document in documents:
    for prompt_id, annotations in document.meta["annotations"].items():
        for annotation in annotations:
            num_labels[annotation["label"]] += 1

print(json.dumps(dict(num_labels), indent=4))

{
    "no significant difference": 10530,
    "significantly increased": 7141,
    "significantly decreased": 5551
}


### Number of evidence nodes per annotation

In [35]:
evidence_nodes = collections.Counter()
for document in documents:
    for prompt_id, annotations in document.meta["annotations"].items():
        for annotation in annotations:
            user_id = annotation["user_id"]
            assert (prompt_id, user_id) not in evidence_nodes.keys()
            evidence_nodes[(prompt_id, user_id)] = 0
    for node in document.nodes:
        for is_evidence_for in node.meta["is_evidence_for"]:
            evidence_nodes[(is_evidence_for["prompt_id"], is_evidence_for["user_id"])] += 1

num_evidence_nodes = collections.Counter(evidence_nodes.values())

histogram(num_evidence_nodes, 5, 1)

Unnamed: 0,num
0-0,979
1-1,21007
2-2,1056
3-3,118
4-4,34
≥5,28


### Number of evidence spans per annotation

In [36]:
num_evidence = collections.Counter()
for document in documents:
    for annotations in document.meta["annotations"].values():
        for annotation in annotations:
            num_evidence[len(annotation["evidence_text"])] += 1

pd.DataFrame(data={
    "# evidence spans": [count for count, _ in num_evidence.most_common()],
    "# answers with this many evidence spans": [count for _, count in num_evidence.most_common()]
}).sort_values("# evidence spans")

Unnamed: 0,# evidence spans,# answers with this many evidence spans
0,1,22457
1,2,591
2,3,125
3,4,30
4,5,11
5,6,4
9,7,1
6,8,1
7,9,1
8,12,1


### Number of evidence nodes per evidence span

In [37]:
num_nodes = collections.Counter()
missed_evidence_spans = collections.Counter()
for document in documents:
    for prompt_id, annotations in document.meta["annotations"].items():
        for annotation in annotations:
            counter = collections.Counter()
            for node in document.nodes:
                for is_evidence_for in node.meta["is_evidence_for"]:
                    if is_evidence_for["prompt_id"] == prompt_id and is_evidence_for["user_id"] == annotation["user_id"]:
                        counter[is_evidence_for["evidence_ix"]] += 1
            for ix in range(len(annotation["evidence_text"])):
                if counter[ix] == 0:
                    missed_evidence_spans[annotation["evidence_text"][ix]] += 1
                num_nodes[counter[ix]] += 1

pd.DataFrame(data={
    "# evidence nodes": [count for count, _ in num_nodes.most_common()],
    "# evidence spans with this many evidence nodes": [count for _, count in num_nodes.most_common()]
}).sort_values("# evidence nodes")

Unnamed: 0,# evidence nodes,# evidence spans with this many evidence nodes
1,0,1237
0,1,22313
2,2,649
3,3,35
4,4,3
5,5,2
11,6,1
8,8,1
9,9,1
12,11,1


### Most often-occurring missing evidence spans

In [38]:
pd.DataFrame(
    data={
        "count": [count for _, count in missed_evidence_spans.most_common(20)],
        "missing evidence text": [text for text, _ in missed_evidence_spans.most_common(20)],
    }
)

Unnamed: 0,count,missing evidence text
0,944,
1,3,The control group obtained means that were 24%...
2,3,s-controlled diabetes (above median baseline A...
3,2,Pi‐AUC\nmg*h/L\n0.68a [–0.68‐3.5]\n54.17b [37....
4,2,Improved acceptance of illness\n3.15 ± 0.51\n3...
5,2,Improved social well being\n2.69 ± 0.90\n3.46 ...
6,2,"Early apoptotic cells (annexin positive, PI ne..."
7,2,Infant hospital\n Baseline\n1.17 (3.26)\n12/41...
8,2,Number of fallers with injuries due to falls\n...
9,2,Pain index\n-3.31 (0.68)\n-4.90 (0.53)\n-5.65 ...
