In [22]:
import collections
import difflib
import json
import os
from io import StringIO

import pandas as pd
from intertext_graph.itgraph import IntertextDocument

import config
from histogram import histogram
from metadata_analyzer import parse_json_object

def mean(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Args:
        l: Any iterable of numbers. Iterables that do not expose a length
            (for example generators) are materialized into a list first.

    Returns:
        ``sum(l) / len(l)``.

    Raises:
        ZeroDivisionError: If ``l`` is empty.
    """
    # Only copy when necessary: sized containers are summed in place.
    values = l if hasattr(l, "__len__") else list(l)
    return sum(values) / len(values)

# QASPER Metadata Exploration

In [2]:
# configuration
# Load the path configuration from a JSON file that is addressed relative to the
# current working directory (one level up).
# NOTE(review): presumably this file defines the "path.QASPER-ITG" entry that is
# read when the dataset is loaded — confirm.
# NOTE(review): include_in_hash=False presumably keeps these machine-specific
# paths out of a configuration hash — confirm against the config module.
config.load_config_json_file("../path_config_local.json", include_in_hash=False)

In [3]:
# load the dataset
# The train, dev and test splits are read in this order and collected in a single
# list. Each split is a "<split>.jsonl" file in the directory configured under
# "path.QASPER-ITG", holding one serialized IntertextDocument per line.
documents = []
for split in ("train", "dev", "test"):
    path = os.path.join(config.get("path.QASPER-ITG"), f"{split}.jsonl")
    with open(path, "r", encoding="utf-8") as file:
        for document_json_str in file:
            # load_json is given a file-like object, so each line is wrapped in a StringIO
            with StringIO(document_json_str) as f:
                document = IntertextDocument.load_json(f)
                documents.append(document)
print(f"Loaded {len(documents)} documents.")

Loaded 1585 documents.


## Document Metadata Schema

In [4]:
# Collect the document-level metadata of every loaded document and print the
# summary that parse_json_object produces for it.
metadata = [doc.meta for doc in documents]
document_schema_summary = parse_json_object(metadata)
print(document_schema_summary)

  1/1x [
      1585/1585x {
        'qas': 
          1585/1585x [
              5049/5049x {
                'topic_background': (1): 3725/5049x non-empty string || (2): 1324/5049x empty string
                'question_id': 5049/5049x non-empty string
                'question': 5049/5049x non-empty string
                'search_query': (1): 3498/5049x empty string || (2): 1551/5049x non-empty string
                'question_writer': 5049/5049x non-empty string
                'nlp_background': (1): 3876/5049x non-empty string || (2): 1173/5049x empty string
                'paper_read': (1): 3725/5049x non-empty string || (2): 1324/5049x empty string
                'answers': 
                  5049/5049x [
                      7993/7993x {
                        'answer': 
                          7993/7993x {
                            'evidence': 7993/7993x [12761/12761x non-empty string]
                            'yes_no': (1): 6883/7993x null || (2): 1110/7993x number


## Node Metadata Schema

In [5]:
# Collect the metadata of every node of every document in one flat list and print
# the summary that parse_json_object produces for it.
metadata = [node.meta for doc in documents for node in doc.nodes]
node_schema_summary = parse_json_object(metadata)
print(node_schema_summary)

  1/1x [
      (1):
      109751/121115x {
        'is_evidence_for': 
          109751/109751x [
              11663/11663x {
                'evidence_ix': 11663/11663x number
                'annotation_id': 11663/11663x non-empty string
              }
          ]
      }

      (2):
      11364/121115x {
        'is_evidence_for': 
          11364/11364x [
              1092/1092x {
                'evidence_ix': 1092/1092x number
                'annotation_id': 1092/1092x non-empty string
              }
          ]
        'file': 11364/11364x non-empty string
      }
  ]



## Prompts and Ground Truth

### Number of questions per document

In [6]:
# Tally, for each question count, how many documents have that many questions.
num_questions = collections.Counter(len(doc.meta["qas"]) for doc in documents)

# One row per distinct question count, shown in ascending order of that count.
pd.DataFrame(
    num_questions.most_common(),
    columns=["# questions", "# documents with this many questions"],
).sort_values("# questions")

Unnamed: 0,# questions,# documents with this many questions
3,1,278
2,2,320
0,3,371
1,4,325
4,5,149
5,6,81
6,7,28
7,8,19
8,9,7
9,10,3


### Number of answers per question

In [7]:
# Tally, for each answer count, how many questions have that many answers.
num_answers = collections.Counter(
    len(qas["answers"])
    for doc in documents
    for qas in doc.meta["qas"]
)

# One row per distinct answer count, shown in ascending order of that count.
pd.DataFrame(
    num_answers.most_common(),
    columns=["# answers", "# questions with this many answers"],
).sort_values("# answers")

Unnamed: 0,# answers,# questions with this many answers
0,1,2796
1,2,1648
2,3,531
3,4,65
4,5,6
5,6,3


### Question length

In [8]:
# Measure every question twice: in characters and in whitespace-separated tokens.
char_lengths, token_lengths = [], []
for document in documents:
    for qas in document.meta["qas"]:
        char_lengths.append(len(qas["question"]))
        token_lengths.append(len(qas["question"].split()))

# One column per statistic, one row per unit of measurement.
pd.DataFrame(
    {
        stat_name: [stat(char_lengths), stat(token_lengths)]
        for stat_name, stat in (("min", min), ("avg", mean), ("max", max), ("total", sum))
    },
    index=["# characters", "# whitespace-separated tokens"],
)

Unnamed: 0,min,avg,max,total
# characters,4,50.497326,176,254961
# whitespace-separated tokens,1,8.139037,25,41094


### Answer Lengths

In [9]:
def _answer_text(answer):
    """Return the text whose token count represents the length of one answer.

    Args:
        answer: One entry of ``qas["answers"]``; its ``"answer"`` dict is read.

    Returns:
        The first applicable of: ``"Unanswerable"`` if the answer is marked
        unanswerable, the extractive spans joined with ``", "``, the free-form
        answer, or ``"Yes"`` / ``"No"`` for a yes/no answer.

    Raises:
        ValueError: If none of the answer fields is set, so there is no text
            to measure.
    """
    fields = answer["answer"]
    if fields["unanswerable"]:
        return "Unanswerable"
    if fields["extractive_spans"]:
        return ", ".join(fields["extractive_spans"])
    if fields["free_form_answer"]:
        return fields["free_form_answer"]
    if fields["yes_no"] is not None:
        return "Yes" if fields["yes_no"] else "No"
    raise ValueError(
        f"Answer {answer.get('annotation_id')!r} has no unanswerable flag, "
        "extractive span, free-form answer or yes/no value."
    )


# Count how many answers have each length in whitespace-separated tokens.
answer_lengths = collections.Counter()
for document in documents:
    for qas in document.meta["qas"]:
        for answer in qas["answers"]:
            text = _answer_text(answer)
            answer_lengths[len(text.split())] += 1

# NOTE(review): judging by the rendered table (bins 0-4 ... 45-49), 50 looks like
# the exclusive upper bound and 5 like the bin width — confirm against histogram().
histogram(answer_lengths, 50, 5)

Unnamed: 0,num
0-4,3671
5-9,1335
10-14,900
15-19,665
20-24,442
25-29,268
30-34,192
35-39,111
40-44,93
45-49,71


### Answer Types

In [23]:
# Classify every answer by the first field that is set, checked in this priority
# order: unanswerable, extractive_spans, free_form_answer, yes_no.
answer_types = collections.Counter()
for doc in documents:
    for qas in doc.meta["qas"]:
        for annotation in qas["answers"]:
            fields = annotation["answer"]
            for answer_type in ("unanswerable", "extractive_spans", "free_form_answer"):
                if fields[answer_type]:
                    answer_types[answer_type] += 1
                    break
            else:
                # Both True and False count as a yes/no answer; only None means "not set".
                if fields["yes_no"] is not None:
                    answer_types["yes_no"] += 1

print(json.dumps(answer_types, indent=4))

{
    "free_form_answer": 1931,
    "extractive_spans": 4142,
    "yes_no": 1110,
    "unanswerable": 810
}


### Most frequent questions

In [10]:
# Count identical question strings across all documents.
counter = collections.Counter()
for doc in documents:
    counter.update(qas["question"] for qas in doc.meta["qas"])

# The ten most frequent questions, most frequent first.
pd.DataFrame(
    [(count, text) for text, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,36,Do they report results only on English data?
1,18,Which dataset do they use?
2,14,What is the size of the dataset?
3,13,what was the baseline?
4,12,What was the baseline?
5,12,What datasets are used?
6,12,What are the baselines?
7,10,Do they evaluate only on English datasets?
8,10,What dataset do they use?
9,10,what are the baselines?


### Most frequent 'yes_no' values

In [11]:
# Count the values of the 'yes_no' field over all answers.
counter = collections.Counter(
    annotation["answer"]["yes_no"]
    for doc in documents
    for qas in doc.meta["qas"]
    for annotation in qas["answers"]
)

# Up to ten most frequent values, most frequent first.
pd.DataFrame(
    [(count, value) for value, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,6883,
1,611,True
2,499,False


### Most frequent 'free_form_answer' values

In [12]:
# Count the values of the 'free_form_answer' field over all answers.
counter = collections.Counter(
    annotation["answer"]["free_form_answer"]
    for doc in documents
    for qas in doc.meta["qas"]
    for annotation in qas["answers"]
)

# The ten most frequent values, most frequent first.
pd.DataFrame(
    [(count, value) for value, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,6062,
1,15,English
2,6,3
3,4,
4,3,1
5,3,2
6,3,12
7,3,SVM
8,3,5
9,2,one


### Most frequent 'unanswerable' values

In [13]:
# Count the values of the 'unanswerable' flag over all answers.
counter = collections.Counter(
    annotation["answer"]["unanswerable"]
    for doc in documents
    for qas in doc.meta["qas"]
    for annotation in qas["answers"]
)

# Up to ten most frequent values, most frequent first.
pd.DataFrame(
    [(count, value) for value, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,7183,False
1,810,True


### Most frequent 'extractive_spans' values

In [14]:
# Count the values of the 'extractive_spans' field over all answers. Each span
# list is converted to a tuple so that it can be used as a Counter key.
counter = collections.Counter(
    tuple(annotation["answer"]["extractive_spans"])
    for doc in documents
    for qas in doc.meta["qas"]
    for annotation in qas["answers"]
)

# The ten most frequent span tuples, most frequent first.
pd.DataFrame(
    [(count, spans) for spans, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,3851,()
1,23,"(English,)"
2,9,"(LSTM,)"
3,7,"(Twitter,)"
4,7,"(Amazon Mechanical Turk,)"
5,6,"(accuracy,)"
6,6,"(English ,)"
7,5,"(BERT,)"
8,4,"(three,)"
9,4,"(Peng and Dredze peng-dredze:2016:P16-2,)"


### Number of evidence paragraphs per answer

In [15]:
# Tally, for each evidence count, how many answers cite that many evidence paragraphs.
num_evidence = collections.Counter(
    len(annotation["answer"]["evidence"])
    for doc in documents
    for qas in doc.meta["qas"]
    for annotation in qas["answers"]
)

# One row per distinct evidence count, shown in ascending order of that count.
pd.DataFrame(
    num_evidence.most_common(),
    columns=["# evidence paragraphs", "# answers with this many evidence paragraphs"],
).sort_values("# evidence paragraphs")

Unnamed: 0,# evidence paragraphs,# answers with this many evidence paragraphs
2,0,1023
0,1,4533
1,2,1332
3,3,420
4,4,253
5,5,140
6,6,85
7,7,57
8,8,49
9,9,36


### Number of evidence nodes per evidence paragraph

In [16]:
# For every evidence paragraph of every answer, count how many document nodes are
# linked to it. A node is linked through an entry in node.meta["is_evidence_for"]
# that carries the answer's annotation_id and the paragraph's position
# (evidence_ix) in the answer's evidence list.
#
# Results:
#   num_nodes                   number of linked nodes -> number of evidence paragraphs
#   missed_evidence_paragraphs  evidence text -> how often it has no linked node
#   missing                     (evidence text, all node contents of the document,
#                               annotation_id) for every paragraph without a linked node
num_nodes = collections.Counter()
missed_evidence_paragraphs = collections.Counter()
missing = []
for document in documents:
    # Index the document's nodes once (annotation_id -> evidence_ix -> node count)
    # instead of rescanning every node for every single answer.
    nodes_per_evidence = collections.defaultdict(collections.Counter)
    for node in document.nodes:
        for is_evidence_for in node.meta["is_evidence_for"]:
            nodes_per_evidence[is_evidence_for["annotation_id"]][is_evidence_for["evidence_ix"]] += 1

    for qas in document.meta["qas"]:
        for answer in qas["answers"]:
            # An answer without any linked node gets an empty Counter (all counts are 0).
            counter = nodes_per_evidence.get(answer["annotation_id"], collections.Counter())
            for ix, evidence_paragraph in enumerate(answer["answer"]["evidence"]):
                if counter[ix] == 0:
                    missed_evidence_paragraphs[evidence_paragraph] += 1
                    missing.append((
                        evidence_paragraph,
                        [node.content for node in document.nodes],
                        answer["annotation_id"]
                    ))
                num_nodes[counter[ix]] += 1

pd.DataFrame(data={
    "# evidence nodes": [count for count, _  in num_nodes.most_common()],
    "# evidence paragraphs with this many evidence nodes": [count for _, count in num_nodes.most_common()]
}).sort_values("# evidence nodes")

Unnamed: 0,# evidence nodes,# evidence paragraphs with this many evidence nodes
1,0,53
0,1,12678
2,2,20
3,3,6
4,4,3
5,7,1


### Most frequent missing evidence paragraphs

In [17]:
# The twenty evidence paragraphs that most often have no linked node, most frequent first.
pd.DataFrame(
    [(count, text) for text, count in missed_evidence_paragraphs.most_common(20)],
    columns=["count", "missing evidence text"],
)

Unnamed: 0,count,missing evidence text
0,4,"$$\textsc {Rel}(q) = \cos (v(q),v(D_{in}))$$ (..."
1,4,$$\textsc {AvgLM}(q) = \frac{\textsc {Lm}(q)}{...
2,3,"$$ NST(\tilde{E},N,K) = \frac{1}{N \vert \tild..."
3,2,FLOAT SELECTED: Table 1: Average ROUGE-2 Score...
4,2,"$$freq(*, word) = freq(word, *) = freq(word)$$..."
5,1,FLOAT SELECTED: Table 2: Comparison of the pro...
6,1,$$\begin{split} g_t &= {\rm sigmoid}(W_gx_t+b_...
7,1,"$$\begin{split} o_t &= {\rm BiLSTM}(o_{t-1}, [..."
8,1,"$$\begin{split} p_t &= {\rm LSTM}(p_{t-1}, c_t..."
9,1,$${\rm P}(\textbf {a}|\mathbf {O}) = \prod _t ...


### Inspection of missing evidence nodes

In [18]:
# For every evidence paragraph without a linked node, print the paragraph together
# with the node contents that difflib considers close matches, followed by a blank line.
for evidence_paragraph, node_contents, annotation_id in missing:
    close_matches = difflib.get_close_matches(evidence_paragraph, node_contents)
    report_lines = [
        f"Annotation ID: '{annotation_id}'",
        f"Evidence Paragraph: '{evidence_paragraph}'",
        *(f"- '{node_content}'" for node_content in close_matches),
        "",
    ]
    print("\n".join(report_lines))

Annotation ID: '1384b1e2ddc8d8417896cb3664c4586037474138'
Evidence Paragraph: 'FLOAT SELECTED: Table 2: Comparison of the proposed ordinal regression neural network (ORNN) against Immediate-Threshold ordinal logistic regression (IT), All-Threshold ordinal logistic regression (AT), Least Absolute Deviation (LAD), multi-class logistic regression (MC), and the Human Trafficking Deep Network (HTDN) in terms of Mean Absolute Error (MAE), macro-averaged Mean Absolute Error (MAEM ), binary classification accuracy (Acc.) and weighted binary classification accuracy (Wt. Acc.). The results are averaged across 10-fold CV on Trafficking10k with naive standard errors in the parentheses. The best and second best results are highlighted.'
- 'Table 2: Comparison of the proposed ordinal regression neural network (ORNN) against Immediate-Threshold ordinal logistic regression (IT), All-Threshold ordinal logistic regression (AT), Least Absolute Deviation (LAD), multi-class logistic regression (MC), and th

## Title Patterns

In [19]:
# For every title node, record whether its content contains the ':::' marker.
title_has_marker = [
    ":::" in node.content
    for doc in documents
    for node in doc.nodes
    if node.ntype == "title"
]
yes = sum(title_has_marker)
no = len(title_has_marker) - yes

pd.DataFrame(
    {"count": [yes, no]},
    index=["# titles that contain ':::'", "# titles that do not contain ':::'"],
)

Unnamed: 0,count
# titles that contain ':::',6946
# titles that do not contain ':::',14963


### Most frequent titles

In [20]:
# Count identical title strings over the title nodes of all documents.
counter = collections.Counter()
for doc in documents:
    counter.update(node.content for node in doc.nodes if node.ntype == "title")

# The ten most frequent titles, most frequent first.
pd.DataFrame(
    [(count, text) for text, count in counter.most_common(10)],
    columns=["count", "text"],
)

Unnamed: 0,count,text
0,1499,Introduction
1,818,Conclusion
2,626,Related Work
3,376,Acknowledgments
4,332,Experiments
5,305,Results
6,252,Conclusions
7,212,Acknowledgements
8,165,Discussion
9,139,Conclusion and Future Work
