In [1]:
import collections
import os
from io import StringIO

import pandas as pd
from intertext_graph.itgraph import IntertextDocument, Etype

import config

from histogram import histogram
from metadata_analyzer import parse_json_object

def mean(l):
    return sum(l) / len(l)

# ITG Dataset Exploration

In [2]:
# configuration
config.load_config_json_file("path_config_local.json", include_in_hash=False)

In [3]:
# load the dataset
documents = []
path = os.path.join(config.get("path.QASPER-ITG"), "shallow-train.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)
path = os.path.join(config.get("path.QASPER-ITG"), "shallow-dev.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)
path = os.path.join(config.get("path.QASPER-ITG"), "shallow-test.jsonl")
with open(path, "r", encoding="utf-8") as file:
    for document_json_str in file:
        with StringIO(document_json_str) as f:
            document = IntertextDocument.load_json(f)
            documents.append(document)

print(f"Loaded {len(documents)} documents.")

Loaded 1585 documents.


In [4]:
if False:  # filter for GovReport FastFact documents
    new_documents = []
    for document in documents:
        if "fastfact" in document.meta.keys() and document.meta["fastfact"] != []:
            new_documents.append(document)
    documents = new_documents
    print(f"Filtered to {len(documents)} FastFact Documents")

## Document Statistics

In [5]:
print(f"Number of documents: {len(documents)}")

Number of documents: 1585


## Node Statistics

In [6]:
print(f"Total number of nodes: {sum(len(document.nodes) for document in documents)}")

Total number of nodes: 121115


In [7]:
node_types = list(set(node.ntype for document in documents for node in document.nodes))
node_types_and_total = node_types + ["total"]
print("Node types:")
for node_type in node_types:
    print(f"- '{node_type}'")

Node types:
- 'title'
- 'abstract'
- 'figures-and-tables-section'
- 'figure-or-table'
- 'article-title'
- 'p'


### Number of nodes per document by type

In [8]:
by_type = collections.defaultdict()
for node_type in node_types:
    by_type[node_type] = [len([node for node in document.nodes if node.ntype == node_type]) for document in documents]
by_type["total"] = [len([node for node in document.nodes]) for document in documents]

pd.DataFrame(
    index=[f"'{node_type}'" for node_type in node_types] + ["total"],
    data={
        "min": [min(by_type[node_type]) for node_type in node_types_and_total],
        "avg": [mean(by_type[node_type]) for node_type in node_types_and_total],
        "max": [max(by_type[node_type]) for node_type in node_types_and_total],
        "total": [sum(by_type[node_type]) for node_type in node_types_and_total],
        "fraction": [sum(by_type[node_type]) / sum(by_type["total"]) for node_type in node_types_and_total]
    }
)

Unnamed: 0,min,avg,max,total,fraction
'title',0,13.822713,49,21909,0.180894
'abstract',1,1.0,1,1585,0.013087
'figures-and-tables-section',1,1.0,1,1585,0.013087
'figure-or-table',0,7.169716,78,11364,0.093828
'article-title',1,1.0,1,1585,0.013087
'p',1,52.42082,645,83087,0.686017
total,4,76.413249,664,121115,1.0


### Histogram: Number of nodes per document for given type

In [9]:
include_node_types = ["title", "abstract"]

by_type = [len([node for node in document.nodes if node.ntype in include_node_types]) for document in documents]
values = dict(collections.Counter(by_type))

histogram(values, 55, 5)

Unnamed: 0,num
0-4,1
5-9,272
10-14,598
15-19,440
20-24,164
25-29,61
30-34,23
35-39,18
40-44,2
45-49,5


In [10]:
include_node_types = ["p"]

by_type = [len([node for node in document.nodes if node.ntype in include_node_types]) for document in documents]
values = dict(collections.Counter(by_type))

histogram(values, 110, 10)

Unnamed: 0,num
0-9,14
10-19,84
20-29,239
30-39,266
40-49,276
50-59,247
60-69,171
70-79,126
80-89,52
90-99,25


### Number of nodes at depth by type

In [11]:
at_depth = collections.defaultdict(collections.Counter)
for document in documents:
    for node in document.nodes:
        at_depth[len(list(document.breadcrumbs(node, Etype.PARENT)))][node.ntype] += 1
        at_depth[len(list(document.breadcrumbs(node, Etype.PARENT)))]["total"] += 1
        at_depth["total"][node.ntype] += 1
        at_depth["total"]["total"] += 1

pd.DataFrame(
    index=[f"'{node_type}'" for node_type in node_types] + ["total"],
    columns=[depth for depth in at_depth.keys() if isinstance(depth, int)] + ["total"],
    data={depth: [values[node_type] for node_type in node_types_and_total] for depth, values in at_depth.items()}
)

Unnamed: 0,1,2,3,total
'title',0,21909,0,21909
'abstract',0,1585,0,1585
'figures-and-tables-section',0,1585,0,1585
'figure-or-table',0,0,11364,11364
'article-title',1585,0,0,1585
'p',0,0,83087,83087
total,1585,25079,94451,121115


## Text Statistics

In [12]:
print(f"Total number of characters: {sum(sum(len(node.content) for node in document.nodes) for document in documents)}")
print(f"Total number of whitespace-separated tokens: {sum(sum(len(node.content.split()) for node in document.nodes) for document in documents)}")

Total number of characters: 40848547
Total number of whitespace-separated tokens: 6270686


### Number of characters and whitespace-separated tokens per document

In [13]:
character_nums = collections.Counter([sum(len(node.content) for node in document.nodes) for document in documents])
token_nums = collections.Counter([sum(len(node.content.split()) for node in document.nodes) for document in documents])

N = 16000
print(f"Number of documents with more than {N} whitespace-separated tokens: {sum(y for x, y in token_nums.items() if x > N)}")

pd.DataFrame(
    index=["# characters per document", "# whitespace-separated tokens per document"],
    data={
        "min": [min(character_nums), min(token_nums)],
        "avg": [mean(character_nums), mean(token_nums)],
        "max": [max(character_nums), max(token_nums)],
        "total": [sum(character_nums, sum(token_nums))]
    }
)

Number of documents with more than 16000 whitespace-separated tokens: 6


Unnamed: 0,min,avg,max,total
# characters per document,1093,25813.711601,179912,45313654
# whitespace-separated tokens per document,161,4019.865836,27811,45313654


### Histogram: Number of whitespace-separated tokens per document

In [14]:
token_nums = collections.Counter([sum(len(node.content.split()) for node in document.nodes) for document in documents])
values = dict(collections.Counter(token_nums))

histogram(values, 11000, 1000)

Unnamed: 0,num
0-999,17
1000-1999,138
2000-2999,389
3000-3999,344
4000-4999,403
5000-5999,152
6000-6999,56
7000-7999,31
8000-8999,14
9000-9999,12


### Number of characters per node by node type

In [15]:
by_type = collections.defaultdict()
for node_type in node_types:
    by_type[node_type] = [len(node.content) for document in documents for node in document.nodes if node.ntype == node_type]
by_type["total"] = [len(node.content) for document in documents for node in document.nodes]

pd.DataFrame(
    index=[f"'{node_type}'" for node_type in node_types] + ["total"],
    data={
        "min": [min(by_type[node_type]) for node_type in node_types_and_total],
        "avg": [mean(by_type[node_type]) for node_type in node_types_and_total],
        "max": [max(by_type[node_type]) for node_type in node_types_and_total],
        "total": [sum(by_type[node_type]) for node_type in node_types_and_total]
    }
)

Unnamed: 0,min,avg,max,total
'title',0,30.019353,197,657694
'abstract',8,8.0,8,12680
'figures-and-tables-section',18,18.0,18,28530
'figure-or-table',6,156.950634,1623,1783587
'article-title',12,71.61388,156,113508
'p',0,460.391493,18951,38252548
total,0,337.270751,18951,40848547


### Number of whitespace-separated tokens per node by node type

In [16]:
by_type = collections.defaultdict()
for node_type in node_types:
    by_type[node_type] = [len(node.content.split()) for document in documents for node in document.nodes if node.ntype == node_type]
by_type["total"] = [len(node.content.split()) for document in documents for node in document.nodes]

pd.DataFrame(
    index=[f"'{node_type}'" for node_type in node_types] + ["total"],
    data={
        "min": [min(by_type[node_type]) for node_type in node_types_and_total],
        "avg": [mean(by_type[node_type]) for node_type in node_types_and_total],
        "max": [max(by_type[node_type]) for node_type in node_types_and_total],
        "total": [sum(by_type[node_type]) for node_type in node_types_and_total]
    }
)

Unnamed: 0,min,avg,max,total
'title',0,3.821124,33,83717
'abstract',1,1.0,1,1585
'figures-and-tables-section',3,3.0,3,4755
'figure-or-table',2,24.63138,270,279911
'article-title',1,9.076972,26,14387
'p',0,70.845391,3624,5886331
total,0,51.774644,3624,6270686


In [17]:
node_type = "title"

by_type = [len([node for node in document.nodes if node.ntype == node_type]) for document in documents]
values = dict(collections.Counter(by_type))

histogram(values, 100, 10)

Unnamed: 0,num
0-9,376
10-19,996
20-29,171
30-39,36
40-49,6
50-59,0
60-69,0
70-79,0
80-89,0
90-99,0


### Number of nodes with empty content by node type

In [18]:
by_type = collections.defaultdict()
for node_type in node_types:
    by_type[node_type] = len([node for document in documents for node in document.nodes if node.ntype == node_type and node.content == ""])
by_type["total"] = len([node for document in documents for node in document.nodes if node.content == ""])

by_type_all = collections.defaultdict()
for node_type in node_types:
    by_type_all[node_type] = len([node for document in documents for node in document.nodes if node.ntype == node_type])
by_type_all["total"] = len([node for document in documents for node in document.nodes])

pd.DataFrame(
    index=[f"'{node_type}'" for node_type in node_types] + ["total"],
    data={
        "num": [by_type[node_type] for node_type in node_types_and_total],
        "fraction": [by_type[node_type] / by_type_all[node_type] for node_type in node_types_and_total]
    }
)

Unnamed: 0,num,fraction
'title',20,0.000913
'abstract',0,0.0
'figures-and-tables-section',0,0.0
'figure-or-table',0,0.0
'article-title',0,0.0
'p',1537,0.018499
total,1557,0.012856


### Most often-occurring content (at depth)

In [19]:
node_type = "title"
max_depth = max(len(list(document.breadcrumbs(node, Etype.PARENT))) for document in documents for node in document.nodes if node.ntype == node_type)
depth = max_depth  # max_depth
with_breadcrumbs = True

if depth is None:
    counter = collections.Counter([node.content for document in documents for node in document.nodes if node.ntype == node_type])
else:
    counter = collections.Counter()
    for document in documents:
        for node in document.nodes:
            if node.ntype == node_type:
                breadcrumbs = list(document.breadcrumbs(node, Etype.PARENT))
                if len(breadcrumbs) == depth:
                    if with_breadcrumbs:
                        counter[" || ".join(breadcrumb.content for breadcrumb in breadcrumbs)] += 1
                    else:
                        counter[node.content] += 1

s = "" if depth is None else f" at depth {depth}"
print(f"Most often-occurring contents for nodes of type '{node_type}'{s}:")

pd.DataFrame(
    data={
        "count": [count for _, count in counter.most_common(10)],
        "text": [text for text, _ in counter.most_common(10)],
    }
)

Most often-occurring contents for nodes of type 'title' at depth 2:


Unnamed: 0,count,text
0,2,Subjectivity Detection || Basic tasks of senti...
1,1,Introduction || Minimally Supervised Learning ...
2,1,Related Work || Minimally Supervised Learning ...
3,1,Proposed Method || Minimally Supervised Learni...
4,1,Proposed Method ::: Polarity Function || Minim...
5,1,Proposed Method ::: Discourse Relation-Based E...
6,1,Proposed Method ::: Discourse Relation-Based E...
7,1,Proposed Method ::: Discourse Relation-Based E...
8,1,Proposed Method ::: Discourse Relation-Based E...
9,1,Proposed Method ::: Loss Functions || Minimall...


## Example Document

In [20]:
print(documents[0].to_plaintext())

Minimally Supervised Learning of Affective Events Using Discourse Relations
Abstract
Recognizing affective events that trigger positive or negative sentiment has a wide range of natural language processing applications but remains a challenging problem mainly because the polarity of an event is not necessarily predictable from its constituent words. In this paper, we propose to propagate affective polarity using discourse relations. Our method is simple and only requires a very small seed lexicon and a large raw corpus. Our experiments using Japanese data show that our method learns affective events effectively without manually labeled data. It also improves supervised learning results when labeled data are small.
Introduction
Affective events BIBREF0 are events that typically affect people in positive or negative ways. For example, getting money and playing sports are usually positive to the experiencers; catching cold and losing one's wallet are negative. Understanding affective even

## Metadata

### Document metadata schema

In [21]:
metadata = [document.meta for document in documents]
print(parse_json_object(metadata))

  1/1x [
      1585/1585x {
        'arxiv_id': 1585/1585x non-empty string
        'ix_counter': 1585/1585x number
        'qas': 
          1585/1585x [
              5049/5049x {
                'paper_read': (1): 3725/5049x non-empty string || (2): 1324/5049x empty string
                'topic_background': (1): 3725/5049x non-empty string || (2): 1324/5049x empty string
                'nlp_background': (1): 3876/5049x non-empty string || (2): 1173/5049x empty string
                'question': 5049/5049x non-empty string
                'search_query': (1): 3498/5049x empty string || (2): 1551/5049x non-empty string
                'question_id': 5049/5049x non-empty string
                'question_writer': 5049/5049x non-empty string
                'answers': 
                  5049/5049x [
                      7993/7993x {
                        'answer': 
                          7993/7993x {
                            'highlighted_evidence': 7993/7993x [10706/10706x non

### Node metadata schema

In [22]:
metadata = [node.meta for document in documents for node in document.nodes]
print(parse_json_object(metadata))

  1/1x [
      (1):
      109751/121115x {
        'is_evidence_for': 
          109751/109751x [
              11663/11663x {
                'evidence_ix': 11663/11663x number
                'annotation_id': 11663/11663x non-empty string
              }
          ]
      }

      (2):
      11364/121115x {
        'is_evidence_for': 
          11364/11364x [
              1092/1092x {
                'evidence_ix': 1092/1092x number
                'annotation_id': 1092/1092x non-empty string
              }
          ]
        'file': 11364/11364x non-empty string
      }
  ]



# For QASPER shallow: Get number of documents with ':::' in section titles

In [12]:
assert 'QASPER' in path and 'shallow' in path
n_with_colons = 0
for doc in documents:
    for node in doc.nodes:
        if ':::' in node.content:
            n_with_colons += 1
            break


print(f'{n_with_colons} of {len(documents)} in total.')

0 of 1585 in total.


In [11]:
':::' in documents[0].nodes[16].content

True