In [1]:
from pathlib import Path
import json

datadir = Path("data")
all_examples_file = datadir.joinpath("examples_to_clean.jsonl")

with all_examples_file.open("r", encoding="utf-8") as f:
    all_examples = [json.loads(line) for line in f]

In [2]:
all_examples

[{'text': 'Zesde Brief.',
  'labels': [],
  'source': 'Zesde Brief (met annotaties).docx'},
 {'text': 'Op de Bloksberg  des morgens van den 15 July 1816 om 4½ uur. –',
  'labels': [{'span': 'Bloksberg',
    'types': ['E53 Place', 'E19 Physical Thing']},
   {'span': 'morgens van den 15 July 1816 om 4½ uur',
    'types': ['E52 Time-Span']}],
  'source': 'Zesde Brief (met annotaties).docx'},
 {'text': 'Waardste Vriend!',
  'labels': [],
  'source': 'Zesde Brief (met annotaties).docx'},
 {'text': 'U te melden wat wy deeze dagen op onze reis door den Harz gezien hebben, u hiervan een flaauwe allerflaauwste schets te geven is volstrekt onmogelyk! ',
  'labels': [{'span': 'Harz', 'types': ['E53 Place']}],
  'source': 'Zesde Brief (met annotaties).docx'},
 {'text': 'Hoe zoude ik toch schriftelyk kunnen vermelden wat wy eenige honderd voet diep  in de mynen van Clauthal, en wat wy boven op den Bloksberg , meer dan 3400 voet  boven de oppervlakte van de Oostzee verheven gezien hebben; ',
  'labe

In [3]:
from collections import Counter

counts = Counter(t for ex in all_examples for lab in ex["labels"] for t in lab["types"])
counts.most_common()

[('E53 Place', 204),
 ('E19 Physical Thing', 115),
 ('E21 Person', 89),
 ('E54 Dimension', 40),
 ('E52 Time-Span', 25),
 ('time', 16),
 ('E86 Leaving', 9),
 ('E74 Group', 8),
 ('miscellaneous', 8),
 ('E9 Move', 5),
 ('mythological figure', 5),
 ('artwork', 4),
 ('lecture', 4),
 ('F2 Expression', 3),
 ('E 53 Place', 2),
 ('E21 Persons', 2),
 ('E18 Physical Thing', 1),
 ('P183 ends before the start of', 1),
 ('E31 Document', 1),
 ('E53', 1),
 ('Place', 1),
 ('number of students', 1),
 ('country', 1),
 ('categories of books', 1),
 ('document/lecture', 1),
 ('discipline/lecture', 1),
 ('E55 Type', 1),
 ('distance', 1),
 ('student culture', 1)]

In [4]:
# see Jan 30 mail

update_label_map = {
    "E 53 Place": "E53 Place",
    "E21 Persons": "E21 Person",
    "E18 Physical Thing": "E19 Physical Thing",
    "E53": "E53 Place",
    "Place": "E53 Place",
    "time": "E52 Time-Span",
    "mythological figure": "E21 Person",
    "number of students": "E54 Dimension",
    "country": "E53 Place",
    "categories of books": "E55 Type",
    "document/lecture": "E55 Type",
    "discipline/lecture": "E7 Activity",
    "distance": "E54 dimension",
}

In [5]:
def update_labels(example):
    labels = example["labels"]
    for lab in labels:
        new_types =  [
            update_label_map[t] if t in update_label_map else t for t in lab["types"]
        ]

        new_types = [t for t in new_types if t[0] in ("E", "F", "P")]


        lab["types"] = new_types
        
    example["labels"] = labels
    return example


updated_examples = [update_labels(example) for example in all_examples]
counts = Counter(
    t for ex in updated_examples for lab in ex["labels"] for t in lab["types"]
)

counts

Counter({'E53 Place': 209,
         'E19 Physical Thing': 116,
         'E21 Person': 96,
         'E52 Time-Span': 41,
         'E54 Dimension': 41,
         'E86 Leaving': 9,
         'E74 Group': 8,
         'E9 Move': 5,
         'F2 Expression': 3,
         'E55 Type': 3,
         'P183 ends before the start of': 1,
         'E31 Document': 1,
         'E7 Activity': 1,
         'E54 dimension': 1})

In [6]:
from collections import defaultdict

types_per_source = defaultdict(list)

for ex in updated_examples:
    types_per_source[ex["source"]] += [t for lab in ex["labels"] for t in lab["types"]]

for source, types in types_per_source.items():
    print(source)
    print(Counter(types).most_common())

Zesde Brief (met annotaties).docx
[('E53 Place', 83), ('E19 Physical Thing', 52), ('E21 Person', 29), ('E52 Time-Span', 27), ('E54 Dimension', 22), ('E86 Leaving', 6), ('E74 Group', 4), ('E9 Move', 4)]
Ontology-based Annotation.docx
[('E53 Place', 126), ('E21 Person', 67), ('E19 Physical Thing', 64), ('E54 Dimension', 19), ('E52 Time-Span', 14), ('E74 Group', 4), ('F2 Expression', 3), ('E55 Type', 3), ('E86 Leaving', 3), ('P183 ends before the start of', 1), ('E31 Document', 1), ('E7 Activity', 1), ('E54 dimension', 1), ('E9 Move', 1)]


In [None]:
with datadir.joinpath("examples_cleaned.jsonl").open("w", encoding="utf-8") as f:
    for ex in updated_examples:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")