Feb 28

It turned out that the validation results on the current
ower-fb-3 dataset are not useful, because all the ground truth
classes are false for the first 300 entities.

The script for building the OWER dataset needs to be
examined and corrected. As a rough estimate, each of the most
frequent classes should be true for at least every 100th entity.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from pathlib import Path
from collections import defaultdict
from typing import List, Tuple, Dict, Set
from dao.ower.ower_triples_db import DbTriple

from dao.classes_tsv import ClassesTsv
from dao.ower.ower_dir import OwerDir
from dao.ryn.ryn_dir import RynDir

# Notebook display settings: do not truncate long cell contents (None = no
# width limit) and render up to 100 rows when a DataFrame is displayed.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

# Config

In [None]:
# --- Input: Ryn dataset directory -------------------------------------------
# Exactly one directory is active. Other known candidates (assign one of them
# to `ryn_dataset_dir` to switch the input):
#   'data/ryn/irt.cde.cde.1.clean'
#   'data/ryn/irt.cde.irt.1.clean'
#   'data/ryn/irt.cde.irt.5.clean'
#   'data/ryn/irt.cde.irt.15.clean'
#   'data/ryn/irt.cde.irt.30.clean'
#   'data/ryn/irt.fb.irt.1.clean'
#   'data/ryn/irt.fb.irt.15.clean'
#   'data/ryn/irt.fb.irt.30.clean'
#   'data/ryn/irt.fb.owe.1.clean'
ryn_dataset_dir = 'data/ryn/irt.fb.irt.5.clean'

# --- Input: TSV file listing the classes to label entities with --------------
classes_tsv = 'data/classes-v1-mmaa.tsv'

# --- Output: OWER dataset -----------------------------------------------------
# Every sample keeps exactly `sent_count` sentences; entities with fewer
# sentences are dropped when the sample rows are built.
sent_count = 3
ower_dataset_dir = 'data/ower/ower-v3-fb-irt-3'

# 1 Check files

In [None]:
# Wrap the configured locations in their directory/file objects, then verify
# the two inputs and prepare the output.
#
# NOTE(review): `classes_tsv` is rebound below from the configured path string
# to a ClassesTsv object. Re-running this cell without re-running the config
# cell would therefore hand that object to Path() -- confirm this is acceptable.

# Input 1: the Ryn directory must exist
ryn_path = Path(ryn_dataset_dir)
ryn_dir = RynDir('Ryn Directory', ryn_path)
ryn_dir.check()

# Input 2: the classes TSV must exist
classes_tsv_path = Path(classes_tsv)
classes_tsv = ClassesTsv('Classes TSV', classes_tsv_path)
classes_tsv.check()

# Output: the OWER dataset directory is created unless it is already there
ower_path = Path(ower_dataset_dir)
ower_dir = OwerDir('OWER Directory', ower_path)
ower_dir.create()

# 2 Create OWER dataset

## 2.1 Load triples from Triples TXTs

In [None]:
# Read the four triple files of the Ryn split directory.
split_dir = ryn_dir.split_dir

cw_train_triples: List[Tuple[int, int, int]] = split_dir.cw_train_triples_txt.load_triples()
cw_valid_triples: List[Tuple[int, int, int]] = split_dir.cw_valid_triples_txt.load_triples()
ow_valid_triples: List[Tuple[int, int, int]] = split_dir.ow_valid_triples_txt.load_triples()
ow_test_triples: List[Tuple[int, int, int]] = split_dir.ow_test_triples_txt.load_triples()

# The OWER train split is the concatenation of both "cw" triple lists, while
# the "ow" valid/test lists are used unchanged for the OWER valid/test splits.
train_triples = [*cw_train_triples, *cw_valid_triples]
valid_triples, test_triples = ow_valid_triples, ow_test_triples

## 2.2 Save triples to Triples DBs

In [None]:
def _to_db_triples(triples):
    """Wrap the first three fields of every raw triple in a DbTriple."""
    return [DbTriple(raw[0], raw[1], raw[2]) for raw in triples]


# For each split: create the triples table, convert the raw triples and
# insert them into that split's triples DB.

# Train split
ower_dir.train_triples_db.create_triples_table()
train_db_triples = _to_db_triples(train_triples)
ower_dir.train_triples_db.insert_triples(train_db_triples)

# Valid split
ower_dir.valid_triples_db.create_triples_table()
valid_db_triples = _to_db_triples(valid_triples)
ower_dir.valid_triples_db.insert_triples(valid_db_triples)

# Test split
ower_dir.test_triples_db.create_triples_table()
test_db_triples = _to_db_triples(test_triples)
ower_dir.test_triples_db.insert_triples(test_db_triples)

## 2.3 Load entity sentences

In [None]:
# Load, per split, the mapping from entity id to that entity's sentences
# from the sentence files of the Ryn text directory.
EntToSentences = Dict[int, Set[str]]

text_dir = ryn_dir.text_dir

train_contexts: EntToSentences = text_dir.cw_train_sentences_txt.load_ent_to_sentences()
valid_contexts: EntToSentences = text_dir.ow_valid_sentences_txt.load_ent_to_sentences()
test_contexts: EntToSentences = text_dir.ow_test_sentences_txt.load_ent_to_sentences()

## 2.4 Query each entity's classes

In [None]:
# The classes to label entities with, as read from the classes TSV.
classes: List[Tuple[int, int]] = classes_tsv.read_classes()


def _query_class_members(triples_db):
    """Map every class to what `select_entities_with_class` returns for it."""
    class_to_entities = defaultdict(set)
    for class_ in classes:
        class_to_entities[class_] = triples_db.select_entities_with_class(class_)
    return class_to_entities


# One class -> entities mapping per split, each queried from that split's DB.
# NOTE(review): the cell that builds the TSV rows tests `ent in <mapping>[class_]`.
# That only yields 1 if select_entities_with_class returns plain entity ids;
# if it returns e.g. row tuples, every label would be 0 -- confirm.
train_class_to_entities = _query_class_members(ower_dir.train_triples_db)
valid_class_to_entities = _query_class_members(ower_dir.valid_triples_db)
test_class_to_entities = _query_class_members(ower_dir.test_triples_db)

# Sanity check: print, per split, each class with its number of entities.
# A blank line precedes every split.
for class_to_entities in (train_class_to_entities,
                          valid_class_to_entities,
                          test_class_to_entities):
    print()
    for class_, entities in class_to_entities.items():
        print(class_, len(entities))

## 2.5 Save OWER TSVs

In [None]:
def _build_tsv_rows(contexts, class_to_entities, classes, sent_count):
    """Build the sample rows of one split.

    Each row has the layout ``[ent, label_1, ..., label_n, sentences]``:
    one 0/1 label per entry of ``classes`` (1 if ``ent`` is contained in
    ``class_to_entities[class_]``), followed by a list holding exactly
    ``sent_count`` of the entity's sentences.

    Entities with fewer than ``sent_count`` sentences produce no row.

    :param contexts: mapping from entity to its collection of sentences
    :param class_to_entities: mapping from class to the entities of that class
    :param classes: the classes, in the order of the label columns
    :param sent_count: number of sentences every row must carry
    :return: list of rows, in the iteration order of ``contexts``
    """
    rows = []
    for ent, ent_sentences in contexts.items():
        # NOTE(review): the sentence collections are annotated as sets where
        # they are loaded. If so, which sentences end up in the slice depends
        # on set iteration order and may differ between kernel sessions --
        # confirm whether a deterministic choice is wanted.
        sentences = list(ent_sentences)[:sent_count]

        # Check the sentence count first, so that no labels are computed for
        # entities that are dropped anyway.
        if len(sentences) < sent_count:
            continue

        labels = [int(ent in class_to_entities[class_]) for class_ in classes]
        rows.append([ent, *labels, sentences])

    return rows


train_tsv_rows = _build_tsv_rows(train_contexts, train_class_to_entities, classes, sent_count)
valid_tsv_rows = _build_tsv_rows(valid_contexts, valid_class_to_entities, classes, sent_count)
test_tsv_rows = _build_tsv_rows(test_contexts, test_class_to_entities, classes, sent_count)

In [None]:
# Write the rows of each split to that split's samples TSV (train, valid, test).
for samples_tsv, tsv_rows in (
    (ower_dir.train_samples_tsv, train_tsv_rows),
    (ower_dir.valid_samples_tsv, valid_tsv_rows),
    (ower_dir.test_samples_tsv, test_tsv_rows),
):
    samples_tsv.write_samples_tsv(tsv_rows)

# 3 Check OWER TSVs

In [None]:
# Entity labels as loaded from the split directory's entity-labels file;
# used to show a label instead of the raw id in column 0.
ent_to_label = ryn_dir.split_dir.entity_labels_txt.load_rid_to_label()


def _ent_label(ent):
    """Look up the label for the raw column-0 value of a samples TSV."""
    return ent_to_label[int(ent)]


# read_csv applies the converter to every value of column 0.
converters = {0: _ent_label}

# Read the train samples back and display 100 rows in shuffled order.
train_samples_path = ower_dir.train_samples_tsv._path
df = pd.read_csv(train_samples_path, sep='\t', header=None, converters=converters)
df.sample(frac=1).head(100)

In [None]:
# Read the valid samples back and display 100 rows in shuffled order.
valid_samples_path = ower_dir.valid_samples_tsv._path
df = pd.read_csv(valid_samples_path, sep='\t', header=None, converters=converters)
df.sample(frac=1).head(100)

In [None]:
# Read the test samples back and display 100 rows in shuffled order.
test_samples_path = ower_dir.test_samples_tsv._path
df = pd.read_csv(test_samples_path, sep='\t', header=None, converters=converters)
df.sample(frac=1).head(100)