This notebook gives an overview of a specified Ryn Dataset Directory.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import pandas as pd
# `display` belongs to the public `IPython.display` API; importing it from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display

from dao.ryn.ryn_dir import RynDir

# Do not truncate long cell contents (e.g. full sentences) and render up to
# 100 rows before pandas switches to its truncated view.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

# Config

In [None]:
# Path of the Ryn dataset directory to inspect. Exactly one assignment should
# be active; the commented-out lines are alternative paths kept for quick
# switching between datasets.
# ryn_dataset_dir = 'data/ryn/irt.cde.cde.1.clean'
# ryn_dataset_dir = 'data/ryn/irt.cde.irt.1.clean'
# ryn_dataset_dir = 'data/ryn/irt.cde.irt.5.clean'
# ryn_dataset_dir = 'data/ryn/irt.cde.irt.15.clean'
# ryn_dataset_dir = 'data/ryn/irt.cde.irt.30.clean'
# ryn_dataset_dir = 'data/ryn/irt.fb.irt.1.clean'
ryn_dataset_dir = 'data/ryn/irt.fb.irt.5.clean'
# ryn_dataset_dir = 'data/ryn/irt.fb.irt.15.clean'
# ryn_dataset_dir = 'data/ryn/irt.fb.irt.30.clean'
# ryn_dataset_dir = 'data/ryn/irt.fb.owe.1.clean'

# Check Ryn Directory

Check that the Ryn Directory exists and that all required
files exist.

In [None]:
# Wrap the configured dataset path in a RynDir and run its check before any
# of the cells below try to load files from it.
ryn_dir = RynDir(
    'Ryn Directory',
    Path(ryn_dataset_dir),
)
ryn_dir.check()

# Uncomment to print the directory object itself:
# print(ryn_dir)

## 1 Ryn Split Directory

### 1.1 Entity Labels

In [None]:
# Load the RID -> label mapping for entities; the triple and sentence cells
# below use it to show labels instead of raw IDs.
ent_to_label = ryn_dir.split_dir.entity_labels_txt.load_rid_to_label()

print(f'{len(ent_to_label)} entities')
display(ent_to_label)

### 1.2 Relation Labels

In [None]:
# Load the RID -> label mapping for relations; the triple cells below use it
# to show labels instead of raw IDs.
rel_to_label = ryn_dir.split_dir.relation_labels_txt.load_rid_to_label()

print(f'{len(rel_to_label)} relations')
display(rel_to_label)

### 1.3 CW Train Triples

In [None]:
# Load the closed-world training triples and show them with entity/relation
# labels substituted for the raw IDs.
cw_train_triples = ryn_dir.split_dir.cw_train_triples_txt.load_triples()

df_cols = ['head', 'rel', 'tail']
df_data = [
    (ent_to_label[head_id], rel_to_label[rel_id], ent_to_label[tail_id])
    for head_id, rel_id, tail_id in cw_train_triples
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)

### 1.4 CW Valid Triples

In [None]:
# Load the closed-world validation triples and show them with entity/relation
# labels substituted for the raw IDs.
cw_valid_triples = ryn_dir.split_dir.cw_valid_triples_txt.load_triples()

df_cols = ['head', 'rel', 'tail']
df_data = [
    (ent_to_label[head_id], rel_to_label[rel_id], ent_to_label[tail_id])
    for head_id, rel_id, tail_id in cw_valid_triples
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)

### 1.5 OW Valid Triples

In [None]:
# Load the open-world validation triples and show them with entity/relation
# labels substituted for the raw IDs.
ow_valid_triples = ryn_dir.split_dir.ow_valid_triples_txt.load_triples()

df_cols = ['head', 'rel', 'tail']
df_data = [
    (ent_to_label[head_id], rel_to_label[rel_id], ent_to_label[tail_id])
    for head_id, rel_id, tail_id in ow_valid_triples
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)

### 1.6 OW Test Triples

In [None]:
# Load the open-world *test* triples and show them with entity/relation labels
# substituted for the raw IDs.
# This cell used to read `ow_valid_triples_txt`, so it displayed the
# validation split a second time under the "OW Test Triples" heading.
# NOTE(review): `ow_test_triples_txt` follows the naming of the sibling split
# files (cw_train/cw_valid/ow_valid) -- confirm the attribute name on RynDir.
ow_test_triples = ryn_dir.split_dir.ow_test_triples_txt.load_triples()

df_cols = ['head', 'rel', 'tail']
df_data = [(ent_to_label[head], rel_to_label[rel], ent_to_label[tail])
           for head, rel, tail in ow_test_triples]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(data=df_data, columns=df_cols)

## 2 Ryn Text Directory

### 2.1 CW Train Sentences

In [None]:
# Load the closed-world training sentences (entity -> sentences) and flatten
# them into one (entity label, sentence) row per sentence.
cw_train_sents = ryn_dir.text_dir.cw_train_sentences_txt.load_ent_to_sentences()

df_cols = ['entity', 'sentence']
df_data = [
    (ent_to_label[ent_id], sentence)
    for ent_id, ent_sentences in cw_train_sents.items()
    for sentence in ent_sentences
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)

### 2.2 OW Valid Sentences

In [None]:
# Load the open-world validation sentences (entity -> sentences) and flatten
# them into one (entity label, sentence) row per sentence.
ow_valid_sents = ryn_dir.text_dir.ow_valid_sentences_txt.load_ent_to_sentences()

df_cols = ['entity', 'sentence']
df_data = [
    (ent_to_label[ent_id], sentence)
    for ent_id, ent_sentences in ow_valid_sents.items()
    for sentence in ent_sentences
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)

### 2.3 OW Test Sentences

In [None]:
# Load the open-world test sentences (entity -> sentences) and flatten them
# into one (entity label, sentence) row per sentence.
ow_test_sents = ryn_dir.text_dir.ow_test_sentences_txt.load_ent_to_sentences()

df_cols = ['entity', 'sentence']
df_data = [
    (ent_to_label[ent_id], sentence)
    for ent_id, ent_sentences in ow_test_sents.items()
    for sentence in ent_sentences
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(df_data, columns=df_cols)