# NELL995 dataset semantically enriched

In [2]:
from glob import glob
from tqdm import tqdm
from pickle import dump, load, HIGHEST_PROTOCOL
from os import makedirs
from os.path import exists
from urllib.request import urlopen
from bz2 import open as bz2open
from shutil import copy as copy_file
import gzip

try:
    from rdflib import Graph
except:
    !pip install rdflib

In [3]:
draft_folder=f"../draft/NELL-995+"
destination_folder=f"."

# http://rtw.ml.cmu.edu/resources/results/08m/NELL.08m.995.esv.csv.gz
# http://rtw.ml.cmu.edu/resources/results/08m/NELL.08m.995.ontology.csv.gz

In [4]:
draft_folder=f"{draft_folder}{'' if draft_folder.endswith('/') else '/'}"
destination_folder=f"{destination_folder}{'' if destination_folder.endswith('/') else '/'}"

NELL995_LP_DATASET="https://raw.githubusercontent.com/otiliastr/coper/refs/heads/master/CoPER_ConvE/data/nell-995/"
NELL995_ARCHIVE_URL = "http://rtw.ml.cmu.edu/resources/results/08m/"

NELL_LP=f"{draft_folder}LP/"
NELL=f"{draft_folder}NELL/"

dest_txt_folder=destination_folder
dest_pkl_folder=f"{destination_folder}pickle/"

In [5]:
for folder in [
    draft_folder,
    dest_txt_folder,
    dest_pkl_folder,
    NELL_LP,
    NELL
]: 
    makedirs(folder, exist_ok=True)

In [6]:
for file in tqdm([
    "train.txt",
    "test.txt",
    "dev.txt"
]):
    file_uri=f"{NELL995_LP_DATASET}{file}"
    file_path = f"{NELL_LP}{file}"
    
    if exists(file_path):
        continue
        
    with urlopen(file_uri) as file_online:
        with open(file_path, "wb") as file_local:
            file_local.write(file_online.read())

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.62it/s]


In [7]:
with tqdm([
    "NELL.08m.995.esv.csv.gz",
    "NELL.08m.995.ontology.csv.gz"
]) as bar:
    for file in bar:
        file_uri=f"{NELL995_ARCHIVE_URL}{file}"
        file_path = f"{NELL}{file}"
        file_unpacked_path=file_path[:-3]
    
        bar.set_description(f"Downloading {file}")
        if not exists(file_path):
            with open(file_path, "wb") as file_local:
                with urlopen(file_uri) as file_online:
                    file_local.write(file_online.read())
    
        bar.set_description(f"Unpacking {file}")
        if not exists(file_unpacked_path):
            with open(file_unpacked_path, "wb") as file_unpacked:
                with gzip.open(file_path, 'rb') as file_local:
                    file_unpacked.write(file_local.read())

Unpacking NELL.08m.995.ontology.csv.gz: 100%|████████████████████████████████████████████| 2/2 [01:05<00:00, 32.61s/it]


# ent2id, rel2id, class2id

In [9]:
rels = set([])
ents = set([])

for file in [
    "train.txt",
    "test.txt",
    "dev.txt"
]:
    file_path=f"{NELL_LP}{file}"
    nb_lines=sum(1 for _ in open(file_path, "r", encoding="utf-8"))
    with open(file_path, "r") as split:
        with tqdm(enumerate(split), total=nb_lines) as bar:
            bar.set_description(f"Extracting entities and relations from {file}")
            for i, line in bar:
                s, p, o = line.strip().split("\t")
    
                splitted_s = s.split("_")
                splitted_o = o.split("_")
    
                if len(splitted_s) > 1:
                    s = f"{splitted_s[0]}:{splitted_s[1]}:{'_'.join(splitted_s[2:])}"
    
                if len(splitted_o) > 1:
                    o = f"{splitted_o[0]}:{splitted_o[1]}:{'_'.join(splitted_o[2:])}"
    
                ents.add(s)
                ents.add(o)
                rels.add(p)

Extracting entities and relations from train.txt: 100%|████████████████████| 149678/149678 [00:00<00:00, 211918.23it/s]
Extracting entities and relations from test.txt: 100%|█████████████████████████| 3992/3992 [00:00<00:00, 255316.58it/s]
Extracting entities and relations from dev.txt: 100%|████████████████████████████████████████| 543/543 [00:00<?, ?it/s]


In [10]:
len(list(ents)), len(list(rels))

(75492, 200)

In [11]:
rel2id = {rel: i for i, rel in enumerate(list(rels))}
id2rel = {i: rel for rel, i in rel2id.items()}

with open(f"{dest_pkl_folder}rel2id.pkl", "wb") as handle:
    dump(rel2id, handle)

ent2id = {ent: i for i, ent in enumerate(list(ents))}
id2ent = {i: ent for ent, i in ent2id.items()}

with open(f"{dest_pkl_folder}ent2id.pkl", "wb") as handle:
    dump(ent2id, handle)

In [12]:
{k: v for k, v in list(ent2id.items())[:10]}

{'concept:musicgenre:west_coast': 0,
 'concept:personeurope:st___columba': 1,
 'concept:hospital:kantonsspital': 2,
 'concept:coach:randy_jones': 3,
 'concept:fish:large_fish': 4,
 'concept:female:jennifer_connelly': 5,
 'concept:musicartist:matchbox_20': 6,
 'concept:bakedgood:shortbread_cookies': 7,
 'concept:book:the_empress_file': 8,
 '40.6998250000000,-74.0140300000000': 9}

In [13]:
{k: v for k, v in list(rel2id.items())[:10]}

{'concept:drugworkedonbyagent': 0,
 'concept:plantrepresentemotion': 1,
 'concept:agriculturalproductincludingagriculturalproduct': 2,
 'concept:itemfoundinroom': 3,
 'concept:competeswith': 4,
 'concept:agriculturalproductcomingfromvertebrate': 5,
 'concept:teamplayssport': 6,
 'concept:automakerproducesmodel': 7,
 'concept:agentcollaborateswithagent': 8,
 'concept:arthropodandotherarthropod': 9}

### Encode train, test, valid splits

In [15]:
inverse_predicate_offset = max(rel2id.values()) + 1
for split in [("train", "train"), ("test", "test"), ("valid", "dev")]:
    dest_split, nell_split = split
    nb_lines=sum(1 for _ in open(f"{NELL_LP}{nell_split}.txt", "r", encoding="utf-8"))
    with open(f"{NELL_LP}{nell_split}.txt", "r", encoding='utf-8') as r:
        with open(f"{dest_txt_folder}{dest_split}2id.txt", "w+", encoding='utf-8') as w:
            with tqdm(enumerate(r), total=nb_lines) as bar:
                bar.set_description(f"Encoding {dest_split}2id.txt")
                for i, line in bar:
                    s, p, o = line.strip().split("\t")

                    splitted_s = s.split("_")
                    splitted_o = o.split("_")
                    
                    if len(splitted_s) > 1:
                        s = f"{splitted_s[0]}:{splitted_s[1]}:{'_'.join(splitted_s[2:])}"
        
                    if len(splitted_o) > 1:
                        o = f"{splitted_o[0]}:{splitted_o[1]}:{'_'.join(splitted_o[2:])}"
                    
                    encoded_s = ent2id[s]
                    encoded_p = rel2id[p]
                    encoded_o = ent2id[o]
                    
                    w.write(f"{encoded_s}\t{encoded_p}\t{encoded_o}\n")   
                    
        copy_file(f"{dest_txt_folder}{dest_split}2id.txt", f"{dest_txt_folder}{dest_split}2id_inv.txt")
        with open(f"{dest_txt_folder}{dest_split}2id.txt", "r", encoding='utf-8') as r:
            with open(f"{dest_txt_folder}{dest_split}2id_inv.txt", "a", encoding='utf-8') as a:
                with tqdm(enumerate(r), total=nb_lines) as bar:
                    bar.set_description(f"Encoding {dest_split}2id_inv.txt")
                    for i, line in bar:
                        s, p, o = line.strip().split("\t")
                        p = int(p)
                        a.write(f"{o}\t{p+inverse_predicate_offset}\t{s}\n")

Encoding train2id.txt: 100%|███████████████████████████████████████████████| 149678/149678 [00:01<00:00, 117703.65it/s]
Encoding train2id_inv.txt: 100%|███████████████████████████████████████████| 149678/149678 [00:00<00:00, 250694.44it/s]
Encoding test2id.txt: 100%|█████████████████████████████████████████████████████| 3992/3992 [00:00<00:00, 84249.92it/s]
Encoding test2id_inv.txt: 100%|████████████████████████████████████████████████| 3992/3992 [00:00<00:00, 255425.64it/s]
Encoding valid2id.txt: 100%|██████████████████████████████████████████████████████| 543/543 [00:00<00:00, 34784.38it/s]
Encoding valid2id_inv.txt: 100%|█████████████████████████████████████████████████████████████| 543/543 [00:00<?, ?it/s]


# observed_heads_original_kg, observed_tails_original_kg, observed_heads_inv, observed_tails_inv

In [17]:
observed_heads_original_kg={}
observed_heads_inv={}
observed_tails_original_kg={}
observed_tails_inv={}

def observe(d, a, b, c):
    if not a in d.keys():
        d[a]={}

    if not b in d[a].keys():
        d[a][b]=[c]
    else:
        d[a][b].append(c)


for split in ["train", "test", "valid"]:
    nb_lines=sum(1 for _ in open(f"{dest_txt_folder}{split}2id.txt", "r", encoding="utf-8"))
    with open(f"{dest_txt_folder}{split}2id.txt", "r", encoding='utf-8') as r:
        with tqdm(enumerate(r), total=nb_lines) as bar:
            bar.set_description(f"Indexing triples from {split}2id.txt")
            for i, line in bar:
                s, p, o = line.strip().split("\t")
                s, p, o = int(s), int(p), int(o)

                observe(observed_tails_original_kg, s, p, o)
                observe(observed_tails_inv, s, p, o)
                observe(observed_tails_inv, o, p + inverse_predicate_offset, s)

                observe(observed_heads_original_kg, o, p, s)
                observe(observed_heads_inv, o, p, s)
                observe(observed_heads_inv, s, p + inverse_predicate_offset, o)

Indexing triples from train2id.txt: 100%|███████████████████████████████████| 149678/149678 [00:02<00:00, 59182.39it/s]
Indexing triples from test2id.txt: 100%|████████████████████████████████████████| 3992/3992 [00:00<00:00, 63825.25it/s]
Indexing triples from valid2id.txt: 100%|█████████████████████████████████████████| 543/543 [00:00<00:00, 34753.06it/s]


In [18]:
with open(f"{dest_pkl_folder}observed_heads_original_kg.pkl", "wb") as handle:
    dump(observed_heads_original_kg, handle)

with open(f"{dest_pkl_folder}observed_heads_inv.pkl", "wb") as handle:
    dump(observed_heads_inv, handle)

with open(f"{dest_pkl_folder}observed_tails_original_kg.pkl", "wb") as handle:
    dump(observed_tails_original_kg, handle)

with open(f"{dest_pkl_folder}observed_tails_inv.pkl", "wb") as handle:
    dump(observed_tails_inv, handle)

## Class2id

In [20]:
data_subject_class={}
data_object_class={}

nb_lines=sum(1 for _ in open(f"{draft_folder}NELL/NELL.08m.995.esv.csv", "r", encoding="utf-8"))
with open(f"{draft_folder}NELL/NELL.08m.995.esv.csv", "r", encoding='utf-8') as r:
    with tqdm(enumerate(r), total=nb_lines) as bar:
        for i, line in bar:
            if i==0:
                continue
            elmts=line.strip().split("\t")

            s=elmts[0]
            o=elmts[2]

            subject_types=[
                x.strip()
                for x in elmts[-3].split(" ")
                if len(x.strip()) > 0
                and not x.strip() == "concept:everypromotedthing"
            ]
            object_types=[
                x.strip()
                for x in elmts[-2].split(" ")
                if len(x.strip()) > 0
                and not x.strip() == "concept:everypromotedthing"
            ]

            if s in ent2id.keys() and len(subject_types) > 0:
                if not s in data_subject_class:
                    data_subject_class[s]=set([])
                for x in subject_types:
                    data_subject_class[s].add(x)

            if o in ent2id.keys() and len(object_types)> 0:
                if not o in data_object_class:
                    data_object_class[o]=set([])
                for x in object_types:
                    data_object_class[o].add(x)

100%|█████████████████████████████████████████████████████████████████████| 2664732/2664732 [00:29<00:00, 90260.24it/s]


In [21]:
inst_classes =set([
    sc
    for v in data_subject_class.values()
    for sc in v
]).union(set([
    sc
    for v in data_object_class.values()
    for sc in v
]))

In [22]:
len(inst_classes)

257

In [23]:
{k: v for k, v in list(data_subject_class.items())[:10]}

{'concept:geolocatablething:plaza_de_mayo': {'concept:geolocatablething'},
 'concept:geolocatablething:buildings': {'concept:geolocatablething'},
 'concept:televisionstation:kvhp': {'concept:televisionstation'},
 'concept:televisionstation:wyin': {'concept:televisionstation'},
 'concept:televisionstation:ktbs_tv': {'concept:televisionstation'},
 'concept:televisionstation:wyle': {'concept:televisionstation'},
 'concept:televisionstation:kvii': {'concept:televisionstation'},
 'concept:televisionstation:kvie': {'concept:televisionstation'},
 'concept:televisionstation:kviq': {'concept:televisionstation'},
 'concept:televisionstation:kvia': {'concept:televisionstation'}}

In [24]:
{k: v for k, v in list(data_object_class.items())[:10]}

{'concept:city:washington_d_c': {'concept:city'},
 'concept:university:unc_asheville': {'concept:university'},
 'concept:date:n2004': {'concept:date'},
 'concept:city:vegas': {'concept:city'},
 'concept:disease:problems': {'concept:disease'},
 'concept:disease:disorder': {'concept:disease'},
 'concept:disease:syndrome': {'concept:disease'},
 'concept:disease:allergies': {'concept:disease'},
 'concept:academicfield:health': {'concept:academicfield'},
 'concept:website:new_york_times': {'concept:newspaper', 'concept:website'}}

In [25]:
list(inst_classes)[:10]

['concept:weatherphenomenon',
 'concept:perceptionevent',
 'concept:archaea',
 'concept:visualizableattribute',
 'concept:fruit',
 'concept:militaryeventtype',
 'concept:mlauthor',
 'concept:city',
 'concept:restaurant',
 'concept:architect']

## Domain / Range extraction

In [27]:
domains={}
ranges={}
rangewithindomain={}
domainwithinrange={}

nb_lines=sum(1 for _ in open(f"{draft_folder}NELL/NELL.08m.995.ontology.csv", "r", encoding="utf-8"))
with open(f"{draft_folder}NELL/NELL.08m.995.ontology.csv", "r", encoding='utf-8') as r:
    with tqdm(enumerate(r), total=nb_lines) as bar:
        for i, line in bar:
            if i==0:
                continue

            s, p, o = line.strip().split("\t")

            if o == 'concept:everypromotedthing':
                continue

            if not s in rel2id.keys():
                continue

            if p == "domain":
                if not rel2id[s] in domains:
                    domains[s]=set([])
                domains[s].add(o)

            if p == "range":
                if not rel2id[s] in ranges:
                    ranges[s]=set([])
                ranges[s].add(o)

            if p == "domainwithinrange":
                domainwithinrange[rel2id[s]]=o == "true"

            if p == "rangewithindomain":
                rangewithindomain[rel2id[s]]=o == "true"

100%|██████████████████████████████████████████████████████████████████████| 738239/738239 [00:01<00:00, 595115.57it/s]


In [28]:
set([len(v) for v in domains.values()]), set([len(v) for v in ranges.values()])

({1}, {1})

In [29]:
domains={k: list(v)[0] for k, v in domains.items()}
ranges={k: list(v)[0] for k, v in ranges.items()}

In [30]:
{k: v for k, v in list(domains.items())[:5]}

{'concept:sportfansincountry': 'concept:sport',
 'concept:thinghascolor': 'concept:visualizablething',
 'concept:athleteplayssport': 'concept:athlete',
 'concept:visualartistartform': 'concept:visualartist',
 'concept:persondiedatage': 'concept:person'}

In [31]:
{k: v for k, v in list(ranges.items())[:5]}

{'concept:sportfansincountry': 'concept:country',
 'concept:thinghascolor': 'concept:color',
 'concept:athleteplayssport': 'concept:sport',
 'concept:visualartistartform': 'concept:visualartform',
 'concept:persondiedatage': 'concept:nonneginteger'}

In [32]:
inst_classes = inst_classes.union(set(domains.values()).union(set(ranges.values())))

In [33]:
onto_subject_class={}
onto_object_class={}

for subject_id in observed_tails_original_kg.keys():
    for predicate_id in observed_tails_original_kg[subject_id].keys():
        if id2rel[predicate_id] in domains:
            if not subject_id in onto_subject_class:
                onto_subject_class[id2ent[subject_id]]=set([])
            onto_subject_class[id2ent[subject_id]].add(domains[id2rel[predicate_id]])

for object_id in observed_heads_original_kg.keys():
    for predicate_id in observed_heads_original_kg[object_id].keys():
        if id2rel[predicate_id] in ranges:
            if not object_id in onto_object_class:
                onto_object_class[id2ent[object_id]]=set([])
            onto_object_class[id2ent[object_id]].add(ranges[id2rel[predicate_id]])

In [34]:
{k: v for k, v in list(onto_subject_class.items())[-10:]}

{'concept:sportsteam:dalhousie_university': {'concept:sportsteam'},
 'concept:governmentorganization:federal': {'concept:organization'},
 'concept:coach:richard_zednik': {'concept:athlete'},
 'concept:county:duke_university': {'concept:organization'},
 'concept:company:remark': {'concept:organization'},
 'concept:sportsteam:bond_university': {'concept:sportsteam'},
 'concept:coach:lubomir_visnovsky': {'concept:athlete'},
 'concept:sportsteam:cal_poly_slo_mustangs': {'concept:sportsteam'},
 'concept:athlete:mike_wallace': {'concept:person'},
 'concept:person:brent_staples': {'concept:person'}}

In [35]:
{k: v for k, v in list(onto_object_class.items())[-10:]}

{'concept:coach:travis_ford': {'concept:person'},
 'concept:sportsteam:carnegie_mellon_university': {'concept:organization'},
 'concept:person:lower': {'concept:person'},
 'concept:coach:pat_fitzgerald': {'concept:person'},
 'concept:university:southern_cal': {'concept:organization'},
 'concept:university:uw': {'concept:organization'},
 'concept:athlete:ovechkin': {'concept:person'},
 'concept:politicianus:richard_m__daley': {'concept:person'},
 'concept:coach:john_fox': {'concept:person'},
 'concept:person:skip_bertman': {'concept:person'}}

## subclassofId

In [37]:
subclassof_onto={}

nb_lines=sum(1 for _ in open(f"{draft_folder}NELL/NELL.08m.995.ontology.csv", "r", encoding="utf-8"))
with open(f"{draft_folder}NELL/NELL.08m.995.ontology.csv", "r", encoding='utf-8') as r:
    with tqdm(enumerate(r), total=nb_lines) as bar:
        for i, line in bar:
            if i==0:
                continue

            s, p, o = line.strip().split("\t")

            if not p == "generalizations":
                continue

            if o == 'concept:everypromotedthing':
                continue

            if not s in subclassof_onto.keys():
                subclassof_onto[s]=set([])

            subclassof_onto[s].add(o)

100%|██████████████████████████████████████████████████████████████████████| 738239/738239 [00:00<00:00, 854582.10it/s]


In [38]:
subclassof={k: v for k, v in subclassof_onto.items() if k in inst_classes}

In [39]:
len(subclassof_onto), len(subclassof)

(2356, 256)

In [40]:
{k: v for k, v in list(subclassof.items())[:10]}

{'concept:televisionstation': {'concept:geolocatablething',
  'concept:mediacompany'},
 'concept:river': {'concept:geolocatablething', 'concept:location'},
 'concept:park': {'concept:attraction', 'concept:geolocatablething'},
 'concept:university': {'concept:geolocatablething', 'concept:school'},
 'concept:stateorprovince': {'concept:geolocatablething',
  'concept:geopoliticallocation',
  'concept:geopoliticalorganization'},
 'concept:building': {'concept:geolocatablething', 'concept:location'},
 'concept:skyscraper': {'concept:building'},
 'concept:retailstore': {'concept:building'},
 'concept:museum': {'concept:attraction', 'concept:building'},
 'concept:trainstation': {'concept:building'}}

In [41]:
def ancestors(c):
    if not c in subclassof_onto.keys():
        return [c]
    else:
        result = [c]
        for sc in subclassof_onto[c]:
            result.extend(ancestors(sc))
        return result

In [42]:
ancestors('concept:location'), ancestors('concept:university')

(['concept:location'],
 ['concept:university',
  'concept:school',
  'concept:organization',
  'concept:humanagent',
  'concept:agent',
  'concept:geolocatablething'])

In [43]:
subclassof_all={}

for k, v in tqdm(subclassof.items()):
    subclassof_all[k]=set([
        sc
        for c in v
        for sc in ancestors(c)
    ]).union(set([k]))

100%|████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<?, ?it/s]


In [44]:
{k: v for k, v in list(subclassof_all.items())[:4]}

{'concept:televisionstation': {'concept:agent',
  'concept:company',
  'concept:geolocatablething',
  'concept:humanagent',
  'concept:mediacompany',
  'concept:organization',
  'concept:televisionstation'},
 'concept:river': {'concept:geolocatablething',
  'concept:location',
  'concept:river'},
 'concept:park': {'concept:attraction',
  'concept:geolocatablething',
  'concept:location',
  'concept:park'},
 'concept:university': {'concept:agent',
  'concept:geolocatablething',
  'concept:humanagent',
  'concept:organization',
  'concept:school',
  'concept:university'}}

In [45]:
classes=set(subclassof_all.keys()).union(set([
    sc
    for v in subclassof_all.values()
    for sc in v
])).union(inst_classes)

In [46]:
len(classes)

267

In [47]:
class2id={c: i for i, c in enumerate(list(classes))}
id2class = {i: rel for rel, i in class2id.items()}

with open(f"{dest_pkl_folder}class2id.pkl", "wb") as handle:
    dump(class2id, handle)

In [48]:
subclassof2id={class2id[k]: [class2id[x] for x in v] for k, v in subclassof.items()}

for id in id2class.keys():
    if id in subclassof2id.keys():
        continue
    subclassof2id[id]=[]

subclassof_all2id={class2id[k]: [class2id[x] for x in v] for k, v in subclassof_all.items()}

for id in id2class.keys():
    if id in subclassof_all2id.keys():
        continue
    subclassof_all2id[id]=[]

In [49]:
inst_type={
    ent2id[ent]: [
        class2id[x]
        for x in list(
            data_subject_class.get(ent, set([])).union(
                data_object_class.get(ent, set([]))
            ).union(
                onto_subject_class.get(ent, set([]))
            ).union(
                onto_object_class.get(ent, set([]))
            )
        )
    ]
    for ent in ent2id.keys()
}

In [50]:
len(inst_type.keys()), len(ent2id.keys())

(75492, 75492)

In [51]:
{k: v for k, v in list(inst_type.items())[:10]}

{0: [46],
 1: [],
 2: [],
 3: [173, 45, 189, 87],
 4: [99, 57],
 5: [222, 19],
 6: [59],
 7: [105],
 8: [234],
 9: [145]}

In [52]:
{id2ent[k]: [id2class[x] for x in v] for k, v in list(inst_type.items())[:10]}

{'concept:musicgenre:west_coast': ['concept:musicgenre'],
 'concept:personeurope:st___columba': [],
 'concept:hospital:kantonsspital': [],
 'concept:coach:randy_jones': ['concept:coach',
  'concept:athlete',
  'concept:personaustralia',
  'concept:male'],
 'concept:fish:large_fish': ['concept:animal', 'concept:fish'],
 'concept:female:jennifer_connelly': ['concept:female', 'concept:celebrity'],
 'concept:musicartist:matchbox_20': ['concept:musicartist'],
 'concept:bakedgood:shortbread_cookies': ['concept:bakedgood'],
 'concept:book:the_empress_file': ['concept:book'],
 '40.6998250000000,-74.0140300000000': ['concept:llcoordinate']}

In [53]:
inst_type_all={
    ident: list(set([sc for c in inst_type[ident] for sc in subclassof_all2id[c]]))
    for ident, v in inst_type.items()
}

In [54]:
len(inst_type_all.keys()), len(inst_type_all.keys())

(75492, 75492)

In [55]:
{k: v for k, v in list(inst_type_all.items())[:10]}

{0: [46, 23],
 1: [],
 2: [],
 3: [45, 173, 141, 144, 210, 87, 189, 223],
 4: [99, 141, 143, 150, 23, 57],
 5: [141, 210, 19, 222, 223],
 6: [265, 59, 141, 223],
 7: [105, 174, 20, 155, 125],
 8: [65, 234, 23],
 9: [145, 23]}

In [56]:
{id2ent[k]: [id2class[x] for x in v] for k, v in list(inst_type_all.items())[:10]}

{'concept:musicgenre:west_coast': ['concept:musicgenre',
  'concept:abstractthing'],
 'concept:personeurope:st___columba': [],
 'concept:hospital:kantonsspital': [],
 'concept:coach:randy_jones': ['concept:athlete',
  'concept:coach',
  'concept:agent',
  'concept:personbylocation',
  'concept:person',
  'concept:male',
  'concept:personaustralia',
  'concept:humanagent'],
 'concept:fish:large_fish': ['concept:animal',
  'concept:agent',
  'concept:vertebrate',
  'concept:species',
  'concept:abstractthing',
  'concept:fish'],
 'concept:female:jennifer_connelly': ['concept:agent',
  'concept:person',
  'concept:celebrity',
  'concept:female',
  'concept:humanagent'],
 'concept:musicartist:matchbox_20': ['concept:organization',
  'concept:musicartist',
  'concept:agent',
  'concept:humanagent'],
 'concept:bakedgood:shortbread_cookies': ['concept:bakedgood',
  'concept:food',
  'concept:visualizableobject',
  'concept:item',
  'concept:visualizablething'],
 'concept:book:the_empress_file

In [57]:
class2id2ent2id={
    idclass: [
        ident
        for ident in inst_type_all.keys()
        if idclass in inst_type_all[ident]
    ]
    for idclass in id2class.keys()
}

In [58]:
len(class2id2ent2id.keys()), len(class2id.keys())

(267, 267)

In [59]:
{k: v[:3] for k, v in list(class2id2ent2id.items())[:5]}

{0: [8442, 11797, 16509],
 1: [31365, 32136, 34822],
 2: [186, 460, 512],
 3: [63, 405, 636],
 4: [130, 234, 870]}

In [60]:
{id2class[k]: [id2ent[x] for x in v[:3]] for k, v in list(class2id2ent2id.items())[:5]}

{'concept:weatherphenomenon': ['concept:weatherphenomenon:rain',
  'concept:weatherphenomenon:wind',
  'concept:weatherphenomenon:storm'],
 'concept:perceptionevent': ['concept:perceptionevent:flash',
  'concept:perceptionevent:sound',
  'concept:perceptionevent:light'],
 'concept:archaea': ['concept:drug:herceptin',
  'concept:drug:divalproex',
  'concept:drug:mometasone'],
 'concept:visualizableattribute': ['concept:color:maroon',
  'concept:color:cream',
  'concept:geometricshape:vertex'],
 'concept:fruit': ['concept:plant:hickory',
  'concept:fruit:sunflower_seed',
  'concept:agriculturalproduct:dry_beans']}

In [61]:
r2id2dom2id={
    rel2id[k]: class2id[v]
    for k, v in domains.items()
}

r2id2range2id={
    rel2id[k]: class2id[v]
    for k, v in ranges.items()
}

In [62]:
with open(f"{dest_pkl_folder}subclassof2id.pkl", "wb") as handle:
    dump(observed_heads_original_kg, handle)

with open(f"{dest_pkl_folder}subclassof_all2id.pkl", "wb") as handle:
    dump(observed_heads_inv, handle)

with open(f"{dest_pkl_folder}inst_type.pkl", "wb") as handle:
    dump(observed_tails_original_kg, handle)

with open(f"{dest_pkl_folder}inst_type_all.pkl", "wb") as handle:
    dump(observed_tails_inv, handle)

with open(f"{dest_pkl_folder}class2id2ent2id.pkl", "wb") as handle:
    dump(observed_tails_inv, handle)

with open(f"{dest_pkl_folder}r2id2dom2id.pkl", "wb") as handle:
    dump(r2id2dom2id, handle)

with open(f"{dest_pkl_folder}r2id2range2id.pkl", "wb") as handle:
    dump(r2id2dom2id, handle)

# Analysis

In [64]:
from pickle import load
import plotly.graph_objects as go

In [65]:
rel2id = None

with open(f"{dest_pkl_folder}rel2id.pkl", "rb") as handle:
    rel2id = load(handle)

id2rel = {v: k for k, v in rel2id.items()}

with open(f"{dest_pkl_folder}r2id2dom2id.pkl", "rb") as handle:
    r2id2dom2id = load(handle)

r2id2range2id = None

with open(f"{dest_pkl_folder}r2id2range2id.pkl", "rb") as handle:
    r2id2range2id = load(handle)

observed_heads_original_kg = None

with open(f"{dest_pkl_folder}observed_heads_original_kg.pkl", "rb") as handle:
    observed_heads_original_kg = load(handle)

class2id2ent2id = None

with open(f"{dest_pkl_folder}class2id2ent2id.pkl", "rb") as handle:
    class2id2ent2id = load(handle)

In [66]:
total=len(rel2id.values())

signed=[
    p
    for p in rel2id.values()
    if p in r2id2dom2id.keys() 
    and p in r2id2range2id.keys()
]

domain_no_range = [
    p
    for p in rel2id.values()
    if p in r2id2dom2id.keys() 
    and not p in r2id2range2id.keys()
]

range_no_domain = [
    p
    for p in rel2id.values()
    if not p in r2id2dom2id.keys() 
    and p in r2id2range2id.keys()
]

not_signed = [
    p
    for p in rel2id.values()
    if not p in r2id2dom2id.keys() 
    and not p in r2id2range2id.keys()
]

total, int(10000*len(signed)/total)/100, int(10000*len(domain_no_range)/total)/100, int(10000*len(range_no_domain)/total/100), int(10000*len(not_signed)/total)/100

(200, 95.0, 0.0, 0, 5.0)

In [67]:
total, len(signed), len(domain_no_range), len(range_no_domain), len(not_signed)

(200, 190, 0, 0, 10)

In [68]:
predicate_count={id: 0 for id in rel2id.values()}

def display_chart(splits=["train", "test", "valid"]):
    
    predicate_count={id: 0 for id in rel2id.values()}
    
    for split in splits:
        nb_lines=sum(1 for _ in open(f"{dest_txt_folder}{split}2id.txt", "r", encoding="utf-8"))
        with open(f"{dest_txt_folder}{split}2id.txt", "r", encoding='utf-8') as r:
            with tqdm(enumerate(r), total=nb_lines, disable=True) as bar:
                for i, line in bar:
                    p = int(line.strip().split("\t")[1])
                    predicate_count[p] += 1
        
    total_count=sum(predicate_count.values())
    
    signed_count=sum([
        predicate_count[p]
        for p in signed
    ])
    
    domain_no_range_count = sum([
        predicate_count[p]
        for p in domain_no_range
    ])
    
    range_no_domain_count = sum([
        predicate_count[p]
        for p in range_no_domain
    ])
    
    not_signed_count = sum([
        predicate_count[p]
        for p in not_signed
    ])
    
    def percent(number, total, precision=4):
        if total == 0:
            return "??"
        return int((10**precision)*number/total)/(10**(precision - 2))
    
    print("In number of triples:\n")
    print(total_count, signed_count, domain_no_range_count, range_no_domain_count, not_signed_count)
    
    print("\nIn percentage of triples:\n")
    print(total_count, percent(signed_count, total_count), percent(domain_no_range_count, total_count), percent(range_no_domain_count, total_count), percent(not_signed_count, total_count))


    predicate_domains_instances = {i: len(class2id2ent2id[r2id2dom2id[i]]) for i in r2id2dom2id.keys()}
    
    not_instanciated_domains = [k for k in predicate_domains_instances.keys() if predicate_domains_instances[k] == 0]
    
    not_instanciated_domains_count = sum([predicate_count[p] for p in not_instanciated_domains])
    not_instanciated_domains_for_signed_count = sum([predicate_count[p] for p in not_instanciated_domains if p in signed])
    not_instanciated_domains_for_domained_count = sum([predicate_count[p] for p in not_instanciated_domains if p in domain_no_range])
    predicate_ranges_instances = {i: len(class2id2ent2id[r2id2range2id[i]]) for i in r2id2range2id.keys()}
    
    not_instanciated_ranges = [k for k in predicate_ranges_instances.keys() if predicate_ranges_instances[k] == 0]
    
    not_instanciated_ranges_count = sum([predicate_count[p] for p in not_instanciated_ranges])
    not_instanciated_ranges_for_signed_count = sum([predicate_count[p] for p in not_instanciated_ranges if p in signed])
    not_instanciated_ranges_for_ranged_count = sum([predicate_count[p] for p in not_instanciated_ranges if p in range_no_domain])
    
    print(
        "These domained predicates have no instances:\n",
        "\t" + "\n\t".join([id2rel[p] for p in not_instanciated_domains])
    )
    
    print(
        "\n These ranged predicates have no instances:\n",
        "\t" + "\n\t".join([id2rel[p] for p in not_instanciated_ranges])
    )
    
    print("\nOverlap between these sets?", len([x for x in not_instanciated_ranges if x in not_instanciated_domains]) > 0)
    
    not_instanciated_signatures_for_signed_count = (not_instanciated_domains_for_signed_count + not_instanciated_ranges_for_signed_count)
    
    print(
        "\nImpact on signed triples: ",
        f"{not_instanciated_signatures_for_signed_count} ({percent(not_instanciated_signatures_for_signed_count, signed_count)}% of fully signed triples)"
    )
    
    print(
        "Impact on domained triples: ",
        f"{not_instanciated_domains_for_domained_count} ({percent(not_instanciated_domains_for_domained_count, domain_no_range_count)}% of domained triples)"
    )
    
    print(
        "Impact on ranged triples: ",
        f"{not_instanciated_ranges_for_ranged_count} ({percent(not_instanciated_ranges_for_ranged_count, range_no_domain_count)}% of ranged triples)"
    )

    x_divider=8
    y_divider=20
    y_offset=0.65
    
    nodes = {
        0: {
            'label': 'Total',
            'x': 0/x_divider,
            'y': 0/y_divider+y_offset
        },
        1: {
            'label': 'Full sign', 
            'x': 1/x_divider,
            'y': 4/y_divider+y_offset
        },
        4: {
            'label': 'No sign',
            'x': 2/x_divider,
            'y': -9/y_divider+y_offset
        }
    }
    #fig.write_image(f"{destination_folder}fig1.png", engine='orca', width=1500, height=450)

In [69]:
print("\n TRAIN SPLIT \n")

display_chart(splits=["train"])

print("\n TEST SPLIT \n")

display_chart(splits=["test"])

print("\n VALID SPLIT \n")

display_chart(splits=["valid"])


 TRAIN SPLIT 

In number of triples:

149678 109800 0 0 39878

In percentage of triples:

149678 73.35 0.0 0.0 26.64
These domained predicates have no instances:
 	

 These ranged predicates have no instances:
 	

Overlap between these sets? False

Impact on signed triples:  0 (0.0% of fully signed triples)
Impact on domained triples:  0 (??% of domained triples)
Impact on ranged triples:  0 (??% of ranged triples)

 TEST SPLIT 

In number of triples:

3992 3992 0 0 0

In percentage of triples:

3992 100.0 0.0 0.0 0.0
These domained predicates have no instances:
 	

 These ranged predicates have no instances:
 	

Overlap between these sets? False

Impact on signed triples:  0 (0.0% of fully signed triples)
Impact on domained triples:  0 (??% of domained triples)
Impact on ranged triples:  0 (??% of ranged triples)

 VALID SPLIT 

In number of triples:

543 543 0 0 0

In percentage of triples:

543 100.0 0.0 0.0 0.0
These domained predicates have no instances:
 	

 These ranged predica