# DB100k dataset semantically enriched

In [2]:
from glob import glob
from tqdm import tqdm
from pickle import dump, load, HIGHEST_PROTOCOL
from os import makedirs
from os.path import exists
from urllib.request import urlopen
from bz2 import open as bz2open
from shutil import copy as copy_file

try:
    from rdflib import Graph
except:
    !pip install rdflib
    !conda install -c plotly plotly-orca -y

In [3]:
# Working folder for downloaded/intermediate files, and folder receiving the final dataset.
draft_folder="../draft/DB100k-DBP"
destination_folder="."

In [4]:
# Make sure both base folders end with "/" so later paths can be built by plain concatenation.
if not draft_folder.endswith('/'):
    draft_folder=draft_folder + '/'
if not destination_folder.endswith('/'):
    destination_folder=destination_folder + '/'

# Remote locations the raw material is downloaded from.
DB93k_folder_url = "https://github.com/nicolas-hbt/benchmark-sematk/raw/refs/heads/main/datasets/DB93K/"
DB100k_folder_url = "https://github.com/iieir-km/ComplEx-NNE_AER/raw/refs/heads/master/datasets/DB100K/"
DBPedia_archive_url = "https://downloads.dbpedia.org/2016-10/core/"
DBPedia_ontology_url = "http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.owl"

# One draft sub-folder per source.
DB93k_draft_folder=draft_folder + "DB93k/"
DB100k_draft_folder=draft_folder + "DB100k/"
DBpedia_draft_folder=draft_folder + "DBpedia/"

# Text files go directly in the destination folder, pickles in a sub-folder.
dest_txt_folder=destination_folder
dest_pkl_folder=destination_folder + "pickle/"

In [5]:
# Create every working/output folder up front; already existing folders are left untouched.
required_folders = (
    DB93k_draft_folder,
    DB100k_draft_folder,
    DBpedia_draft_folder,
    dest_txt_folder,
    dest_pkl_folder,
)
for folder in required_folders:
    makedirs(folder, exist_ok=True)

In [6]:
# Fetch the pickles hosted under DB93k_folder_url into the DB93k draft folder.
# A file that is already present locally is not downloaded again.
# ("class2id.pkl" used to be listed twice; each file is now listed once.)
for file in tqdm([
    "rel2id.pkl",
    "class2id.pkl",
    "ent2id.pkl",
    "rel2dom.pkl",
    "rel2range.pkl",
    "subclassof2id.pkl"
]):
    file_uri=f"{DB93k_folder_url}{file}"
    file_path = f"{DB93k_draft_folder}{file}"

    if exists(file_path):
        continue

    with urlopen(file_uri) as file_online:
        with open(file_path, "wb") as file_local:
            file_local.write(file_online.read())

100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<?, ?it/s]


In [7]:
# Fetch the three DB100k split files unless a local copy already exists.
for file in tqdm(["_train.txt", "_test.txt", "_valid.txt"]):
    file_uri=DB100k_folder_url + file
    file_path=DB100k_draft_folder + file

    if not exists(file_path):
        with urlopen(file_uri) as file_online, open(file_path, "wb") as file_local:
            file_local.write(file_online.read())

100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]


In [8]:
# Download the DBpedia ontology (OWL) once; an existing local copy is reused.
ontology_path=f"{DBpedia_draft_folder}dbpedia_2016-10.owl"
if not exists(ontology_path):
    with urlopen(DBPedia_ontology_url) as file_online, open(ontology_path, "wb") as file_local:
        file_local.write(file_online.read())

In [9]:
# Download the bz2-compressed DBpedia dumps needed below, skipping files already present.
# NOTE(review): the recorded run of this cell stopped with "HTTP Error 404: Not Found" on the
# first file — confirm that DBPedia_archive_url still serves these archives.
with tqdm([
    "interlanguage_links_chapters_en.ttl",
    "instance_types_en.ttl",
]) as bar:
    for file in bar:
        bar.set_description(f"Downloading {file}")

        file_uri=f"{DBPedia_archive_url}{file}.bz2"
        file_path=f"{DBpedia_draft_folder}{file}.bz2"

        if exists(file_path):
            continue

        with urlopen(file_uri) as file_online:
            with open(file_path, "wb") as file_local:
                # Stream in 1 MiB chunks instead of holding the whole archive in memory.
                for chunk in iter(lambda: file_online.read(1 << 20), b""):
                    file_local.write(chunk)

Downloading interlanguage_links_chapters_en.ttl:   0%|                                           | 0/2 [00:00<?, ?it/s]


HTTPError: HTTP Error 404: Not Found

In [None]:
# Expand each downloaded .bz2 archive next to itself as a UTF-8 text file.
# An archive whose decompressed file already exists is skipped.
with tqdm([
    "interlanguage_links_chapters_en.ttl",
    "instance_types_en.ttl",
]) as bz2_bar:
    for file in bz2_bar:
        bz2_bar.set_description(f"Decompressing {file}")

        ttl_file_path=DBpedia_draft_folder + file
        bz2_file_path=ttl_file_path + ".bz2"

        if not exists(ttl_file_path):
            with bz2open(bz2_file_path, "r") as bz2_file, open(ttl_file_path, "w", encoding="utf-8") as ttl_file:
                # The archive is read as bytes, line by line; each line is decoded before writing.
                ttl_file.writelines(raw_line.decode() for raw_line in bz2_file)

In [None]:
# Substitution table applied to subject/object tokens when the DB100k splits are read
# (presumably malformed tokens mapped to the intended Wikidata ids — TODO confirm).
replace={
    "www.monolithgraphics.com": "Q1969125",
    "?autoplay=true": "Q477993",
    "player.html": "Q7952744"
}
# NOTE(review): the table above is immediately overridden by an empty one, so no
# substitution is applied downstream — confirm this is intentional.
replace={}

In [None]:
# Read the three DB100k splits once, collecting the distinct entities and predicates
# and counting the triples of each split.
entities_100=set()
predicates_100=set()
split_sizes={"train": 0, "test": 0, "valid": 0}

for split in split_sizes:
    file_path=f"{DB100k_draft_folder}_{split}.txt"
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            s, p, o = line.strip().split("\t")
            # Apply the optional token substitutions to subject and object.
            s = replace.get(s, s)
            o = replace.get(o, o)
            predicates_100.add(p)
            entities_100.update((s, o))
            split_sizes[split]+=1

train_100=split_sizes["train"]
test_100=split_sizes["test"]
valid_100=split_sizes["valid"]
count_100=train_100+test_100+valid_100

In [None]:
list(predicates_100)[:10]

# ent2id, rel2id, class2id
## rel2id

In [None]:
# Turn each bare predicate name into a full DBpedia ontology URI.
# - The prefix check keeps the cell idempotent: re-running it does not wrap URIs twice.
# - Sorting fixes the list order, so the relation ids derived from it no longer depend
#   on set iteration order (which varies between kernel sessions).
predicates_100=sorted(
    predicate
    if predicate.startswith("<http://dbpedia.org/ontology/")
    else f"<http://dbpedia.org/ontology/{predicate}>"
    for predicate in predicates_100
)

In [None]:
# Number the predicates by their position in `predicates_100`, and keep the reverse map.
rel2id_100=dict(zip(predicates_100, range(len(predicates_100))))
id2rel_100=dict((rel_id, rel_uri) for rel_uri, rel_id in rel2id_100.items())

# Ids of inverse predicates are obtained by adding this offset to the original id.
inverse_predicate_offset = 1 + max(rel2id_100.values())

with open(dest_pkl_folder + "rel2id.pkl", "wb") as handle:
    dump(rel2id_100, handle)

## ent2id

In [None]:
# Build the Wikidata-id -> DBpedia-URI mapping for the entities seen in the DB100k splits,
# then derive the entity ids from it.
interlanguage_links_path=f"{DBpedia_draft_folder}interlanguage_links_chapters_en.ttl"

wikidata_dbpedia={}
# First pass only counts lines for the progress bar; the `with` closes the handle.
with open(interlanguage_links_path, "r", encoding="utf-8") as f:
    nb_lines=sum(1 for _ in f)
with open(interlanguage_links_path, "r", encoding="utf-8") as f:
    for i, line in tqdm(enumerate(f), total=nb_lines):
        if i == 0:
            # The first line is skipped (presumably the dump's header comment — TODO confirm).
            continue
        s, p, o = line.strip().split(" ")[:3]
        # Keep only links from a DBpedia resource to a Wikidata entity used by DB100k.
        if not s.startswith("<http://dbpedia.org/resource/"):
            continue
        if not o.startswith("<http://www.wikidata.org/entity/"):
            continue
        o = o[len("<http://www.wikidata.org/entity/"):-1]
        if not o in entities_100:
            continue
        wikidata_dbpedia[o]=s

# Entities without a DBpedia counterpart keep their original token as identifier.
# Sorted so that the ids assigned below do not depend on set iteration order.
unknown_entities_100=sorted(
    x for x in entities_100
    if not x in wikidata_dbpedia
)

for k in unknown_entities_100:
    wikidata_dbpedia[k]=k

with open(f"{dest_pkl_folder}wikidata_dbpedia.pkl", "wb") as handle:
    dump(wikidata_dbpedia, handle)

ent2id_100 = {wikidata_dbpedia[k]: i for i, k in enumerate(wikidata_dbpedia.keys())}
id2ent_100 = {v: k for k, v in ent2id_100.items()}

# ent2id.pkl must contain the entity -> id mapping (it used to receive `wikidata_dbpedia`).
with open(f"{dest_pkl_folder}ent2id.pkl", "wb") as handle:
    dump(ent2id_100, handle)

In [None]:
unknown_entities_100

### Encode train, test, valid splits

In [None]:
# For each split: write the id-encoded triples to <split>2id.txt, then build
# <split>2id_inv.txt as a copy of it followed by one inverse triple per original triple.
for split in ["train", "test", "valid"]:
    raw_path=f"{DB100k_draft_folder}_{split}.txt"
    encoded_path=f"{dest_txt_folder}{split}2id.txt"
    inverse_path=f"{dest_txt_folder}{split}2id_inv.txt"

    nb_lines=sum(1 for _ in open(raw_path, "r", encoding="utf-8"))

    with open(raw_path, "r", encoding='utf-8') as r, open(encoded_path, "w+", encoding='utf-8') as w:
        with tqdm(r, total=nb_lines) as bar:
            bar.set_description(f"Encoding {split}2id.txt")
            for line in bar:
                s, p, o = line.strip().split("\t")
                s = replace.get(s, s)
                o = replace.get(o, o)
                encoded_s = ent2id_100[wikidata_dbpedia[s]]
                encoded_p = rel2id_100[f"<http://dbpedia.org/ontology/{p}>"]
                encoded_o = ent2id_100[wikidata_dbpedia[o]]
                w.write(f"{encoded_s}\t{encoded_p}\t{encoded_o}\n")

    # The encoded file is closed (hence flushed) before being copied.
    copy_file(encoded_path, inverse_path)

    with open(encoded_path, "r", encoding='utf-8') as r, open(inverse_path, "a", encoding='utf-8') as a:
        with tqdm(r, total=nb_lines) as bar:
            bar.set_description(f"Encoding {split}2id_inv.txt")
            for line in bar:
                s, p, o = line.strip().split("\t")
                # Inverse triple: swap subject/object and shift the predicate id.
                a.write(f"{o}\t{int(p)+inverse_predicate_offset}\t{s}\n")

# observed_heads_original_kg, observed_tails_original_kg, observed_heads_inv, observed_tails_inv

In [None]:
# Two-level indexes, filled by `observe`: index[a][b] -> list of observed values.
observed_heads_original_kg, observed_heads_inv = {}, {}
observed_tails_original_kg, observed_tails_inv = {}, {}

def observe(d, a, b, c):
    """Append `c` to the list stored at d[a][b].

    The inner dict d[a] and the list d[a][b] are created on first use.
    `d` is modified in place; nothing is returned.
    """
    d.setdefault(a, {}).setdefault(b, []).append(c)


# Index every encoded triple of every split, both for the original graph and for the
# graph augmented with inverse triples (predicate id shifted by inverse_predicate_offset).
for split in ["train", "test", "valid"]:
    encoded_path=f"{dest_txt_folder}{split}2id.txt"
    nb_lines=sum(1 for _ in open(encoded_path, "r", encoding="utf-8"))
    with open(encoded_path, "r", encoding='utf-8') as r, tqdm(r, total=nb_lines) as bar:
        bar.set_description(f"Indexing triples from {split}2id.txt")
        for line in bar:
            s, p, o = map(int, line.strip().split("\t"))
            p_inv = p + inverse_predicate_offset

            # Tails reachable from a head through a predicate.
            observe(observed_tails_original_kg, s, p, o)
            observe(observed_tails_inv, s, p, o)
            observe(observed_tails_inv, o, p_inv, s)

            # Heads reaching a tail through a predicate.
            observe(observed_heads_original_kg, o, p, s)
            observe(observed_heads_inv, o, p, s)
            observe(observed_heads_inv, s, p_inv, o)

In [None]:
# Persist the four head/tail indexes, one pickle per index, named after the variable.
for pickle_name, observed_index in [
    ("observed_heads_original_kg", observed_heads_original_kg),
    ("observed_heads_inv", observed_heads_inv),
    ("observed_tails_original_kg", observed_tails_original_kg),
    ("observed_tails_inv", observed_tails_inv),
]:
    with open(f"{dest_pkl_folder}{pickle_name}.pkl", "wb") as handle:
        dump(observed_index, handle)

## class2id

In [None]:
# Collect, for every known entity, the DBpedia ontology class given by the instance-types dump.
# When an entity appears on several lines, the last ontology class read is the one kept.
instance_types_100={}
classes_100=set()

instance_types_path=DBpedia_draft_folder + "instance_types_en.ttl"
nb_lines=sum(1 for _ in open(instance_types_path, "r", encoding="utf-8"))
with open(instance_types_path, "r", encoding="utf-8") as f:
    for i, line in tqdm(enumerate(f), total=nb_lines):
        if i == 0:
            continue
        s, p, o = line.strip().split(" ")[:3]

        # Only entities of the dataset typed with a DBpedia ontology class are kept.
        if s in ent2id_100 and o.startswith("<http://dbpedia.org/ontology/"):
            classes_100.add(o)
            instance_types_100[ent2id_100[s]]=o

In [None]:
# Load the DBpedia ontology file from the draft folder into an rdflib graph,
# parsing it with the "xml" format. The graph is queried with SPARQL in later cells.
ontology_graph = Graph()
ontology_graph.parse(f"{DBpedia_draft_folder}dbpedia_2016-10.owl", format="xml")

In [None]:
# One "(<predicate-uri>)" row per line, ready to be pasted into a SPARQL VALUES clause.
predicates_values = "\n".join(f"({predicate_uri})" for predicate_uri in predicates_100)

In [None]:
# SPARQL query returning, for each dataset predicate, its rdfs:domain and rdfs:range statements.
signature_query = """
    SELECT ?p ?sign ?x WHERE {
        VALUES (?p) {
            $PREDICATE_VALUES
        }
        VALUES (?sign) {
            (<http://www.w3.org/2000/01/rdf-schema#domain>)
            (<http://www.w3.org/2000/01/rdf-schema#range>)
        }
        ?p ?sign ?x
    }
    """.replace("$PREDICATE_VALUES", predicates_values)

# Each result row becomes (predicate URI, "domain" | "range", class URI), URIs in <...> form.
signatures = [
    (f"<{str(row_predicate)}>", str(row_sign).split("#")[-1], f"<{str(row_class)}>")
    for (row_predicate, row_sign, row_class) in ontology_graph.query(signature_query)
]

In [None]:
# Split the signatures into predicate -> domain and predicate -> range maps,
# keeping only classes from the DBpedia ontology namespace.
rel2domain={}
rel2range={}

for signed_predicate, signature_kind, signature_class in signatures:
    if not signature_class.startswith("<http://dbpedia.org/ontology/"):
        continue
    if signature_kind == "domain":
        rel2domain[signed_predicate] = signature_class
    elif signature_kind == "range":
        rel2range[signed_predicate] = signature_class

# Classes used in signatures join the set of known classes.
classes_100.update(rel2domain.values())
classes_100.update(rel2range.values())

In [None]:
# Preview: first five predicate -> domain entries.
dict(list(rel2domain.items())[:5])

In [None]:
# Preview: first five predicate -> range entries.
dict(list(rel2range.items())[:5])

In [None]:
len(classes_100)

In [None]:
# Walk the ontology upwards: starting from the known classes, repeatedly query the
# rdfs:subClassOf parents (restricted to the DBpedia ontology namespace) of the classes
# discovered in the previous round, until a round brings no new class to query.
# class URI -> super-class URI; a later result for the same class replaces an earlier one.
super_classes = {}

# Classes whose parents still have to be looked up.
discovered_classes = list(classes_100)

while len(discovered_classes) > 0:
    # The VALUES clause is filled with one "(<class-uri>)" row per class to look up.
    iteration_request="""
                SELECT ?onto_class ?super_class WHERE {
                    VALUES (?onto_class) {
                        $CLASSES_VALUES
                    }
                    ?onto_class <http://www.w3.org/2000/01/rdf-schema#subClassOf> ?super_class .
                    filter(strstarts(str(?super_class), "http://dbpedia.org/ontology/"))
                }
            """.replace(
            "$CLASSES_VALUES",
            "\n".join([
                    f"{' ' * 4 * 5}({c})"
                    for c in discovered_classes
                ]
             )
        )

    # Result rows as (class URI, super-class URI), both in <...> form.
    iteration_superclasses = [
        (f"<{str(onto_class)}>", f"<{str(super_class)}>")
        for (onto_class, super_class) in ontology_graph.query(iteration_request)
    ]
    
    for onto_class, super_class in iteration_superclasses:
        super_classes[onto_class] = super_class

    # Next round: parents found in this round that have no recorded parent of their own yet.
    discovered_classes = list(set([
        super_class
        for _, super_class in iteration_superclasses
        if not super_class in super_classes.keys()
    ]))

# Every class met while climbing joins the set of known classes.
for onto_class, super_class in super_classes.items():
    classes_100.add(onto_class)
    classes_100.add(super_class)

# Preview: first ten class -> super-class entries.
{k: v for k, v in list(super_classes.items())[:10]}

In [None]:
len(classes_100)

In [None]:
# Number the classes in sorted URI order, so the ids no longer depend on the
# iteration order of the `classes_100` set (which varies between kernel sessions).
class2id_100={c: i for i, c in enumerate(sorted(classes_100))}

with open(f"{dest_pkl_folder}class2id.pkl", "wb+") as handle:
    dump(class2id_100, handle)

# Reverse lookup: class id -> class URI.
id2class_100={i: c for c, i in class2id_100.items()}

In [None]:
# Preview: first five class -> id entries.
dict(list(class2id_100.items())[:5])

In [None]:
# Entity id -> id of the class asserted for that entity.
instype=dict(
    (typed_entity_id, class2id_100[typed_class_uri])
    for typed_entity_id, typed_class_uri in instance_types_100.items()
)

with open(dest_pkl_folder + "instype.pkl", "wb+") as handle:
    dump(instype, handle)

# Preview: first five entries.
dict(list(instype.items())[:5])

In [None]:
len(ent2id_100.keys()) - len(instype.keys())

In [None]:
# Preview: up to ten entity URIs that have no asserted type.
untyped_entity_uris = [
    candidate_uri
    for candidate_uri, candidate_id in ent2id_100.items()
    if candidate_id not in instype
]
untyped_entity_uris[:10]

In [None]:
# Relation id -> class id of the relation's domain.
r2id2dom2id = dict(
    (rel2id_100[signed_rel], class2id_100[domain_class])
    for signed_rel, domain_class in rel2domain.items()
)

with open(dest_pkl_folder + "r2id2dom2id.pkl", "wb+") as handle:
    dump(r2id2dom2id, handle)

# Relation id -> class id of the relation's range.
r2id2range2id = dict(
    (rel2id_100[signed_rel], class2id_100[range_class])
    for signed_rel, range_class in rel2range.items()
)

with open(dest_pkl_folder + "r2id2range2id.pkl", "wb+") as handle:
    dump(r2id2range2id, handle)

In [None]:
# Preview: first five relation id -> domain class id entries.
dict(list(r2id2dom2id.items())[:5])

In [None]:
# Preview: first five relation id -> range class id entries.
dict(list(r2id2range2id.items())[:5])

In [None]:
# Class id -> id of its super-class, for every class with a recorded super-class.
subclassof2id_100 = dict(
    (class2id_100[child_uri], class2id_100[parent_uri])
    for child_uri, parent_uri in super_classes.items()
)

with open(dest_pkl_folder + "subclassof2id.pkl", "wb+") as handle:
    dump(subclassof2id_100, handle)

In [None]:
def ancestors(c):
    """Return `c` followed by its chain of super-classes, nearest first.

    The chain is read from the global `subclassof2id_100` (class id -> super-class id)
    and ends at the first class without a recorded super-class. The walk is iterative,
    so deep hierarchies do not hit the recursion limit, and it stops as soon as a class
    would be visited twice, so a cyclic hierarchy cannot loop forever.
    """
    chain = [c]
    visited = {c}
    while chain[-1] in subclassof2id_100:
        parent = subclassof2id_100[chain[-1]]
        if parent in visited:
            # Cycle in the hierarchy: stop instead of walking it again.
            break
        chain.append(parent)
        visited.add(parent)
    return chain

def descendants(c):
    """Return `c` followed by all its sub-classes, depth first.

    Direct children are the keys of the global `subclassof2id_100` whose value is `c`,
    visited in the dict's key order; each child is followed by its own descendants.
    A class without children yields `[c]`.
    """
    found = [c]
    for child, parent in subclassof2id_100.items():
        if parent == c:
            found += descendants(child)
    return found

In [None]:
# Entity id -> asserted class followed by all its ancestors; empty list for untyped entities.
instype_all = dict(
    (known_entity_id, ancestors(instype[known_entity_id]) if known_entity_id in instype else [])
    for known_entity_id in ent2id_100.values()
)

with open(dest_pkl_folder + "instype_all.pkl", "wb+") as handle:
    dump(instype_all, handle)

In [None]:
# Preview: first five entity id -> class id list entries.
dict(list(instype_all.items())[:5])

In [None]:
# Same preview, decoded back to entity and class URIs.
{
    id2ent_100[preview_entity_id]: [id2class_100[preview_class_id] for preview_class_id in preview_class_ids]
    for preview_entity_id, preview_class_ids in list(instype_all.items())[:5]
}

In [None]:
# Number of entities with no type at all.
untyped_uris = [
    id2ent_100[entity_id]
    for entity_id, class_ids in instype_all.items()
    if len(class_ids) == 0
]
len(untyped_uris)

In [None]:
# Class id -> ids of the entities having that class among their (asserted or inherited) types.
# Built in a single pass over the entities instead of rescanning every entity for every class.
# Every class id gets an entry, possibly an empty list; entities appear in `instype_all` order.
class2id2ent2id={class_id: [] for class_id in id2class_100.keys()}

for ent_id, class_ids in instype_all.items():
    # dict.fromkeys drops repeated class ids so an entity is listed at most once per class.
    for class_id in dict.fromkeys(class_ids):
        if class_id in class2id2ent2id:
            class2id2ent2id[class_id].append(ent_id)

with open(f"{dest_pkl_folder}class2id2ent2id.pkl", "wb+") as handle:
    dump(class2id2ent2id, handle)

In [None]:
# Preview: first ten classes, with at most five member entity ids each.
{
    preview_class_id: member_ids[:5]
    for preview_class_id, member_ids in list(class2id2ent2id.items())[:10]
}

In [None]:
# Same kind of preview decoded to URIs: first five classes, at most five members each.
{
    id2class_100[preview_class_id]: [id2ent_100[member_id] for member_id in member_ids[:5]]
    for preview_class_id, member_ids in list(class2id2ent2id.items())[:5]
}

In [None]:
# URIs of the classes that have no instance in the dataset.
[
    id2class_100[empty_class_id]
    for empty_class_id, member_ids in class2id2ent2id.items()
    if len(member_ids) == 0
]

In [None]:
# Infer missing entity types from predicate signatures:
# - a subject of a predicate gets the predicate's domain class and its ancestors,
# - an object of a predicate gets the predicate's range class and its ancestors,
# keeping only (entity, class) pairs not already present in `instype_all`.
# Entities without a DBpedia counterpart (unknown_entities_100) are left out.

# Set copy of the unknown entities: constant-time membership instead of a list scan per entity.
unknown_entity_uris = set(unknown_entities_100)

domain_inference = list(set([
    (s, onto_class)

    for s in tqdm(observed_tails_original_kg.keys(), total=len(observed_tails_original_kg.keys()))
    if not id2ent_100[s] in unknown_entity_uris

    for p in observed_tails_original_kg[s].keys()
    if p in r2id2dom2id.keys()

    for onto_class in ancestors(r2id2dom2id[p])
    if not onto_class in instype_all[s]
]))

# Set view of the domain inferences, used to keep the range inferences disjoint from them
# without scanning the whole list for every candidate pair.
domain_inference_pairs = set(domain_inference)

range_inference = list(set([
    (o, onto_class)

    for o in tqdm(observed_heads_original_kg.keys(), total=len(observed_heads_original_kg.keys()))
    if not id2ent_100[o] in unknown_entity_uris

    for p in observed_heads_original_kg[o].keys()
    if p in r2id2range2id.keys()

    for onto_class in ancestors(r2id2range2id[p])
    if not onto_class in instype_all[o]
    and not (o, onto_class) in domain_inference_pairs
]))

signature_inference = domain_inference + range_inference

# Entity id -> list of inferred class ids.
inferred_types = {}

for entity, onto_class in tqdm(signature_inference, total=len(signature_inference)):
    inferred_types.setdefault(entity, []).append(onto_class)

print("Entity types inferred by domained predicates", len(domain_inference))
print("Entity types inferred by ranged predicates", len(range_inference))
print("Entity affected by type inference", len(inferred_types.keys()))

In [None]:
# Show, as class URIs, the types inferred for the England resource.
print("Inferred types for England:\n")

england_type_ids = inferred_types[ent2id_100['<http://dbpedia.org/resource/England>']]
print("\n".join(id2class_100[type_id] for type_id in england_type_ids))

In [None]:
# Explain one inferred type: list the triples whose predicate signature (domain for triples
# where `entity` is subject, range where it is object) is `onto_class` or one of its sub-classes.
entity='<http://dbpedia.org/resource/England>'
onto_class='<http://dbpedia.org/ontology/Genre>'

entity_id = ent2id_100[entity]
# Computed once instead of once per predicate: `onto_class` and every class below it.
blamed_class_ids = set(descendants(class2id_100[onto_class]))

# (predicate URI, domain class URI) for predicates leaving `entity` with a blamed domain.
domain_errors = [
    (id2rel_100[p], id2class_100[r2id2dom2id[p]])
    for p in observed_tails_original_kg.get(entity_id, {})
    if p in r2id2dom2id.keys()
    and r2id2dom2id[p] in blamed_class_ids
]

# (predicate URI, range class URI) for predicates reaching `entity` with a blamed range.
range_errors = [
    (id2rel_100[p], id2class_100[r2id2range2id[p]])
    for p in observed_heads_original_kg.get(entity_id, {})
    if p in r2id2range2id.keys()
    and r2id2range2id[p] in blamed_class_ids
]

print(f"Triples responsible for entity '{entity.split('/')[-1][:-1]}' to be entailed of type '{onto_class.split('/')[-1][:-1]}'\n")

# Both kinds of lines now end with " ." (the domain lines used to lack the terminator).
print("\n".join([
    f"{entity} {rel} {id2ent_100[tail]} ."
    for rel, _ in domain_errors
    for tail in observed_tails_original_kg[entity_id][rel2id_100[rel]]
] + [
    f"{id2ent_100[head]} {rel} {entity} ."
    for rel, _ in range_errors
    for head in observed_heads_original_kg[entity_id][rel2id_100[rel]]
]))

# Analysis

In [None]:
from pickle import load
import plotly.graph_objects as go

In [None]:
def _load_pickle(name):
    """Return the object stored in `<dest_pkl_folder><name>.pkl`."""
    with open(f"{dest_pkl_folder}{name}.pkl", "rb") as handle:
        return load(handle)

# Reload from disk everything the analysis below relies on.
rel2id_100 = _load_pickle("rel2id")
id2rel_100 = {v: k for k, v in rel2id_100.items()}

wikidata_dbpedia = _load_pickle("wikidata_dbpedia")
r2id2dom2id = _load_pickle("r2id2dom2id")
r2id2range2id = _load_pickle("r2id2range2id")
observed_heads_original_kg = _load_pickle("observed_heads_original_kg")
class2id2ent2id = _load_pickle("class2id2ent2id")

In [None]:
# Partition the predicates by which parts of their signature (domain / range) are known.
total=len(rel2id_100.values())

# Both domain and range known.
signed=[
    p
    for p in rel2id_100.values()
    if p in r2id2dom2id.keys()
    and p in r2id2range2id.keys()
]

# Domain known, range unknown.
domain_no_range = [
    p
    for p in rel2id_100.values()
    if p in r2id2dom2id.keys()
    and not p in r2id2range2id.keys()
]

# Range known, domain unknown.
range_no_domain = [
    p
    for p in rel2id_100.values()
    if not p in r2id2dom2id.keys()
    and p in r2id2range2id.keys()
]

# Neither known.
not_signed = [
    p
    for p in rel2id_100.values()
    if not p in r2id2dom2id.keys()
    and not p in r2id2range2id.keys()
]

# Predicate count followed by the share of each category, in percent truncated to two decimals.
# (The range_no_domain term had its "/100" inside int(...), which truncated it to a whole number.)
total, int(10000*len(signed)/total)/100, int(10000*len(domain_no_range)/total)/100, int(10000*len(range_no_domain)/total)/100, int(10000*len(not_signed)/total)/100

In [None]:
# Number of triples per predicate, then per signature category.
predicate_count={rel_id: 0 for rel_id in rel2id_100.values()}

# observed_heads_original_kg[tail][predicate] lists one head per indexed triple, so the
# number of triples is the length of that list (adding 1 per entry would only count
# distinct (tail, predicate) pairs).
for heads_by_predicate in observed_heads_original_kg.values():
    for p, heads in heads_by_predicate.items():
        predicate_count[p]+=len(heads)

total_count=sum(predicate_count.values())

signed_count=sum([
    predicate_count[p]
    for p in signed
])

domain_no_range_count = sum([
    predicate_count[p]
    for p in domain_no_range
])

range_no_domain_count = sum([
    predicate_count[p]
    for p in range_no_domain
])

not_signed_count = sum([
    predicate_count[p]
    for p in not_signed
])

def percent(number, total, precision=4):
    """Return `number` as a percentage of `total`, truncated (not rounded).

    `precision` is the number of significant decimal digits of the ratio that are kept,
    so the default of 4 yields a percentage with two decimals (e.g. 1/3 -> 33.33).
    An empty total yields 0.0 instead of raising ZeroDivisionError.
    """
    if total == 0:
        return 0.0
    return int((10**precision)*number/total)/(10**(precision - 2))

# Triple counts per signature category, in the order: signed, domain only, range only, unsigned.
category_counts = (signed_count, domain_no_range_count, range_no_domain_count, not_signed_count)

print("In number of triples:\n")
print(total_count, *category_counts)

print("\nIn percentage of triples:\n")
print(total_count, *[percent(category_count, total_count) for category_count in category_counts])

In [None]:
# Predicates whose domain (resp. range) class has no instance in the dataset, and the
# number of triples they account for.

# Relation id -> number of entities belonging to its domain class.
predicate_domains_instances = {i: len(class2id2ent2id[r2id2dom2id[i]]) for i in r2id2dom2id.keys()}

not_instanciated_domains = [k for k in predicate_domains_instances.keys() if predicate_domains_instances[k] == 0]

not_instanciated_domains_count = sum([predicate_count[p] for p in not_instanciated_domains])
not_instanciated_domains_for_signed_count = sum([predicate_count[p] for p in not_instanciated_domains if p in signed])
not_instanciated_domains_for_domained_count = sum([predicate_count[p] for p in not_instanciated_domains if p in domain_no_range])

# Relation id -> number of entities belonging to its range class.
predicate_ranges_instances = {i: len(class2id2ent2id[r2id2range2id[i]]) for i in r2id2range2id.keys()}

not_instanciated_ranges = [k for k in predicate_ranges_instances.keys() if predicate_ranges_instances[k] == 0]

not_instanciated_ranges_count = sum([predicate_count[p] for p in not_instanciated_ranges])
not_instanciated_ranges_for_signed_count = sum([predicate_count[p] for p in not_instanciated_ranges if p in signed])
not_instanciated_ranges_for_ranged_count = sum([predicate_count[p] for p in not_instanciated_ranges if p in range_no_domain])

print(
    "These domained predicates have no instances:\n",
    "\t" + "\n\t".join([id2rel_100[p] for p in not_instanciated_domains])
)

print(
    "\n These ranged predicates have no instances:\n",
    "\t" + "\n\t".join([id2rel_100[p] for p in not_instanciated_ranges])
)

print("\nOverlap between these sets?", len([x for x in not_instanciated_ranges if x in not_instanciated_domains]) > 0)

# Each fully signed predicate is counted once, even when both its domain and its range lack
# instances (adding the two per-side sums would count such predicates twice).
not_instanciated_signatures_for_signed_count = sum([
    predicate_count[p]
    for p in signed
    if p in not_instanciated_domains or p in not_instanciated_ranges
])

print(
    "\nImpact on signed triples: ",
    f"{not_instanciated_signatures_for_signed_count} ({percent(not_instanciated_signatures_for_signed_count, signed_count)}% of fully signed triples)"
)

print(
    "Impact on domained triples: ",
    f"{not_instanciated_domains_for_domained_count} ({percent(not_instanciated_domains_for_domained_count, domain_no_range_count)}% of domained triples)"
)

print(
    "Impact on ranged triples: ",
    f"{not_instanciated_ranges_for_ranged_count} ({percent(not_instanciated_ranges_for_ranged_count, range_no_domain_count)}% of ranged triples)"
)

In [None]:
# Layout of the Sankey nodes. Positions are written as small integers and scaled:
# x = x_units / x_divider, y = y_units / y_divider + y_offset.
x_divider=4
y_divider=20
y_offset=0.65

# (label, x_units, y_units); the position in this list is the node id.
node_layout = [
    ('Total', 0, 0),
    ('Full sign', 1, 2),
    ('Incomplete sign', 1, -5),
    ('Half sign', 2, -4),
    ('No sign', 2, -9),
    ('Only domain', 3, 1),
    ('Only range', 3, -7),
    ('No domain instance', 4, 1),
    ('Domain instance', 4, -4),
    ('No range instance', 4, -8),
    ('Range instance', 2, 7),
    ('No sign instance', 2, 3),
    ('Sign instance', 2, 1),
]

nodes = {
    node_id: {
        'label': label,
        'x': x_units/x_divider,
        'y': y_units/y_divider+y_offset
    }
    for node_id, (label, x_units, y_units) in enumerate(node_layout)
}

In [None]:
# Flows of the Sankey diagram, written with node labels: (source label, target label, triples).
labelled_links = [
    ('Total', 'Full sign', signed_count),
    ('Total', 'Incomplete sign', total_count - signed_count),
    ('Full sign', 'No sign instance', not_instanciated_signatures_for_signed_count),
    ('Full sign', 'Sign instance', signed_count - not_instanciated_signatures_for_signed_count),
    ('Incomplete sign', 'No sign', not_signed_count),
    ('Incomplete sign', 'Half sign', total_count - signed_count - not_signed_count),
    ('Half sign', 'Only domain', domain_no_range_count),
    ('Half sign', 'Only range', range_no_domain_count),
    ('Only domain', 'No domain instance', not_instanciated_domains_for_domained_count),
    ('Only domain', 'Domain instance', domain_no_range_count - not_instanciated_domains_for_domained_count),
    ('Only range', 'No range instance', not_instanciated_ranges_for_ranged_count),
    ('Only range', 'Range instance', range_no_domain_count - not_instanciated_ranges_for_ranged_count),
]

# Label -> node id; the first node carrying a label wins.
node_id_by_label = {}
for node_id, node in nodes.items():
    node_id_by_label.setdefault(node["label"], node_id)

# Same flows with labels replaced by node ids, as expected by the Sankey trace.
links = [
    (node_id_by_label[source], node_id_by_label[target], value)
    for source, target, value in labelled_links
]

In [None]:
# Draw the Sankey diagram from `nodes` and `links`, show it, and export it as a PNG.
node_ids = list(nodes.keys())

sankey_nodes = dict(
    pad = 15,
    thickness = 20,
    line = dict(color = "black", width = 0.5),
    label = [nodes[node_id]["label"] for node_id in node_ids],
    x = [nodes[node_id]["x"] for node_id in node_ids],
    y = [nodes[node_id]["y"] for node_id in node_ids],
    color = "blue"
)

sankey_links = dict(
    source = [link[0] for link in links],
    target = [link[1] for link in links],
    value = [link[2] for link in links]
)

fig = go.Figure(data=[go.Sankey(node = sankey_nodes, link = sankey_links)])

# The title names the dataset after the last folder of the destination path.
fig.update_layout(title_text=f"Domain/range coverage over triples of dataset {destination_folder.split('/')[-2]}", font_size=10)
fig.show()
fig.write_image(f"{destination_folder}fig1.png", engine='orca', width=1500, height=450)