In [1]:
import gzip
import pandas as pd
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import OWL
import re

In [2]:
# Namespace for property IRIs under https://makg.org/property/; used below to
# look up the `level` property of each entity.
ma_prop = Namespace('https://makg.org/property/')

In [3]:
# Load the gzipped Turtle dump into an in-memory graph.
# The stream is decoded explicitly as UTF-8 instead of the platform's locale
# default, and the RDF format is passed explicitly because the file name ends in
# `.gz`, so it should not be left to format guessing or a library default.
g = Graph()
with gzip.open('data/input.ttl.gz', 'rt', encoding='utf-8') as fp:
    g.parse(fp, format='turtle')
# Report the number of triples loaded as a quick sanity check.
print(len(g))

http://dbpedia.org/resource/Optimum_"L"_filter does not look like a valid URI, trying to serialize this will break.
http://dbpedia.org/resource/W^X does not look like a valid URI, trying to serialize this will break.
http://dbpedia.org/resource/Capitalization_of_"Internet" does not look like a valid URI, trying to serialize this will break.
http://dbpedia.org/resource/On_the_Origin_of_the_"Influencing_Machine"_in_Schizophrenia does not look like a valid URI, trying to serialize this will break.


721710


In [4]:
# Map every subject that carries a `level` property to a one-element list with
# that level's Python value. The keys are the subject IRIs as plain strings; the
# list wrapper lets each entry become a single-column row in a DataFrame.
# NOTE(review): `.value` assumes the objects are literals - confirm.
# If a subject has several `level` triples, the last one iterated wins.
ent_to_level = dict(
    (str(subject), [level_literal.value])
    for subject, level_literal in g.subject_objects(ma_prop.level)
)

In [5]:
# Build the entity table: one row per entity, ordered by level, with a fresh
# 1-based sequential identifier. Resulting columns: new_id, entity, level.
df_ents = (
    pd.DataFrame.from_dict(ent_to_level, orient='index', columns=['level'])
    .sort_values('level')
    # Move the entity IRIs out of the index into a regular column ...
    .reset_index(names='entity')
    # ... then turn the positional 0..n-1 index into the identifier column.
    .reset_index(names='new_id')
    # Identifiers start at 1 rather than 0.
    .assign(new_id=lambda frame: frame['new_id'] + 1)
)

In [6]:
# Lookup table from each original entity IRI to its replacement IRI, which is
# the fixed prefix below followed by "C" and the entity's new identifier.
iri_map = {
    entity: f'https://w3id.org/ocs/ont/C{new_id}'
    for entity, new_id in zip(df_ents['entity'], df_ents['new_id'])
}

In [7]:
# Produce the output dump by rewriting the input as text, line by line: every
# https://makg.org/entity/<digits> IRI is replaced by its entry in `iri_map`.
# NOTE(review): presumably done textually (not via Graph.serialize) because the
# parser warned that some IRIs in the input cannot be serialized - confirm.
#
# Both streams are opened explicitly as UTF-8 so the result does not depend on
# the platform's locale encoding.
entity_iri_re = re.compile(r'https://makg\.org/entity/\d+')
with gzip.open('data/input.ttl.gz', 'rt', encoding='utf-8') as f_in, \
        gzip.open('data/output.ttl.gz', 'wt', encoding='utf-8') as f_out:
    for line in f_in:
        # An IRI missing from `iri_map` raises KeyError on purpose: silently
        # keeping it would leave a mix of old and new identifiers in the output.
        f_out.write(entity_iri_re.sub(lambda m: iri_map[m.group(0)], line))

In [8]:
# Build a separate graph that links each new IRI back to the IRI it replaces,
# with one `new owl:sameAs old` triple per entry of `iri_map`.
g_map = Graph()
for source_iri, minted_iri in iri_map.items():
    same_as_triple = (URIRef(minted_iri), OWL.sameAs, URIRef(source_iri))
    g_map.add(same_as_triple)

In [10]:
# Write the owl:sameAs mapping graph to a gzipped file.
# The file is named `.ttl.gz`, so Turtle is requested explicitly rather than
# depending on the library's default serialization format.
with gzip.open('data/mapping.ttl.gz', 'wb') as fp:
    g_map.serialize(destination=fp, format='turtle')