# This script transforms a .ttl file whose URIs contain ids into a .txt file in the "G-Care" format


In [33]:
# From: https://github.com/DE-TUM/GNCE
import os
import json
import random
from tqdm import tqdm

In [34]:
# Global accumulators: every distinct entity (graph vertex) and every distinct
# predicate encountered while scanning the graph file ends up in these sets.
vertices, predicates = set(), set()

In [35]:
# Name of the dataset to convert. It is used as the key for looking up the
# rdf:type URI and as the folder name of the output files.
dataset = "yago"
# File-name stem under which the resulting G-CARE graph and the id mappings are saved.
gcare_graph_savename = "yago"

In [36]:
from src.utils.validation_utils.utils_prepare_gcare_inputs import dataset_to_g_care

# Look up which URI plays the role of rdf:type in the selected dataset.
try:
    rdf_type_uri = dataset_to_g_care[dataset]
except KeyError as err:
    # Same exception type as before, but the message now names the dataset
    # that has no entry instead of showing only the bare key.
    raise KeyError(
        f"no rdf:type URI is registered for dataset {dataset!r} in dataset_to_g_care"
    ) from err

# An empty/None entry is just as unusable as a missing one: without the URI
# no vertex can be given its type labels, so stop before any file is read.
if not rdf_type_uri:
    raise AssertionError("rdf type uri missing !")

# Input graph. The location is derived from `dataset`, like the output paths
# below, so that switching `dataset` also switches the file that is read
# (for dataset == "yago" this is exactly the previously hard-coded path).
# NOTE(review): assumes every dataset folder follows the graph/<dataset>.nt
# layout - adjust the extension here for .ttl inputs.
dataset_location = r"C:\Users\ruben\Downloads\datasets_used_gcne\{}\graph\{}.nt".format(dataset, dataset)
# G-CARE graph file produced by this script.
output_location = r"C:\Users\ruben\Downloads\datasets_used_gcne\{}\{}.txt".format(dataset, gcare_graph_savename)
# JSON dump of the entity -> integer vertex id mapping.
output_id_to_id = r"C:\Users\ruben\Downloads\datasets_used_gcne\{}\id_to_id_{}.json".format(dataset, gcare_graph_savename)
# JSON dump of the predicate -> integer edge label mapping.
output_id_to_id_predicate = r"C:\Users\ruben\Downloads\datasets_used_gcne\{}\id_to_id_predicate_{}.json".format(dataset, gcare_graph_savename)

In [37]:
# OPTIONAL: define a set of entities to exclude from the graph. Those won't be stored as vertices,
# and edges that touch them are not stored either.
excluded_entities = set()

# Disabled example: fill `excluded_entities` from an inductive test set.
# excluded_query_type = 'star'
# In this case, we open the inductive (disjoint) test set for the graph.
# Only the objects of the query triples are added, as the subjects are always variables,
# and the predicates are assumed to be known.


# with open(f"/home/tim/Datasets/{dataset}/{excluded_query_type}/disjoint_test.json") as f:
#     test_data = json.load(f)
    
# for query in test_data:
#     excluded_entities.update([a[2] for a in query["triples"] if not "?" in a[2]])

In [38]:
# Notebook check: display how many entities are going to be excluded.
len(excluded_entities)

0

In [39]:
# Collect all entities (vertices) and predicates that occur in the graph file.
#
# Entities contained in `excluded_entities` are kept out of `vertices`, no
# matter whether they occur as subject or as object. Excluded entities that
# were actually seen in object position are recorded in `n_excluded_entites`
# so that a later check can compare them against `excluded_entities`.
#
# NOTE(review): each line is split on single spaces and the last token is
# dropped (presumably the terminating "." of an N-Triples line). An object
# literal that itself contains spaces would therefore be cut at its first
# space - confirm that the input only has space-free terms.
n_excluded_entites = set()
l = 0  # number of non-blank lines processed
# `with` closes the file even if a malformed line raises halfway through.
with open(dataset_location, "r") as ttl_file:
    for line in tqdm(ttl_file):
        atoms = line.split(" ")[:-1]
        if not atoms:
            # Blank line: nothing to index (would otherwise raise IndexError).
            continue
        l += 1
        # The subject is tested against the exclusion set itself, not the object.
        if atoms[0] not in excluded_entities:
            vertices.add(atoms[0])
        if atoms[2] not in excluded_entities:
            vertices.add(atoms[2])
        else:
            n_excluded_entites.add(atoms[2])
        predicates.add(atoms[1])

print('Finished collecting vertices and predicates')
print(f'Excluded {len(n_excluded_entites)} entities')

58276870it [00:58, 991935.38it/s] 

Finished collecting vertices and predicates
Excluded 0 entities





In [40]:
# Every entity that should be excluded is expected to have been encountered as
# an object while scanning the graph. Raised explicitly (instead of `assert`)
# so the check also runs under `python -O` and reports both counts.
if len(excluded_entities) != len(n_excluded_entites):
    raise AssertionError(
        f"{len(excluded_entities)} entities were marked for exclusion, "
        f"but {len(n_excluded_entites)} of them were found in the graph"
    )

In [41]:
# Lookup tables from the original URIs to the integer ids used in the G-CARE
# file: one for entities (vertices) and one for predicates (edge labels).
id_to_id_mapping, id_to_id_mapping_predicate = {}, {}

In [42]:
# Build the vertex dictionary and the entity id mapping.
#
# Every vertex gets a consecutive integer id starting at 0. `vertex_dict` maps
# the entity to a list whose first element is that id (further elements are
# appended later as labels); `id_to_id_mapping` maps the entity to the bare id.
vertex_dict = {}
for vid, vertex in enumerate(tqdm(vertices)):
    vertex_dict[vertex] = [vid]
    id_to_id_mapping[vertex] = vid
# Keep `vid` at "next free id" (== number of vertices), as the manual counter did.
vid = len(vertex_dict)

100%|██████████| 13000080/13000080 [00:20<00:00, 635316.08it/s] 


In [43]:
# Number the predicates consecutively from 0 and remember the assignment.
for pid, p in enumerate(predicates):
    id_to_id_mapping_predicate[p] = pid
# After the loop `pid` holds the number of predicates (the next unused id).
pid = len(predicates)

In [44]:
# Add vertex type labels.
#
# Second pass over the graph: for every kept triple whose predicate is the
# rdf:type URI, the id of the type vertex is appended to the label list of the
# subject vertex.
with open(dataset_location, "r") as ttl_file:  # closed automatically
    for line in tqdm(ttl_file):
        atoms = line.split(" ")[:-1]
        if not atoms:
            # Blank line: nothing to process.
            continue
        if atoms[0] in excluded_entities or atoms[2] in excluded_entities:
            # Triples touching an excluded entity contribute no labels.
            continue
        if atoms[1] == rdf_type_uri:
            # Append only the id of the type vertex. Extending with the type's
            # whole list would also copy whatever labels the type vertex has
            # collected so far, which depends on the order of the triples in
            # the file (and doubles the list for a self-typed vertex).
            vertex_dict[atoms[0]].append(vertex_dict[atoms[2]][0])


58276870it [00:56, 1026717.87it/s]


In [45]:
# Vertices without any type still need a label: a list that holds nothing but
# the vertex id gets the default label 0 appended.
# NOTE(review): 0 is also the id of the first vertex, so the default label is
# indistinguishable from "typed with vertex 0" - confirm this is acceptable.
for labels in vertex_dict.values():
    if len(labels) == 1:
        labels.append(0)

In [46]:
# Create the edge list.
#
# Third pass over the graph: every triple whose subject and object are both
# kept becomes one edge [subject id, object id, predicate id]; triples that
# touch an excluded entity are counted in `n_skipped_edges` and dropped.
n_skipped_edges = 0
edge_list = []
with open(dataset_location, "r") as ttl_file:  # closed automatically
    for line in tqdm(ttl_file):
        tp = line.split(" ")[:-1]
        if not tp:
            # Blank line: neither an edge nor a skipped edge.
            continue
        if tp[0] in excluded_entities or tp[2] in excluded_entities:
            n_skipped_edges += 1
            continue
        edge_list.append(
            [vertex_dict[tp[0]][0], vertex_dict[tp[2]][0], id_to_id_mapping_predicate[tp[1]]]
        )
print('Finished creating edge list')
print(f'Dropped a total of {n_skipped_edges} edges')

58276870it [01:58, 492362.73it/s]

Finished creating edge list
Dropped a total of 0 edges





In [47]:
# Write the graph file: a "t # 1" header line, then one "v <id> <labels...>"
# line per vertex, then one "e <source id> <target id> <predicate id>" line
# per edge.
with open(output_location, "w") as f:
    f.write("t # 1\n")
    for labels in tqdm(vertex_dict.values()):
        # Each entry of the list (vertex id first, then its labels) is
        # written with a leading space.
        f.write("v" + "".join(f" {entry}" for entry in labels) + "\n")
    for edge in tqdm(edge_list):
        f.write(f"e {edge[0]} {edge[1]} {edge[2]}\n")

100%|██████████| 13000080/13000080 [00:50<00:00, 257930.04it/s]
100%|██████████| 58276870/58276870 [01:26<00:00, 675096.53it/s]


In [48]:
# Persist both id mappings (entities and predicates) as JSON so that queries
# can later be translated to the same integer ids.
for target_path, mapping in (
    (output_id_to_id, id_to_id_mapping),
    (output_id_to_id_predicate, id_to_id_mapping_predicate),
):
    with open(target_path, "w") as f:
        json.dump(mapping, f)