In [1]:
import numpy as np
import re
import pandas as pd 
from tqdm import tqdm
import random as rand
import copy 
import seaborn as sns
import matplotlib.pylab as plt
import networkx as nx 
from functools import reduce
import gc
import os
import sys

# Initialisation

In [2]:
# Where the raw data lives and where this run writes its results.
root = "./../Data/"
sub_folder = "TR_Y/"

# Every date handled by the notebook is truncated to this numpy datetime unit.
TEMPORAL_PRECISION = "Y"

# Reference date of the run, expressed at the chosen precision.
TODAY = np.datetime64("2023-05-25", TEMPORAL_PRECISION)

# Facts dated outside [TEMPORAL_LIMIT_START, TEMPORAL_LIMIT_END] are dropped by the readers.
TEMPORAL_LIMIT_START, TEMPORAL_LIMIT_END = (
	np.datetime64("1900-01-01", TEMPORAL_PRECISION),
	np.datetime64("2023-05-25", TEMPORAL_PRECISION),
)

# Default bounds simply mirror the temporal limits.
DEFAULT_START, DEFAULT_END = TEMPORAL_LIMIT_START, TEMPORAL_LIMIT_END

# Define functions

In [3]:
def process_time(time_raw:str, time_precision:str, unknown:str="Unknown") -> np.datetime64:
	"""Convert a raw date string to a numpy datetime at the requested precision.

	When `time_raw` equals the `unknown` marker, the marker itself (a string)
	is handed back unchanged; otherwise the result is
	`np.datetime64(time_raw, time_precision)`.
	"""
	return unknown if time_raw == unknown else np.datetime64(time_raw, time_precision)

def read_quad(file_path:str, time_precision:str, 
			  threshold_start:np.datetime64, threshold_end:np.datetime64,
			   unknown:str="Unknown") -> set:
	"""Load timestamped facts from a tab-separated quad file.

	The last two characters of every line (the closing "." and the newline)
	are dropped, then the line is split on tabs into subject, predicate,
	object field(s) and a final timestamp field.

	A line is kept only if its subject is a Wikidata "Q" entity and its
	timestamp is either `unknown` or lies in [threshold_start, threshold_end].
	Every other line is counted, and the count is printed at the end.

	Returns a set of tuples
	(subject, predicate, object, timestamp, is_object_uri, True); the final
	True flags the fact as coming from the quad file.
	"""
	entity_prefix = "<http://www.wikidata.org/entity/Q"
	removed = 0
	graph = set()

	with open(file_path, "r", encoding="UTF-8") as f_r:
		for line in f_r:
			elts = line[:-2].split("\t") # Remove \n and .
			subject = elts[0]

			# Facts whose subject is not a Q entity are discarded.
			if not subject.startswith(entity_prefix):
				removed += 1
				continue

			predicate = elts[1]
			if elts[2].startswith(entity_prefix):
				# Regular entity-valued object.
				obj, is_object_uri = elts[2], True
			elif re.fullmatch('"b[0-9]+"', elts[2]) is not None:
				# A quoted identifier such as "b12" is turned into a placeholder entity URI.
				obj = f"""<http://www.wikidata.org/entity/Unkonwn_{elts[2][1:-1]}>"""
				is_object_uri = True
			else:
				# Literal value: everything between the predicate and the timestamp.
				obj, is_object_uri = " ".join(elts[2:-1]), False

			timestamp = process_time(elts[-1], time_precision=time_precision, unknown=unknown)
			in_window = (timestamp == unknown) or (threshold_start <= timestamp <= threshold_end)
			if in_window:
				graph.add((subject, predicate, obj, timestamp, is_object_uri, True))
			else:
				removed += 1

	print(f"Number removed : {removed}")
	return graph 

def read_quint(file_path:str, time_precision:str,  
			   threshold_start:np.datetime64, threshold_end:np.datetime64, unknown:str="Unknown") -> set:
	"""Load interval facts from a tab-separated quintuplet file.

	The last two characters of every line (the closing "." and the newline)
	are dropped, then the line is split on tabs into subject, predicate,
	object field(s), a start field and an end field.

	A line is kept only if its subject is a Wikidata "Q" entity and each bound
	is either `unknown` or lies in [threshold_start, threshold_end]. Every
	other line is counted, and the count is printed at the end.

	Returns a set of tuples
	(subject, predicate, object, (start_raw, end_raw), is_object_uri, False).
	The interval holds the raw strings from the file; when both bounds are
	known and the start is strictly later than the end at `time_precision`,
	the two strings are swapped. The final False flags the fact as an interval.
	"""
	entity_prefix = "<http://www.wikidata.org/entity/Q"
	removed = 0
	graph = set()

	with open(file_path, "r", encoding="UTF-8") as f_r:
		for line in f_r:
			elts = line[:-2].split("\t") # Remove \n and .
			s = elts[0]

			# Facts whose subject is not a Q entity are discarded.
			if not s.startswith(entity_prefix):
				removed += 1
				continue

			p = elts[1]
			if elts[2].startswith(entity_prefix):
				o, is_object_uri = elts[2], True
			elif re.fullmatch('"b[0-9]+"', elts[2]) is not None:
				# A quoted identifier such as "b12" is turned into a placeholder entity URI.
				o = f"""<http://www.wikidata.org/entity/Unkonwn_{elts[2][1:-1]}>"""
				is_object_uri = True
			else:
				# The two trailing fields are the interval bounds, so the literal
				# stops before them (elts[2:-1] would glue the start date onto it).
				o = " ".join(elts[2:-2])
				is_object_uri = False

			raw_start, raw_end = elts[-2], elts[-1]
			start = process_time(raw_start, time_precision=time_precision, unknown=unknown)
			end = process_time(raw_end, time_precision=time_precision, unknown=unknown)

			# process_time returns the marker exactly when the raw field equals it,
			# so the raw strings tell us which bounds are known without comparing
			# a datetime64 to a str.
			start_known = raw_start != unknown
			end_known = raw_end != unknown

			# Swap only when the bounds are really inverted. Bounds that are equal
			# at the chosen precision (e.g. two dates of the same year with "Y")
			# keep their original order.
			if start_known and end_known and start > end:
				timestamp = (raw_end, raw_start)
			else:
				timestamp = (raw_start, raw_end)

			start_ok = (not start_known) or (threshold_start <= start <= threshold_end)
			end_ok = (not end_known) or (threshold_start <= end <= threshold_end)
			if start_ok and end_ok:
				graph.add((s, p, o, timestamp, is_object_uri, False))
			else:
				removed += 1

	print(f"Number removed : {removed}")
	return graph

def indexed_graph(list_of_graph:list): 
	"""Index the facts of several graphs by the entities they involve.

	Returns a dict mapping a short entity id (e.g. "Q42") to
	[[entity_facts, other_facts], incoming_facts] where, for that entity:
	  - entity_facts: facts it is the subject of whose object is a Wikidata "Q" URI,
	  - other_facts: facts it is the subject of with any other kind of object,
	  - incoming_facts: facts in which it is the "Q" URI object.
	Facts whose subject id does not start with "Q" are ignored.
	"""
	namespace_wikidata = "<http://www.wikidata.org/entity/Q"
	indexed_entity = dict()

	def entry_of(entity_id):
		# Fetch the record of an entity, creating an empty one on first sight.
		return indexed_entity.setdefault(entity_id, [[set(), set()], set()])

	for g in list_of_graph:
		for fact in g:
			s, p, o, t, is_object_uri, is_quad = fact

			s_reduced = s[:-1].split("/")[-1]
			if s_reduced[0] != "Q":
				continue

			if is_object_uri and o.startswith(namespace_wikidata):
				# Entity-to-entity fact: register it on both endpoints.
				entry_of(s_reduced)[0][0].add(fact)
				entry_of(o[:-1].split("/")[-1])[1].add(fact)
			else:
				# Literal or non-"Q" object: only the subject is indexed.
				entry_of(s_reduced)[0][1].add(fact)

	return indexed_entity

def return_entities(list_of_graph:list): 
	"""Collect the short ids of the entities mentioned in several graphs.

	For every fact whose subject id starts with "Q", the subject id is
	collected; the object id is collected as well when the object is flagged
	as a URI and is a Wikidata "Q" URI. Returns the ids as a set.
	"""
	namespace_wikidata = "<http://www.wikidata.org/entity/Q"
	entities = set()

	for fact in (f for g in list_of_graph for f in g):
		s, p, o, t, is_object_uri, is_quad = fact

		s_reduced = s[:-1].split("/")[-1]
		if s_reduced[0] != "Q":
			continue

		entities.add(s_reduced)
		if is_object_uri and o.startswith(namespace_wikidata):
			entities.add(o[:-1].split("/")[-1])

	return entities

def load_labels(file_path:str) -> dict[str, str]:
	"""Read a two-column, tab-separated label file into a dict.

	Each non-empty line is "<entity-uri>\\t<label>". The key is the last path
	segment of the first column with its final character removed (the closing
	">" of the URI); the value is the rest of the line after the first tab.

	Only the line terminator is stripped, so the last label stays intact even
	when the file does not end with a newline. Blank lines are skipped.

	Raises ValueError on a non-empty line that contains no tab.
	"""
	labels_per_class = dict()
	with open(file_path, "r", encoding="UTF-8") as f_r:
		for line in f_r:
			line = line.rstrip("\n")
			if not line:
				continue
			# Split on the first tab only so a label may itself contain tabs.
			q, label = line.split("\t", 1)
			labels_per_class[q[:-1].split("/")[-1]] = label
	return labels_per_class

def select_seeds_bottom_percent(threshold:int, 
								percentage:float, 
								n_entities:int, 
								df_entities_per_c:pd.DataFrame, 
								ents_per_classes_and_sub_classes:dict):
	"""Draw exactly `n_entities` random seeds from each selected class.

	Classes are first restricted to those whose "Count" is at least
	`threshold`. The "Count" found `round(len * percentage)` rows before the
	end of that selection is used as a cut-off, and every class whose "Count"
	is at most this cut-off is selected.
	NOTE(review): this only picks the least populated classes if
	`df_entities_per_c` is sorted by "Count" in descending order - confirm
	against the caller.

	Returns the union of the sampled entities as a set. The set is empty when
	no class passes the threshold or when the requested share rounds to zero.

	Raises ValueError (from `random.sample`) if a selected class holds fewer
	than `n_entities` entities.
	"""
	# Retrieve the classes 
	above_threshold = df_entities_per_c[df_entities_per_c["Count"] >= threshold]
	nb_classes = len(above_threshold)
	# Clamp to [0, nb_classes] so the positional lookup below stays in range.
	nb_selected = min(nb_classes, max(0, round(nb_classes * percentage)))
	if nb_selected == 0:
		return set()
	value_below = above_threshold.iloc[nb_classes - nb_selected]["Count"]
	selected_classes = above_threshold[above_threshold["Count"] <= value_below].index

	# Retrieve the entities per classes 
	seeds = set()
	for c in selected_classes:
		seeds.update(rand.sample(list(ents_per_classes_and_sub_classes[c]), n_entities))

	return seeds

def select_seeds_uniformly_abs(threshold, percentage, n_entities, df_entities_per_c, ents_per_classes_and_sub_classes):
	"""Draw at most `n_entities` random seeds from each selected class.

	Classes are first restricted to those whose "Count" is at least
	`threshold`. The "Count" found `round(len * percentage)` rows before the
	end of that selection is used as a cut-off, and every class whose "Count"
	is at most this cut-off is selected.
	NOTE(review): this only picks the least populated classes if
	`df_entities_per_c` is sorted by "Count" in descending order - confirm
	against the caller.

	For each selected class the sample size is the smallest of `n_entities`,
	the class "Count" and the number of entities actually available.

	Returns the union of the sampled entities as a set. The set is empty when
	no class passes the threshold or when the requested share rounds to zero.
	"""
	# Retrieve the classes 
	above_threshold = df_entities_per_c[df_entities_per_c["Count"] >= threshold]
	nb_classes = len(above_threshold)
	# Clamp to [0, nb_classes] so the positional lookup below stays in range.
	nb_selected = min(nb_classes, max(0, round(nb_classes * percentage)))
	if nb_selected == 0:
		return set()
	value_below = above_threshold.iloc[nb_classes - nb_selected]["Count"]
	selected_classes = above_threshold[above_threshold["Count"] <= value_below].index

	# Retrieve the entities per classes 
	seeds = set()
	for c in selected_classes:
		candidates = list(ents_per_classes_and_sub_classes[c])
		# Never ask for more entities than the class really holds.
		sample_size = min(n_entities, int(df_entities_per_c.loc[c]["Count"]), len(candidates))
		seeds.update(rand.sample(candidates, sample_size))

	return seeds

def select_seeds_uniformly_percent_entities(threshold, percentage, p_entities, df_entities_per_c, ents_per_classes_and_sub_classes):
	"""Draw a share `p_entities` of the entities of each selected class.

	Classes are first restricted to those whose "Count" is at least
	`threshold`. The "Count" found `round(len * percentage)` rows before the
	end of that selection is used as a cut-off, and every class whose "Count"
	is at most this cut-off is selected.
	NOTE(review): this only picks the least populated classes if
	`df_entities_per_c` is sorted by "Count" in descending order - confirm
	against the caller.

	For each selected class the sample size is `round(Count * p_entities)`,
	raised to at least 1 and capped by the number of entities actually
	available.

	Returns the union of the sampled entities as a set. The set is empty when
	no class passes the threshold or when the requested share rounds to zero.
	"""
	# Retrieve the classes 
	above_threshold = df_entities_per_c[df_entities_per_c["Count"] >= threshold]
	nb_classes = len(above_threshold)
	# Clamp to [0, nb_classes] so the positional lookup below stays in range.
	nb_selected = min(nb_classes, max(0, round(nb_classes * percentage)))
	if nb_selected == 0:
		return set()
	value_below = above_threshold.iloc[nb_classes - nb_selected]["Count"]
	selected_classes = above_threshold[above_threshold["Count"] <= value_below].index

	# Retrieve the entities per classes 
	seeds = set()
	for c in selected_classes:
		candidates = list(ents_per_classes_and_sub_classes[c])
		wanted = max(1, round(df_entities_per_c.loc[c]["Count"] * p_entities))
		# Never ask for more entities than the class really holds.
		seeds.update(rand.sample(candidates, min(int(wanted), len(candidates))))

	return seeds

def compute_comparison_post_section_seeds(seeds, ents_per_classes_and_sub_classes, df_entities_per_c=None):
	"""Compare, class by class, how many entities were picked as seeds.

	Parameters
	----------
	seeds : set of selected entity ids.
	ents_per_classes_and_sub_classes : dict mapping a class id to the set of
		its entity ids.
	df_entities_per_c : DataFrame indexed by class id with a "Count" column.
		When omitted, the notebook-level variable of the same name is used,
		which is what this function relied on implicitly before.

	Returns
	-------
	DataFrame indexed by class id with columns "Population" (number of seeds
	in the class), the columns of `df_entities_per_c`, and "Percent"
	(Population / Count * 100, or 0 when Count is not positive).

	Raises NameError when `df_entities_per_c` is neither passed nor defined
	at notebook level.
	"""
	if df_entities_per_c is None:
		# Backward compatibility with callers that rely on the global variable.
		if "df_entities_per_c" not in globals():
			raise NameError("df_entities_per_c must be passed explicitly or defined at notebook level")
		df_entities_per_c = globals()["df_entities_per_c"]

	population_of_classes_post_selection = {
		c: len(members.intersection(seeds)) for c, members in ents_per_classes_and_sub_classes.items()
	}

	comparison = (
		pd.DataFrame.from_dict(population_of_classes_post_selection, orient="index", columns=["Population"])
		.sort_values(by="Population", ascending=False)
		.merge(df_entities_per_c, left_index=True, right_index=True)
	)
	# Vectorised ratio; rows with a non-positive Count get 0 rather than inf/NaN.
	counts = comparison["Count"]
	comparison["Percent"] = (comparison["Population"] / counts * 100).where(counts > 0, 0)
	return comparison

def retrieve_next_hops(seeds, already_seen, graph):
	"""Extract the facts that touch the seeds and list the nodes met doing so.

	A fact with a URI object is kept when its subject id or its object id is
	in `seeds`; any other fact is kept when its subject id is in `seeds`.
	For every kept fact, each of its considered node ids that is not in
	`already_seen` is added to the frontier.

	Returns (kept_facts, frontier), both as sets.
	"""
	def short_id(uri):
		# "<http://.../Q42>" -> "Q42"
		return uri[:-1].split("/")[-1]

	kept_facts = set()
	frontier = set()

	for fact in graph:
		subject_uri, _, object_uri, _, is_object_uri, _ = fact

		# Only URI objects count as graph nodes.
		nodes = [short_id(subject_uri)]
		if is_object_uri:
			nodes.append(short_id(object_uri))

		if not any(node in seeds for node in nodes):
			continue

		kept_facts.add(fact)
		frontier.update(node for node in nodes if node not in already_seen)

	return kept_facts, frontier

def extract_graph_from_seeds(seeds, graph_to_consider, nb_hops, verbose=1):
	"""Grow a sub-graph around `seeds` by repeating `retrieve_next_hops`.

	At each of the `nb_hops` iterations the facts touching the current seeds
	are added to the extracted graph, the seeds are recorded as seen, and the
	returned frontier becomes the next set of seeds. When `verbose` is truthy
	the share (in percent) of `graph_to_consider` extracted so far is printed
	after every hop.

	Returns (extracted_graph, to_explore, already_seen). With `nb_hops == 0`
	nothing is extracted and `to_explore` is a copy of the initial seeds.

	NOTE(review): `already_seen` is updated only after each hop, so the
	frontier returned by a hop may still contain that hop's own seeds -
	confirm whether this is intended.
	"""
	already_seen = set()
	extracted_graph = set() 
	# Defined up front so the return below is valid even when no hop is run.
	to_explore = set(seeds)

	for _ in range(nb_hops):
		hop_graph, to_explore = retrieve_next_hops(seeds=seeds, already_seen=already_seen, graph=graph_to_consider)
		extracted_graph.update(hop_graph)
		del hop_graph
		already_seen.update(seeds)
		seeds = to_explore
		# An empty graph has no meaningful coverage ratio; skip it instead of dividing by zero.
		if verbose and len(graph_to_consider) > 0:
			print(len(extracted_graph)/len(graph_to_consider)*100)

	return extracted_graph, to_explore, already_seen

def find_the_degree_of_nodes_to_explore(to_explore, extracted_graph):
	"""Count how many extracted facts touch each node still to explore.

	A node gets one count each time it is the subject of a fact, and one each
	time it is the object of a fact whose object is flagged as a URI.

	Returns a pandas Series indexed by node id, restricted to the ids that
	start with "Q".
	"""
	degree = dict.fromkeys(to_explore, 0)

	for fact in extracted_graph:
		subject_uri, _, object_uri, _, is_object, _ = fact

		# Literal objects are not nodes and are left out.
		endpoints = [subject_uri]
		if is_object:
			endpoints.append(object_uri)

		for uri in endpoints:
			node = uri[:-1].split("/")[-1]
			if node in degree:
				degree[node] += 1

	return pd.Series({node: count for node, count in degree.items() if node[0] == "Q"})
#vc = pd.Series({c:v for c,v in population_of_to_be_seen_next.items() if c[0] == "Q"}).value_counts()
#sum(vc.iloc[[i >= 2 for i in vc.index]].values)/sum(vc.values)

def find_the_population_of_each_explored_class(already_seen, df_entities_per_c, ents_per_classes_and_sub_classes):
	"""Report, class by class, how many entities have been explored.

	Returns a DataFrame indexed by class id, sorted by "Count" in descending
	order, with columns "Population" (entities of the class found in
	`already_seen`), the columns of `df_entities_per_c`, "Percent"
	(Population / Count * 100, or 0 when Count is not positive) and "Label"
	(read from the class label file under `root`).
	"""
	# Only the fully explored nodes (i.e. to_be_explored) does not count
	population_of_classes_post_extraction = {}
	for c, members in ents_per_classes_and_sub_classes.items():
		population_of_classes_post_extraction[c] = len(members.intersection(already_seen))

	population_df = pd.DataFrame.from_dict(population_of_classes_post_extraction, orient="index", columns=["Population"])
	comparison = population_df.merge(df_entities_per_c, left_index=True, right_index=True)
	comparison = comparison.sort_values(by="Count", ascending=False)
	comparison["Percent"] = comparison.apply(lambda x: x["Population"]/x["Count"]*100 if x["Count"] > 0 else 0, axis=1)

	# Attach the human-readable label of every class.
	labels_df = pd.DataFrame.from_dict(load_labels(f"{root}Raw/res_classes_label.nt"), orient="index", columns=["Label"])
	return comparison.merge(labels_df, right_index=True, left_index=True)

def recursivly_retrieve_data(c_initial:str, is_upper_class_of:dict, ents_per_classes:dict, already_computed:dict):
	"""Gather the entities of a class together with those of its sub-classes.

	Parameters
	----------
	c_initial : id of the class to resolve.
	is_upper_class_of : dict mapping a class id to the collection of its
		direct sub-class ids.
	ents_per_classes : dict mapping a class id to the set of its own entities.
	already_computed : memo dict, updated in place with the result of every
		class that has sub-classes; pass the same dict across calls to reuse
		earlier work.

	Returns
	-------
	The set of entities of `c_initial` and of every class reachable below it.
	For a class without sub-classes this is the set stored in
	`ents_per_classes` itself (not a copy), or an empty set if it has none.

	A class already on the current descent path contributes nothing, which
	stops cycles in the hierarchy.
	NOTE(review): results memoised while a cycle is being cut can be partial
	for the classes inside that cycle - confirm whether this matters.
	"""

	def inner_function(current:str, already_checked:frozenset):
		if current in already_computed:
			return already_computed[current]
		if current in already_checked:
			# Already on the descent path: cut the cycle.
			return set()
		if current not in is_upper_class_of:
			return ents_per_classes[current] if current in ents_per_classes else set()

		# Track whole class ids only (unioning a str would add its characters).
		path = already_checked | {current}
		collected = set(ents_per_classes.get(current, ()))
		# A plain loop also copes with a class that has an empty sub-class collection.
		for sub_class in is_upper_class_of[current]:
			collected |= inner_function(sub_class, path)

		already_computed[current] = collected
		return collected

	return inner_function(c_initial, frozenset())

def return_split_graph(graph:set):
	"""Split a graph according to the kind of object each fact carries.

	The fifth field of a fact tells whether its object is a URI. Returns
	(dp, op): the set of facts for which that field is falsy, then the set of
	facts for which it is truthy.
	"""
	# Position 0 collects the falsy-flag facts, position 1 the truthy-flag ones.
	buckets = (set(), set())
	for fact in graph:
		buckets[1 if fact[4] else 0].add(fact)
	dp, op = buckets
	return dp, op

def create_folder(directory:str):
	"""Create `directory` (and any missing parent) unless the path already exists."""
	if os.path.exists(directory):
		return
	os.makedirs(directory)

def create_indexes(graph:set):
	"""Assign consecutive integer ids to the entities and relations of a graph.

	Ids start at 0 and follow the order in which names are first met while
	iterating over `graph`. Subjects are always indexed as entities; objects
	only when the fact flags its object as a URI.

	Returns (index_entity, index_relation), two dicts mapping a short name
	to its id.
	"""
	def short_name(uri):
		# "<http://.../Q42>" -> "Q42"
		return uri.split("/")[-1][:-1]

	index_entity, index_relation = dict(), dict()

	for head, relation, tail, time, is_object, is_quad in graph:
		# The next free id always equals the current size of the index.
		index_entity.setdefault(short_name(head), len(index_entity))
		index_relation.setdefault(short_name(relation), len(index_relation))
		if is_object:
			index_entity.setdefault(short_name(tail), len(index_entity))

	return index_entity, index_relation


# Read data

In [4]:
# Facts carrying a single timestamp.
g_quad = read_quad(
	file_path=f"{root}Raw/result_timestamps.quad",
	time_precision=TEMPORAL_PRECISION,
	threshold_start=TEMPORAL_LIMIT_START,
	threshold_end=TEMPORAL_LIMIT_END,
)
# Facts carrying a (start, end) interval.
g_quint = read_quint(
	file_path=f"{root}Raw/result_intervals.quintuplet",
	time_precision=TEMPORAL_PRECISION,
	threshold_start=TEMPORAL_LIMIT_START,
	threshold_end=TEMPORAL_LIMIT_END,
)

  return np.datetime64(time_raw, time_precision)


Number removed : 260698
Number removed : 767659


In [5]:
#entities = return_entities([g_quad, g_quint])

#classes_per_ents = dict()
#ents_per_classes = dict()
#with open(f"{root}Raw/res_classes.nt", "r", encoding="UTF-8") as f_r:
#	line = f_r.readline()

#	while line != "":
		
#		s,_,o = line[:-2].split("\t")
#		s = s[1:-1].split("/")[-1]
#		o = o[1:-1].split("/")[-1]

#		if s in entities:
#			if s not in classes_per_ents:
#				classes_per_ents[s] = set()
#			classes_per_ents[s].add(o)

#			if o not in ents_per_classes:
#				ents_per_classes[o] = set()
#			ents_per_classes[o].add(s)

#		line = f_r.readline()
	
#del entities

In [6]:
#is_upper_class_of = dict()
#with open(f"{root}Raw/Hierarchy_classes.nt", "r", encoding="UTF-8") as f_r:
#	line = f_r.readline()
#	while line != "":
#		s,_,o = line[:-2].split("\t")
#		s = s[:-1].split("/")[-1]
#		o = o[:-1].split("/")[-1]
#		if s[0]=="Q" and o[0]=="Q":
#			if s not in is_upper_class_of:
#				is_upper_class_of[s] = set()
#			is_upper_class_of[s].add(o)
#		line = f_r.readline()

In [7]:
#elts_per_c = dict()
#for c in tqdm(list(ents_per_classes.keys())):
#	elts_per_c[c] = recursivly_retrieve_data(c, is_upper_class_of=is_upper_class_of, ents_per_classes=ents_per_classes, already_computed=elts_per_c)


#ids = {}
#with open("./SaveRecursion/ids", "w", encoding="UTF-8") as f_w:

#	for i, e in enumerate(classes_per_ents):
#		f_w.write(f"{i}\t{e}\n")
#		ids[e] = i

#with open("./SaveRecursion/res_recursion", "w", encoding="UTF-8") as f_w:
#	for c in elts_per_c:
#		f_w.write(f"{c}\t")
#		for e in elts_per_c[c]:
#			f_w.write(f"{ids[e]} ")
#		f_w.write("\n")

#del elts_per_c

# Store the temp data

In [8]:
# Merge both fact sets, then separate literal-valued facts (DP) from entity-valued ones (OP).
ex_dp, ex_op = return_split_graph(g_quad | g_quint)
len(ex_dp), len(ex_op)

create_folder(f"{root}{sub_folder}")

# Description of the run, stored next to the generated files.
with open(f"{root}{sub_folder}metadata.txt", "w", encoding="UTF-8") as f_w:
	f_w.writelines([
		f"Sampling method : Uniform_sampling_abs\n",
		f"Temporal granularity : {TEMPORAL_PRECISION}\n",
		f"Temporal limit START : {TEMPORAL_LIMIT_START}\n",
		f"Temporal limit END : {TEMPORAL_LIMIT_END}\n",
		f"Number timestamped facts : {len(g_quad)}\n",
		f"Number interval facts : {len(g_quint)}\n",
	])

create_folder(f"{root}{sub_folder}Temporary/")

# Entity-valued facts: head, relation and object are all reduced to their short names.
with open(f"{root}{sub_folder}Temporary/OP_prepro.nt", "w", encoding="UTF-8") as f_w:
	for h, r, o, time, _, is_quad in ex_op:
		h, r, o = (uri.split("/")[-1][:-1] for uri in (h, r, o))
		if is_quad:
			# Single timestamp.
			f_w.write(f"{h}\t{r}\t{o}\t{time}.\n")
		else:
			# Interval: start and end are written as two columns.
			f_w.write(f"{h}\t{r}\t{o}\t{time[0]}\t{time[1]}.\n")

# Literal-valued facts: the object is written as is.
with open(f"{root}{sub_folder}Temporary/DP_prepro.nt", "w", encoding="UTF-8") as f_w:
	for h, r, o, time, _, is_quad in ex_dp:
		h, r = (uri.split("/")[-1][:-1] for uri in (h, r))
		if is_quad:
			f_w.write(f'{h}\t{r}\t{o}\t{time}.\n')
		else:
			f_w.write(f'{h}\t{r}\t{o}\t{time[0]}\t{time[1]}.\n')