In [1]:
import numpy as np
import os
import random as rand
from tqdm import tqdm 
import re 
import TemporalRepresentation as tp
import traceback

# Read raw data

In [2]:
 
def process_time(time_raw:str, time_precision:str, Unknown:str="Unknown") -> np.datetime64:
	"""
	Turn the text of a time field into a numpy datetime, letting the sentinel through.

	:param time_raw: Text of the time field.
	:param time_precision: numpy datetime unit code handed to ``np.datetime64``.
	:param Unknown: Sentinel string that marks a missing time.
	:return: ``Unknown`` itself (a ``str``) when ``time_raw`` equals it, otherwise
		``np.datetime64(time_raw, time_precision)``.
	"""
	is_missing = time_raw == Unknown
	return Unknown if is_missing else np.datetime64(time_raw, time_precision)

def read_data(path:str, type_data:str="OP", time_precision:str="D", Unknown:str="Unknown") -> set:
	"""
	Read a tab-separated triple file into a set of tuples.

	Each line is expected to look like ``h<TAB>r<TAB>v<TAB>time.`` or
	``h<TAB>r<TAB>v<TAB>start<TAB>end.``. The line terminator and the closing ``.`` are
	removed before the line is split on tabs.

	A line that cannot be parsed is printed (together with the error) and skipped;
	the remaining lines are still read.

	:param path: Path of the file to read (opened as UTF-8 text).
	:param type_data: ``"OP"`` flags every produced tuple with ``True``, any other value with ``False``.
	:param time_precision: numpy datetime unit code forwarded to ``process_time``.
	:param Unknown: Sentinel string marking a missing time; forwarded to ``process_time``.
	:return: Set of ``(h, r, v, time, is_OP)`` tuples, where ``time`` is a single value for
		4-field lines and a ``(start, end)`` pair otherwise.
	"""
	is_OP = type_data == "OP"
	result_set = set()
	with open(f"{path}", "r", encoding="UTF-8") as f_r:
		for line in f_r:
			try:
				# Strip the newline and the closing "." separately, so that a final line
				# without a newline does not lose the last character of its date.
				record = line.rstrip("\n")
				if record.endswith("."):
					record = record[:-1]
				if record == "":
					continue

				split = record.split("\t")
				if len(split) == 4:
					time = process_time(time_raw=split[3], time_precision=time_precision, Unknown=Unknown)
				else:
					start = process_time(time_raw=split[3], time_precision=time_precision, Unknown=Unknown)
					end = process_time(time_raw=split[4], time_precision=time_precision, Unknown=Unknown)
					# The raw fields tell whether both bounds are known without having to
					# compare a datetime with the sentinel string.
					if split[3] != Unknown and split[4] != Unknown:
						if start > end:
							# Reversed interval: reported, but still kept as read.
							print(line)
						elif start == end:
							# Zero-length interval: widen it by one unit of precision.
							end = start + np.timedelta64(1, time_precision)
					time = (start, end)

				result_set.add((split[0], split[1], split[2], time, is_OP))
			except Exception as e:
				# Report the offending line and carry on with the next one.
				print(e)
				print(line)

	return result_set

def index_entities(data_raw:set, today, time_precision) -> dict[str, tp.Entity]:
	"""
	Build one ``tp.Entity`` per identifier and attach every triple to it.

	:param data_raw: Iterable of ``(h, r, v, time, is_op)`` tuples; ``time`` is either a
		``(start, end)`` tuple or a single value.
	:param today: Passed unchanged to the ``tp.Entity`` constructor.
	:param time_precision: Passed unchanged to the ``tp.Entity`` constructor.
	:return: Mapping from entity identifier to its ``tp.Entity``.
	"""
	entities = dict[str, tp.Entity]()

	def get_or_create(identifier):
		# Entities are created lazily, the first time their identifier shows up.
		if identifier not in entities:
			entities[identifier] = tp.Entity(identifier, today, time_precision)
		return entities[identifier]

	for record in data_raw:
		head, relation, value, time, is_op = record
		head_entity = get_or_create(head)

		if type(time) == tuple:
			# An "Unknown" bound is handed to tp.Interval as None.
			begin = time[0] if time[0] != "Unknown" else None
			finish = time[1] if time[1] != "Unknown" else None
			when = tp.Interval(begin, finish)
		else:
			when = tp.Timestamp(time)

		head_entity.add_triple(tp.Triple(head, relation, value, when, is_op), is_head=True)

		if not is_op:
			continue

		# Triples flagged is_op are also indexed from the value's side, with the
		# relation name prefixed by "-".
		get_or_create(value).add_triple(tp.Triple(value, "-"+relation, head, when, is_op), is_head=False)

	return entities

def index_data(data_raw:set, Unknown:str="Unknown") -> tuple[dict, dict]:
	"""
	Group the times of each fact and count how often each time value occurs.

	:param data_raw: Iterable of ``(h, r, v, time, is_op)`` tuples; ``time`` is either a
		tuple of bounds or a single value.
	:param Unknown: Sentinel marking a missing time. In the counter it is recorded under
		``"<Unknown>_Timestamp"`` for single values and ``"<Unknown>_Interval_<i>"`` for
		the i-th bound of a tuple.
	:return: ``(result_dict, timestamp_seen)`` where ``result_dict`` maps ``(h, r, v)`` to
		the set of all its times and ``timestamp_seen`` maps each time key to its count.
	"""
	result_dict = dict()
	timestamp_seen = dict()
	for (h, r, v, time, is_op) in data_raw:

		if type(time) != tuple:
			time_keys = [time+"_Timestamp" if time == Unknown else time]
		else:
			time_keys = [
				time_sub+f"_Interval_{i}" if time_sub == Unknown else time_sub
				for i, time_sub in enumerate(time)
			]

		for time_tp in time_keys:
			timestamp_seen[time_tp] = timestamp_seen.get(time_tp, 0) + 1

		# The set must be looked up with the same (h, r, v) key it is stored under;
		# otherwise it is re-created on every row and earlier times are lost.
		result_dict.setdefault((h, r, v), set()).add(time)

	return result_dict, timestamp_seen

In [3]:
def uniform_negative_sampling(is_interval:bool, 
							  allowed_timestamps:list, 
							  temporal_precision:str):
	"""
	Draw a random time from the allowed integer ranges.

	:param is_interval: ``True`` to draw a ``tp.Interval``, ``False`` to draw a ``tp.Timestamp``.
	:param allowed_timestamps: List of ``(low, high)`` integer pairs, both ends inclusive.
	:param temporal_precision: numpy datetime unit code used to turn the drawn integers
		into ``np.datetime64`` values.
	:return: A ``tp.Interval`` whose start is strictly before its end, or a ``tp.Timestamp``;
		``None`` when nothing can be drawn (no range at all, or, for intervals, no range
		holding at least two values).
	"""
	if is_interval:
		# An interval needs two distinct values, so single-value ranges are unusable.
		available_sub_allowed_timestamps = [
			i for i, sub_allowed_timestamps in enumerate(allowed_timestamps)
			if sub_allowed_timestamps[1] - sub_allowed_timestamps[0] >= 1
		]
		if len(available_sub_allowed_timestamps) == 0:
			return None

		chosen = allowed_timestamps[rand.choice(available_sub_allowed_timestamps)]
		low, high = chosen[0], chosen[1]

		if rand.randint(0,1) == 0: # choose start first
			neg_start_int = rand.choice(range(low, high))
			neg_end_int = rand.choice(range(neg_start_int+1, high+1))
		else: # choose end first
			neg_end_int = rand.choice(range(low+1, high+1))
			neg_start_int = rand.choice(range(low, neg_end_int))

		return tp.Interval(
					start=np.datetime64(neg_start_int, temporal_precision),
					end=np.datetime64(neg_end_int, temporal_precision)
				)

	# Without any range there is nothing to draw from; answer None like the
	# interval branch does instead of letting rand.choice raise IndexError.
	if len(allowed_timestamps) == 0:
		return None

	chosen = allowed_timestamps[rand.choice(range(len(allowed_timestamps)))]
	if len(chosen) == 0:
		return None

	date_int = rand.choice(range(chosen[0], chosen[1]+1))
	return tp.Timestamp(
				date=np.datetime64(date_int, temporal_precision)
			)
	
def uniform_negative_sampling_unknown_boosted(is_interval, allowed_timestamps:list, start_unknown_allowed:bool, end_unknown_allowed:bool, norm_count_unknown:dict):
	"""
	Draw a random time from lists of allowed timestamps, sometimes with an "Unknown" bound.

	:param is_interval: ``True`` to draw a ``(start, end)`` pair, ``False`` to draw a single timestamp.
	:param allowed_timestamps: List of sub-lists of timestamps; an interval is always drawn
		inside one sub-list.
	:param start_unknown_allowed: Whether ``("Unknown", end)`` may be produced; only tried
		when the first sub-list was chosen.
	:param end_unknown_allowed: Whether ``(start, "Unknown")`` may be produced; only tried
		when the last sub-list was chosen.
	:param norm_count_unknown: Dict with keys ``"start"`` and ``"end"``; each value is the
		probability threshold compared against ``rand.random()``.
	:return: A ``(start, end)`` tuple or a single timestamp, or ``None`` (after printing
		``"Not possible"``) when nothing can be drawn.
	"""
	if is_interval:
		# Only sub-lists with at least two timestamps can hold an interval.
		available_sub_allowed_timestamps = [
			i for i, sub_allowed_timestamps in enumerate(allowed_timestamps)
			if len(sub_allowed_timestamps) >= 2
		]
		if len(available_sub_allowed_timestamps) == 0:
			print("Not possible")
			return None

		id_sub_allowed_timestamps = rand.choice(available_sub_allowed_timestamps)
		chosen_sub_allowed_timestamps = allowed_timestamps[id_sub_allowed_timestamps]

		if (id_sub_allowed_timestamps == 0) and (start_unknown_allowed):
			if rand.random() < norm_count_unknown["start"]:
				neg_end_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[1:])))
				return ("Unknown", chosen_sub_allowed_timestamps[neg_end_idx+1])

		# "Last sub-list" is a property of the position inside allowed_timestamps, not of
		# the chosen sub-list's length. This is a separate `if` so that a sub-list which
		# is both first and last can still yield an unknown end.
		if (id_sub_allowed_timestamps == len(allowed_timestamps)-1) and (end_unknown_allowed):
			if rand.random() < norm_count_unknown["end"]:
				neg_start_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[:-1])))
				return (chosen_sub_allowed_timestamps[neg_start_idx], "Unknown")

		if rand.randint(0,1) == 0: # choose start first
			neg_start_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[:-1])))
			neg_end_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[neg_start_idx+1:])))

			return (chosen_sub_allowed_timestamps[neg_start_idx], chosen_sub_allowed_timestamps[neg_start_idx+1+neg_end_idx])
		else: # choose end first
			neg_end_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[1:])))
			neg_start_idx = rand.choice(range(len(chosen_sub_allowed_timestamps[:neg_end_idx+1])))

			return (chosen_sub_allowed_timestamps[neg_start_idx], chosen_sub_allowed_timestamps[neg_end_idx+1])

	# np.concatenate refuses an empty list, so that case is answered up front.
	if len(allowed_timestamps) == 0:
		print("Not possible")
		return None

	chosen_sub_allowed_timestamps = np.concatenate(allowed_timestamps)

	if len(chosen_sub_allowed_timestamps) == 0:
		print("Not possible")
		return None

	idx = rand.choice(range(len(chosen_sub_allowed_timestamps)))
	return chosen_sub_allowed_timestamps[idx]
	
def find_time_interval_allowed(allowed_ts:list, used_ts:list):
	"""
	Split the allowed timestamps into the runs that remain once the used times are removed.

	:param allowed_ts: Ordered list of candidate timestamps.
	:param used_ts: Times already taken. A tuple is a ``(start, end)`` pair whose bounds are
		both removed together with everything between them; ``"Unknown"`` as start means
		"from the first timestamp", as end "up to the last timestamp". Any other value is
		a single timestamp.
	:return: ``(runs, first_kept, last_kept)`` where ``runs`` is the list of remaining
		consecutive sub-lists, ``first_kept`` tells whether ``allowed_ts[0]`` is in the first
		run and ``last_kept`` whether ``allowed_ts[-1]`` is in the last run. Without any
		remaining run the result is ``([], False, False)``.
	"""
	allowed_ts_local = list(allowed_ts)
	for u in used_ts:
		try:
			if type(u) == tuple:
				s = allowed_ts.index(u[0]) if u[0] != "Unknown" else 0
				e = allowed_ts.index(u[1]) if u[1] != "Unknown" else len(allowed_ts) - 1
				# Blank the positions one by one: the working list must keep the same
				# length as allowed_ts, because every index is looked up in allowed_ts.
				for position in range(s, e + 1):
					allowed_ts_local[position] = None
			else:
				allowed_ts_local[allowed_ts.index(u)] = None
		except Exception:
			# Best effort: a used time that cannot be located is reported and ignored.
			print(used_ts)

	# Collect the runs of timestamps that were not blanked.
	new_allowed = []
	last_interval = []
	for i in allowed_ts_local:
		if i is not None:
			last_interval.append(i)
		elif len(last_interval) != 0:
			new_allowed.append(last_interval)
			last_interval = []

	if len(last_interval) != 0:
		new_allowed.append(last_interval)

	if len(new_allowed) != 0:
		return new_allowed, allowed_ts[0] in new_allowed[0], allowed_ts[-1] in new_allowed[-1]
	else:
		return new_allowed, False, False
	
def remove_used_intervals(list_of_tuples, max_id):
	"""
	Carve the used times out of the index range ``[0, max_id]``.

	The working list ``allowed`` starts as ``[[0, max_id]]`` and is edited in place for
	every element of ``list_of_tuples``: a tuple is read as a ``(start, end)`` pair
	(``time[0]``, ``time[1]``), anything else as a single value.

	Any exception raised while processing is caught by the bare ``except`` at the end,
	which prints both arguments; ``allowed`` is then returned as it stood at that point.

	:param list_of_tuples: Iterable of used times (tuples or single values).
	:param max_id: Upper bound of the initial allowed range.
	:return: List of ``[low, high]`` lists.
	"""
	allowed = [[0,max_id]]
	try:
		for time in list_of_tuples:
			# print(allowed)
			if type(time) == tuple:
				# if time[0] == "Unknown":
				# 	time = (0, time[1])
				# if time[1] == "Unknown":
				# 	time = (time[0], max_id)
				i = 0 
				idx_start = None
				idx_end = None
				# Locate both bounds relative to the allowed ranges. An int index means the
				# bound lies inside allowed[index]; a half-integer (index +/- 0.5) means it
				# lies outside, next to that range.
				while i < len(allowed):

					# NOTE(review): `not idx_start` is also true when idx_start is 0, so a
					# match on the first range does not stop the search and may be
					# overwritten on a later iteration - confirm this is intended.
					if not idx_start:
						if (allowed[i][0] <= time[0]) \
								and (time[0] <= allowed[i][1]):
							idx_start = i
						elif (0 < i) \
								and (allowed[i-1][1] < time[0]) \
								and (time[0] < allowed[i][0]):
							idx_start = i - 0.5
						elif (0 < i) \
								and (time[0] > allowed[i][1]):
							idx_start = i + 0.5
						elif (0==i)\
								and (time[0] < allowed[i][0]):
							idx_start = i-0.5
						elif (len(allowed)-1==i)\
								and (time[0] > allowed[i][1]):
							idx_start = i+0.5

					# NOTE(review): same truthiness caveat as for idx_start above.
					if not idx_end:
						if (allowed[i][0] <= time[1]) \
								and (time[1] <= allowed[i][1]):
							idx_end = i
						elif (i < len(allowed)-1) \
								and (allowed[i][1] < time[1]) \
								and (time[1] < allowed[i+1][0]):
							idx_end = i + 0.5
						elif (i == len(allowed)-1) \
								and (allowed[i][1] < time[1]):
							idx_end = i + 0.5
						elif (i == len(allowed)-1) \
								and (allowed[i][0] > time[1]):
							idx_end = i - 0.5

					i += 1
				if idx_start != idx_end:
					# The two bounds were located at different positions.
					if type(idx_start) == int and type(idx_end) == int:

						# Both bounds fall inside a range: shorten the first one from the
						# right and the second one from the left; a range left with
						# low > high is flagged for removal.
						to_pop_idx_start = allowed[idx_start][0] > time[0]-1
						allowed[idx_start][1] = time[0]-1
						
						to_pop_idx_end = allowed[idx_end][1] < time[1]+1
						allowed[idx_end][0] = time[1]+1

						# NOTE(review): each pop shifts the following elements one position
						# to the left while `i` keeps increasing - confirm the intended
						# ranges are the ones being removed.
						for i in range(idx_start+1, idx_end):
							allowed.pop(i)

						if to_pop_idx_start:
							allowed.pop(idx_start)
						if to_pop_idx_end:
							allowed.pop(idx_start+1)
					
					elif type(idx_start) == int:

						# Only the start bound falls inside a range.
						to_pop_idx_start = allowed[idx_start][0] > time[0]-1
						allowed[idx_start][1] = time[0]-1
						for i in range(idx_start+1, np.floor(idx_end).astype(int)):
							allowed.pop(i)

						if to_pop_idx_start:
							allowed.pop(idx_start)
					
					elif type(idx_end) == int:
						
						# Only the end bound falls inside a range.
						to_pop_idx_end = allowed[idx_end][1] < time[1]+1
						allowed[idx_end][0] = time[1]+1

						for i in range(np.ceil(idx_start).astype(int), idx_end):
							allowed.pop(i)

						if to_pop_idx_end:
							allowed.pop(np.ceil(idx_start).astype(int))
					
					else:
						# Neither bound falls inside a range.
						for i in range(np.ceil(idx_start).astype(int), np.ceil(idx_end).astype(int)):
							allowed.pop(i)
				else:
					if type(idx_start) == int :
						# Both bounds fall inside the same range: replace it with the parts
						# left of time[0] and right of time[1], when they are non-empty.
						# The local name `tp` shadows the module-level `tp` alias inside
						# this function.
						tp = allowed.pop(idx_start)
						if tp[1] >= time[1]+1:
							allowed.insert(idx_start,[time[1]+1, tp[1]])
						if tp[0] <= time[0]-1:
							allowed.insert(idx_start,[tp[0], time[0]-1])
					else:
						# Same non-int position for both bounds: nothing to remove.
						None

			
			else:
				# Single value: find the range that contains it.
				i = 0 
				idx_timestamp = None
				while i < len(allowed):

					# NOTE(review): `not idx_timestamp` is also true for index 0, see above.
					if not idx_timestamp:
						if (allowed[i][0] <= time) \
								and (time <= allowed[i][1]):
							idx_timestamp = i
						elif (0 < i) \
								and (allowed[i-1][1] < time) \
								and (time < allowed[i][0]):
							idx_timestamp = i - 0.5

					i += 1
					
				if type(idx_timestamp) == int:
					# Split the containing range around the value, keeping non-empty parts.
					tp = allowed.pop(idx_timestamp)
					if time+1 <= tp[1]:
						allowed.insert(idx_timestamp,[time+1, tp[1]])
					if tp[0] <= time-1:
						allowed.insert(idx_timestamp,[tp[0], time-1])
			# print(allowed)
	except:
		# Best effort: report the inputs and fall through to return `allowed` as it is.
		print(list_of_tuples, max_id)
	return allowed

def unused_value_ranges(ranges, lifespan:tp.Interval):
	"""
	List the integer ranges inside a lifespan that no given interval covers.

	:param ranges: Iterable of objects exposing ``get_start()`` and ``get_end()``; both
		results are converted with ``.astype(int)`` and the end is treated as inclusive.
	:param lifespan: Object whose ``start`` and ``end`` attributes (converted with
		``.astype(int)``) bound the values that are scanned, both inclusive.
	:return: List of ``(first, last)`` tuples, in increasing order, each describing a
		maximal run of uncovered values.
	"""
	upper_bound = lifespan.end.astype(int)
	lower_bound = lifespan.start.astype(int)

	# Every integer covered by at least one interval.
	taken = set()
	for time_interval in ranges:
		first, last = time_interval.get_start(), time_interval.get_end()
		taken.update(range(first.astype(int), last.astype(int) + 1))

	# Walk the lifespan once, remembering where the current free run began.
	gaps = []
	run_began_at = None
	for value in range(lower_bound, upper_bound + 1):
		is_free = value not in taken
		if is_free and run_began_at is None:
			run_began_at = value
		elif not is_free and run_began_at is not None:
			gaps.append((run_began_at, value - 1))
			run_began_at = None

	# A run still open at the end of the lifespan closes on its upper bound.
	if run_began_at is not None:
		gaps.append((run_began_at, upper_bound))

	return gaps

def unused_value_ranges_within_interval(ranges, min_value:int, max_value:int):
	"""
	List the integer ranges between ``min_value`` and ``max_value`` that no given time covers.

	:param ranges: Iterable of temporal objects. Those whose ``type_of_temporal_representation``
		is ``"Interval"`` cover every integer from ``get_start().astype(int)`` to
		``get_end().astype(int)``, both included; any other object covers its ``date``.
	:param min_value: Lowest value considered (inclusive).
	:param max_value: Highest value considered (inclusive).
	:return: List of ``(first, last)`` tuples, in increasing order, each describing a
		maximal run of uncovered values; ``[]`` unless ``min_value < max_value``.
	"""
	covered = set()
	for time_interval in ranges:
		if time_interval.type_of_temporal_representation == "Interval":
			start, end = time_interval.get_start(), time_interval.get_end()
			covered.update(range(start.astype(int), end.astype(int) + 1))  # Include 'end'
		else:
			# The scan below tests plain integers, so a datetime-like date has to be
			# stored as its integer value; stored as-is it would never match.
			date = time_interval.date
			covered.add(int(date.astype(int)) if hasattr(date, "astype") else date)

	if not min_value < max_value:
		return []

	unused = []
	start = None
	for value in range(min_value, max_value + 1):
		if value not in covered:
			if start is None:
				start = value  # Start a new range
		elif start is not None:
			unused.append((start, value - 1))  # End the current range
			start = None

	# If there's an open range at the end
	if start is not None:
		unused.append((start, max_value))

	return unused

def index_graph_per_fact(file:str):
	"""
	Index a tab-separated triple file by fact.

	Each line is expected to look like ``h<TAB>r<TAB>v<TAB>time.`` or
	``h<TAB>r<TAB>v<TAB>start<TAB>end.``, where ``start`` or ``end`` may be ``Unknown``.
	Blank lines are ignored.

	:param file: Path of the file to read (opened as UTF-8 text).
	:return: ``(indexed_graph_per_fact, timestamps_seen, unknown_ratio)`` where
		``indexed_graph_per_fact`` maps ``(h, r, v)`` to the set of its times (a
		``np.datetime64`` or a ``(start, end)`` tuple in which an unknown bound stays the
		string ``"Unknown"``), ``timestamps_seen`` is the set of every parsed
		``np.datetime64``, and ``unknown_ratio`` gives, for ``"start"`` and ``"end"``, the
		number of lines with that bound unknown divided by the number of lines read
		(``0.0`` when no line was read).
	:raises ValueError: From ``np.datetime64`` when a time field cannot be parsed.
	"""
	indexed_graph_per_fact = dict()
	timestamps_seen = set()
	count_unknown = {"start":0, "end":0}
	count_line = 0
	with open(file, "r", encoding="UTF-8") as f_r:
		for line in f_r:
			# Strip the newline and the closing "." separately, so that a final line
			# without a newline does not lose the last character of its date.
			record = line.rstrip("\n")
			if record.endswith("."):
				record = record[:-1]
			if record == "":
				continue
			# Only lines that carry data take part in the ratio.
			count_line += 1

			elts = record.split("\t")
			h, r, v = elts[:3]
			times = indexed_graph_per_fact.setdefault((h,r,v), set())

			if len(elts[3:]) == 1:
				stamp = np.datetime64(elts[3])
				times.add(stamp)
				timestamps_seen.add(stamp)
			elif elts[3] == "Unknown":
				count_unknown["start"] += 1
				end = np.datetime64(elts[4])
				times.add(("Unknown", end))
				timestamps_seen.add(end)
			elif elts[4] == "Unknown":
				count_unknown["end"] += 1
				start = np.datetime64(elts[3])
				times.add((start, "Unknown"))
				timestamps_seen.add(start)
			else:
				start, end = np.datetime64(elts[3]), np.datetime64(elts[4])
				times.add((start, end))
				timestamps_seen.add(start)
				timestamps_seen.add(end)

	unknown_ratio = {k: (v/count_line if count_line else 0.0) for k, v in count_unknown.items()}
	return indexed_graph_per_fact, timestamps_seen, unknown_ratio

def generate_data(indexed_graph_per_fact, timestamps_seen):
	"""
	Pair every (fact, used time) with a randomly corrupted time.

	For each fact the used times are first translated into index pairs into
	``timestamps_seen``; for each used time the remaining index ranges are turned into
	slices of ``timestamps_seen`` and a corrupted time is sampled from them. On success
	``(True, (fact, time_used))`` and ``(False, (fact, corrupted_time))`` are appended;
	otherwise ``"Passed"`` is printed.

	NOTE(review): as written, the helper calls below do not match the helper definitions
	visible in this notebook (see the notes next to each call) - confirm whether this
	function is still in use before relying on it.

	:param indexed_graph_per_fact: Mapping from fact to the collection of its used times
		(tuples for intervals, single values otherwise).
	:param timestamps_seen: Sequence of timestamps; it is used with ``.index``, ``len`` and
		slicing, so presumably an ordered list - verify against the caller.
	:return: List of ``(is_true, (fact, time))`` tuples.
	"""
	data = []
	for fact, times_used in tqdm(indexed_graph_per_fact.items()):
		# Index pair per used time: an "Unknown" start maps to 0, an "Unknown" end to the
		# last index, and a single timestamp to (index, index).
		indexes_times_used = [(timestamps_seen.index(i[0]) if i[0]!= "Unknown" else 0, timestamps_seen.index(i[1]) if i[1]!= "Unknown" else len(timestamps_seen)-1) if type(i) == tuple else (timestamps_seen.index(i), timestamps_seen.index(i)) for i in times_used]
		for time_used in times_used: 
			# interval_allowed_compressed = remove_used_intervals(indexes_times_used, len(timestamps_seen)-1)
			# interval_allowed = []
			# for (start, end) in interval_allowed_compressed:
			# 	if len(timestamps_seen[start:end+1]):
			# 		interval_allowed += [timestamps_seen[start:end+1]]
			# NOTE(review): unused_value_ranges calls get_start()/get_end() on the elements
			# of its first argument and reads .end/.start on its second one, while plain
			# index tuples and an int are passed here.
			interval_allowed_compressed = unused_value_ranges(indexes_times_used, len(timestamps_seen)-1)
			interval_allowed = []
			for (start, end) in interval_allowed_compressed:
				if len(timestamps_seen[start:end+1]):
					interval_allowed += [timestamps_seen[start:end+1]]			
					
			if len(interval_allowed) :#and len(interval_allowed[-1]):
				# NOTE(review): four positional arguments are passed, whereas
				# uniform_negative_sampling is defined with three parameters.
				corrupted_time = uniform_negative_sampling(type(time_used) == tuple,
									interval_allowed, 
									0 in interval_allowed_compressed[0], 
									len(timestamps_seen) in interval_allowed_compressed[-1])
				
				if corrupted_time != None:
					# Positive sample first, its corrupted counterpart right after.
					data.append((True, (fact, time_used)))
					data.append((False, (fact, corrupted_time)))
				else:
					print("Passed")
					
			else:
				print("Passed")
	
	return data

def split_data(data, train_size=80, valid_size=10, test_size=10):
	"""
	Randomly distribute consecutive pairs of samples over train, valid and test.

	``data`` is read two items at a time (items ``2k`` and ``2k+1`` always end up in the
	same split); a trailing unpaired item is ignored.

	:param data: Indexable sequence of samples.
	:param train_size: Share of pairs, out of 100, sent to the training split.
	:param valid_size: Share of pairs, out of 100, sent to the validation split.
	:param test_size: Share of pairs, out of 100, sent to the test split.
	:return: ``(train, valid, test)`` lists.
	"""
	assert(train_size+valid_size+test_size == 100)
	train, valid, test = [], [], []
	for pair_index in range(int(len(data)/2)):
		draw = rand.randint(a=1,b=train_size+valid_size+test_size)
		# The draw is compared against the cumulative shares to pick the destination.
		if draw <= train_size:
			destination = train
		elif draw <= train_size+valid_size:
			destination = valid
		else:
			destination = test
		first = pair_index*2
		destination.append(data[first])
		destination.append(data[first+1])
	return train, valid, test

def write_data(data, name_file):
	"""
	Write samples to ``<name_file>.nt`` and their labels to ``<name_file>.gt``.

	:param data: Iterable of ``(label, ((h, r, v), time))`` samples; ``time`` is either a
		``(start, end)`` tuple or a single value.
	:param name_file: Output path without extension.
	"""
	with open(f"{name_file}.nt", "w", encoding="UTF-8") as f_out, \
			open(f"{name_file}.gt", "w", encoding="UTF-8") as f_out_gt:
		for sample in data:
			label, payload = sample[0], sample[1]
			subject, predicate, target = payload[0]
			moment = payload[1]

			# One line per sample: the triple, then one or two time fields closed by ".".
			f_out.write(f"{subject}\t{predicate}\t{target}\t")
			if type(moment) != tuple:
				f_out.write(f"{moment}.\n")
			else:
				f_out.write(f"{moment[0]}\t{moment[1]}.\n")
			f_out_gt.write(f"{label}\n")

def generate_data_V2(indexed_graph_per_entities:dict[str:tp.Entity], temporal_precision:str, 
					 start_allowed_value, end_allowed_value, default_start, default_end, margin:float=0.15):
	"""
	Pair every head triple of every entity with a time-corrupted copy.

	Entities are processed in sorted key order. For each entity the sampling window is
	its lifespan (as integers), widened on both sides by
	``int(lifespan.day_in_the_interval(temporal_precision, default_start, default_end) * margin)``
	and clipped to ``[start_allowed_value, end_allowed_value]``. For each relation the
	times already used are removed from that window and a corrupted time is drawn for
	every triple of the relation.

	:param indexed_graph_per_entities: Mapping from entity identifier to entity object.
	:param temporal_precision: Forwarded to the ``tp`` helpers and to ``uniform_negative_sampling``.
	:param start_allowed_value: Lowest integer the window may reach.
	:param end_allowed_value: Highest integer the window may reach.
	:param default_start: Forwarded to ``lifespan.day_in_the_interval``.
	:param default_end: Forwarded to ``lifespan.day_in_the_interval``.
	:param margin: Factor applied to the value returned by ``day_in_the_interval``.
	:return: ``(data_OP, data_DP, skipped)``: two lists of ``(bool, tp.Triple)`` in which a
		``True`` original is immediately followed by its ``False`` corrupted copy, split on
		``triple.is_object``, and the number of relations or triples for which nothing
		could be sampled.
	"""
	samples_DP = []
	samples_OP = []
	skipped = 0
	for entity_id in tqdm(sorted(indexed_graph_per_entities)):
		entity = indexed_graph_per_entities[entity_id]

		lifespan = entity.get_lifespan()
		window_low = lifespan.start.astype(int)
		window_high = lifespan.end.astype(int)

		# Widen the window by the margin, without leaving the globally allowed bounds.
		padding = int(lifespan.day_in_the_interval(temporal_precision,
												default_start,
												default_end)*margin)
		window_low = max(window_low-padding, start_allowed_value)
		window_high = min(window_high+padding, end_allowed_value)

		triples_by_relation = entity.triples_per_r_as_head
		for relation in triples_by_relation:
			used_times = tp.ordered_time_sequence_first_start(
									entity=entity,
									r=relation,
									temporal_granularity=temporal_precision,
									as_head=True
								)

			free_ranges = unused_value_ranges_within_interval(
									ranges=used_times,
									min_value=window_low,
									max_value=window_high
								)

			# No free range at all: the whole relation counts as one skip.
			if not len(free_ranges):
				skipped += 1
				continue

			for triple in entity.triples_per_r_as_head[relation]:
				wants_interval = triple.date.type_of_temporal_representation == "Interval"
				corrupted_time = uniform_negative_sampling(
								is_interval=wants_interval,
								allowed_timestamps=free_ranges,
								temporal_precision=temporal_precision
							)

				if corrupted_time != None:
					destination = samples_OP if triple.is_object else samples_DP
					corrupted_triple = tp.Triple(
											head=triple.head,
											relation=triple.relation,
											value=triple.value,
											date=corrupted_time,
											is_object=triple.is_object
										)
					destination.append((True, triple))
					destination.append((False, corrupted_triple))
				else:
					skipped += 1

	return samples_OP, samples_DP, skipped


In [4]:
root = "./../Data/"

# (dataset name, reduction limit) pairs; the trailing slice keeps only the last entry.
names = [("Large", 4), ("Medium", 8), ("Small", 10), ("Extra_Small", 15)][-1:]

for temporal_precision in ["Y", "D"]:
	temporal_limit_end = np.datetime64("2023-05-25", temporal_precision)
	today = np.datetime64("2023-05-25", temporal_precision)

	for type_data in ["TR", "FD"]:
		sub_folder = f"{type_data}_{temporal_precision}/"

		# The earliest allowed date depends on the dataset family.
		temporal_limit_start = np.datetime64(
			"1900-01-01" if type_data == "TR" else "-1000-01-01",
			temporal_precision
		)
		default_start, default_end = temporal_limit_start, temporal_limit_end

		for name, limit in names:

			# Read both reduced files and merge them into a single set of raw tuples.
			triples_OP = read_data(path=f"{root}{sub_folder}Temporary/OP_reduced_{limit}.nt",
									type_data="OP",
									time_precision=temporal_precision)
			triples_DP = read_data(path=f"{root}{sub_folder}Temporary/DP_reduced_{limit}.nt",
									type_data="DP",
									time_precision=temporal_precision)
			data = set(triples_OP).union(triples_DP)

			ie = index_entities(data, today=today, time_precision=temporal_precision)
			data_OP, data_DP, passed_stats = generate_data_V2(
				dict(ie),
				temporal_precision=temporal_precision,
				start_allowed_value=temporal_limit_start.astype(int),
				end_allowed_value=temporal_limit_end.astype(int),
				default_start=default_start,
				default_end=default_end
			)
			print(passed_stats)

			# Sanity check on the OP pairs: print every pair whose corrupted interval
			# starts or ends inside the original interval.
			for i in range(0, len(data_OP), 2):
				head = data_OP[i][1].head
				pos_date = data_OP[i][1].date
				neg_date = data_OP[i+1][1].date

				if type(pos_date) != tp.Interval:
					continue

				# A missing bound of the original falls back to the entity's lifespan.
				if pos_date.get_start() != None:
					pos_start = pos_date.get_start()
				else:
					pos_start = ie[head].get_lifespan().get_start()
				if pos_date.get_end()!= None:
					pos_end = pos_date.get_end()
				else:
					pos_end = ie[head].get_lifespan().get_end()

				starts_inside = pos_start <= neg_date.get_start() and neg_date.get_start() <= pos_end
				if starts_inside or (pos_start <= neg_date.get_end() and neg_date.get_end() <= pos_end):
					print(data_OP[i], data_OP[i+1])

			# Split each property type and write one sample file and one label file per split.
			for data_type_name, data_type in [("OP",data_OP), ("DP",data_DP)]:
				train, valid, test = split_data(data_type)
				for data_split_name, data_split in [('train', train), ('valid', valid), ('test', test)]:

					with open(f"{root}{sub_folder}{name}/{data_type_name}_{data_split_name}.txt", "w", encoding="UTF-8") as data_w, \
							open(f"{root}{sub_folder}{name}/{data_type_name}_{data_split_name}.gt", "w", encoding="UTF-8") as gt_w:
						for gt, fact in data_split:
							gt_w.write(f"{gt}\n")
							data_w.write(f"{fact.output_to_file()}\n")
			

100%|██████████| 443/443 [00:00<00:00, 998.61it/s] 


108


100%|██████████| 464/464 [00:00<00:00, 996.91it/s] 


94


100%|██████████| 443/443 [00:05<00:00, 80.91it/s] 


11


100%|██████████| 464/464 [00:09<00:00, 50.55it/s] 


0
