In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm


In [11]:
def process_time(str_time:str):
	"""Turn the textual form of a numpy datetime back into a ``np.datetime64``.

	Parameters
	----------
	str_time : str
		Text to parse, e.g. ``"numpy.datetime64('2015-03-22')"``.

	Returns
	-------
	np.datetime64 or float
		The parsed value when ``str_time`` begins with ``numpy.datetime64``;
		``np.nan`` for any other text.
	"""
	marker = "numpy.datetime64"
	# Anything that does not begin with the marker is treated as "no date".
	if str_time[:len(marker)] != marker:
		return np.nan
	# Drop the two characters following the marker and the last two characters
	# of the string, keeping only what lies in between.
	date_text = str_time[len(marker) + 2:-2]
	return np.datetime64(date_text)

def from_str_fact_to_tuple_fact(str_fact:str):
	"""Split a stringified fact into ``(head, relation, value, start, end)``.

	The first character and the last eight characters of ``str_fact`` are
	discarded, the remainder is split on ``", "`` and the first five pieces
	are used. The first three pieces are converted with ``int``; the fourth
	(minus its first character) and the fifth go through ``process_time``.

	Parameters
	----------
	str_fact : str
		Textual fact as stored in the ``Fact_str`` column.

	Returns
	-------
	tuple
		``(int, int, int, start, end)`` where ``start`` and ``end`` are whatever
		``process_time`` returns for the respective piece.
	"""
	pieces = str_fact[1:-8].split(", ")[0:5]
	raw_head, raw_relation, raw_value, raw_start, raw_end = pieces
	head = int(raw_head)
	relation = int(raw_relation)
	value = int(raw_value)
	# The start piece carries one extra leading character that is removed
	# before parsing.
	start = process_time(raw_start[1:])
	end = process_time(raw_end)
	return head, relation, value, start, end

def compute_stats(data):
	"""Compute overall, positive-only and negative-only accuracy (in percent).

	A row counts as correct when its ``"GT"`` value equals its ``"Dec"`` value.

	Parameters
	----------
	data : pandas.DataFrame
		Frame providing a ``"GT"`` column and a ``"Dec"`` column.

	Returns
	-------
	dict
		``{"computed": False}`` when ``data`` has no rows. Otherwise a dict with
		``"acc_overall"`` (accuracy over all rows), ``"acc_pos"`` (accuracy over
		rows where ``GT == True``), ``"acc_neg"`` (accuracy over rows where
		``GT == False``), ``"size"`` (number of rows) and ``"computed": True``.
		``"acc_pos"`` / ``"acc_neg"`` are ``np.nan`` when the respective subset
		is empty.
	"""
	n_rows = len(data)
	if not n_rows:
		return {
			"computed":False
		}

	# One vectorised comparison replaces the repeated row-wise apply calls.
	correct = (data["GT"] == data["Dec"]).to_numpy(dtype=bool)
	# Element-wise comparison against the literal is intentional here
	# (`is True` would test the Series object itself).
	is_pos = (data["GT"] == True).to_numpy(dtype=bool)
	is_neg = (data["GT"] == False).to_numpy(dtype=bool)

	def accuracy_pct(mask):
		"""Percentage of correct rows among those selected by ``mask``."""
		selected = correct[mask]
		if not len(selected):
			# np.nan (lowercase) is the only spelling that exists on NumPy >= 2.0.
			return np.nan
		return selected.sum()/len(selected)*100

	return {
		"acc_overall":correct.sum()/n_rows*100,
		"acc_pos":accuracy_pct(is_pos),
		"acc_neg":accuracy_pct(is_neg),
		"size":n_rows,
		"computed":True
	}

In [12]:
# Directory holding the two result files read below.
root = "./Outputs_work/"

# Translation of the numeric "Code" column into a decision value.
codes = {0: True, 1: False, 3: "No rules"}

# Column names shared by both tab-separated result files.
output_columns = ["Fact_str", "Code", "Comment", "Conf", "Cov"]

# Rows read from True_output.txt get GT=True and a "_p" index suffix.
pos = pd.read_csv(f"{root}True_output.txt", sep="\t", names=output_columns)
pos["GT"] = [True] * len(pos.index)
pos.index = [f"{i}_p" for i in pos.index]

# Rows read from False_output.txt get GT=False and a "_n" index suffix.
neg = pd.read_csv(f"{root}False_output.txt", sep="\t", names=output_columns)
neg["GT"] = [False] * len(neg.index)
neg.index = [f"{i}_n" for i in neg.index]

merged = pd.concat([pos, neg])

# Codes missing from `codes` map to None.
merged["Dec"] = merged["Code"].map(codes.get)

# Parse every fact string into its five components and attach them as columns,
# aligned on the index.
parsed_facts = pd.DataFrame(
	merged["Fact_str"].map(from_str_fact_to_tuple_fact).to_list(),
	columns=["Head", "Relation", "Value", "Start", "End"],
	index=merged.index,
)
merged = merged.merge(parsed_facts, left_index=True, right_index=True)
merged

Unnamed: 0,Fact_str,Code,Comment,Conf,Cov,GT,Dec,Head,Relation,Value,Start,End
0_p,"(10002, 26, 296, (numpy.datetime64('2015-03-22...",0,True,1.0,10,True,True,10002,26,296,2015-03-22,NaT
1_p,"(10008, 7, 10123, (numpy.datetime64('2015-03-2...",0,True,1.0,21,True,True,10008,7,10123,2015-03-22,NaT
3_p,"(10016, 29, 229, (numpy.datetime64('1907-01-01...",1,False,0.5,2,True,False,10016,29,229,1907-01-01,1907-01-02
10_p,"(10023, 7, 19376, (numpy.datetime64('2015-03-2...",0,True,1.0,17,True,True,10023,7,19376,2015-03-22,NaT
11_p,"(10026, 10, 500, (numpy.datetime64('1920-01-01...",3,No rules could be used,_,_,True,No rules,10026,10,500,1920-01-01,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...
11364_n,"(9983, 3, 32, (numpy.datetime64('1985-06-03'),...",1,False,0.0,4,False,False,9983,3,32,1985-06-03,1985-08-21
11365_n,"(9986, 8, 7833, (numpy.datetime64('1948-12-22'...",3,No rules could be used,_,_,False,No rules,9986,8,7833,1948-12-22,1951-12-07
11366_n,"(9986, 70, 14457, (numpy.datetime64('1921-12-1...",3,No rules could be used,_,_,False,No rules,9986,70,14457,1921-12-12,1923-01-07
11367_n,"(9990, 3, 563, (numpy.datetime64('1911-05-06')...",3,No rules could be used,_,_,False,No rules,9990,3,563,1911-05-06,1912-01-12


In [13]:
# Every identifier that appears in the "Head" or the "Value" column.
ents = set(merged["Head"].values) | set(merged["Value"].values)

In [14]:
root_data = "./../Retrieve_Data/Data/Uniform_sampling_abs_after_1800/Finished/"

# Maps a class id to the set of entity ids (restricted to `ents`) listed with it.
entities_per_class = dict()
with open(f"{root_data}ClassPerEntityExtended.txt", "r", encoding="UTF-8") as class_file:
	for line in class_file:
		# Each line is split on a tab into an entity id and a space-separated
		# list of class ids.
		entity, classes = line.split("\t")
		entity_int = int(entity)
		if entity_int not in ents:
			continue
		for class_token in classes.split(" "):
			entities_per_class.setdefault(int(class_token), set()).add(entity_int)

In [15]:
# Keep only the rows whose decision is an actual True/False verdict.
# The two boolean masks are combined with the logical-or operator `|`
# rather than arithmetic `+`; element-wise `==` is intentional.
merged_decided = merged[(merged["Dec"] == True) | (merged["Dec"] == False)]

In [16]:
# Accuracy statistics over all rows whose decision is True or False.
compute_stats(merged_decided)

{'acc_overall': 61.73816293732229,
 'acc_pos': 45.47924901185771,
 'acc_neg': 78.02524127691166,
 'size': 8089,
 'computed': True}

In [17]:
# Per-class statistics: for each class, keep the decided rows whose "Head" or
# "Value" belongs to that class and score them with compute_stats.
# Series.isin performs the membership test in a vectorised way, avoiding a
# Python-level set construction for every row of every class.
stats_per_class = {
	c:compute_stats(
		merged_decided[
			merged_decided["Head"].isin(entities_per_class[c])
			| merged_decided["Value"].isin(entities_per_class[c])
		]
	)
	for c in tqdm(entities_per_class)
}

100%|██████████| 3381/3381 [01:01<00:00, 55.10it/s]


In [18]:
# Unweighted mean of "acc_overall" across the classes for which statistics
# were computed.
np.average([stats["acc_overall"] for stats in stats_per_class.values() if stats["computed"]])

59.695149098642254