# EventKG Filtering

Exploring EventKG using:
- Dask for reading and filtering 
- csv format for saving

- EventKG 3.0
- pandas==1.4.2
- dask==2022.3.0

Aim: select the content to explore later in GraphDB while getting rid of the parser errors.

In [2]:
import os
import csv
import pandas as pd
import dask.dataframe as dd

In [3]:
# TODO: change this path if the EventKG dataset is stored elsewhere.
# The cells below read `relations_base.nq` and `events.nq` from this folder.
EVENTKG_FOLDER = './eventkg/'

In [4]:
def sep_col(x_content):
	"""Return the chunk that follows the first ` <` separator of `x_content`.

	The input is stripped of surrounding whitespace before being split on
	` <`. The returned chunk stops at the next ` <` separator, if there is
	one. An empty string is returned when the input has no ` <` separator.
	"""
	pieces = x_content.strip().split(" <")
	if len(pieces) <= 1:
		return ""
	return pieces[1]

def basic_preprocess(x_content):
	"""Remove every `<` character, then trim surrounding whitespace."""
	without_brackets = "".join(x_content.split("<"))
	return without_brackets.strip()

def process_object(x_content):
	"""Clean the part of `x_content` that precedes its first ` <` separator.

	The input is stripped, everything from the first ` <` onwards is
	discarded, and the remaining head goes through `basic_preprocess`.
	"""
	head, _, _ = x_content.strip().partition(" <")
	return basic_preprocess(head)

def read_nq(folder=None, path=None, preprocess=True,
			columns=["subject", "predicate", "object", "meta"]):
	"""Lazily read `.nq` text into a Dask dataframe, splitting lines on `>`.

	Parameters
	----------
	folder : str, optional
		Folder whose `*.nq` files are all read. Takes precedence over `path`.
	path : str, optional
		Path handed to `dd.read_csv` when `folder` is not given.
	preprocess : bool
		When true, clean the subject, predicate, object and meta columns with
		the helper functions of this notebook. When false, the raw split
		values are returned untouched.
	columns : list of str
		Columns of the result, chosen among "subject", "predicate", "object",
		"meta" and ".".

	Returns
	-------
	The Dask dataframe restricted to `columns`.

	Raises
	------
	ValueError
		If neither `folder` nor `path` is provided.
	"""
	if folder:
		to_read = f'{folder}/*.nq'
	elif path:
		to_read = path
	else:
		raise ValueError("Either `folder` or `path` must be specified")

	raw_names = ["subject", "predicate", "object", "meta", "."]
	df = dd.read_csv(to_read, sep='>', names=raw_names, on_bad_lines='skip')

	if not preprocess:
		return df[columns]

	def graph_of(row):
		# When the fourth field is only the final ".", the value is taken from
		# the object field instead (after its ` <` separator).
		if row.meta.strip() != '.':
			return basic_preprocess(row.meta)
		return sep_col(row.object)

	for name in ("subject", "predicate"):
		df[name] = df[name].apply(basic_preprocess, meta=(name, 'str'))
	# `meta` must be derived before `object` is cleaned: it reads the raw object.
	df["meta"] = df[["object", "meta"]].apply(graph_of, meta=('meta', 'str'), axis=1)
	df["object"] = df["object"].apply(process_object, meta=('object', 'str'))

	return df[columns]

## 1. relations_base.nq

From this file, only the triples with the following predicates are kept: `sem:hasSubEvent`, `sem:hasBeginTimeStamp` and `sem:hasEndTimeStamp`.

(prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>)

In [7]:
# Predicates kept from relations_base.nq.
# Each entry starts with a space and has no closing `>` because the file is
# read with `>` as the column separator and the predicate column is compared
# to these values without preprocessing.
PREDICATES_TO_KEEP = [
    " <http://semanticweb.cs.vu.nl/2009/11/sem/hasSubEvent",
    " <http://semanticweb.cs.vu.nl/2009/11/sem/hasBeginTimeStamp",
    " <http://semanticweb.cs.vu.nl/2009/11/sem/hasEndTimeStamp"
]

In [9]:
# Load the raw columns lazily; no cleaning, since the rows are written back as-is.
df_relations_base_not_preprocessed = read_nq(
    path=os.path.join(EVENTKG_FOLDER, 'relations_base.nq'),
    preprocess=False,
)

# Keep only the rows whose predicate is listed in PREDICATES_TO_KEEP, then
# run the Dask computation to materialise the result.
has_kept_predicate = df_relations_base_not_preprocessed.predicate.isin(PREDICATES_TO_KEEP)
df_relations_base_sem = df_relations_base_not_preprocessed[has_kept_predicate].compute()

In [10]:
# Print the first row found for each of these two predicates, re-joined with `>`.
for predicate in (
    ' <http://semanticweb.cs.vu.nl/2009/11/sem/hasBeginTimeStamp',
    ' <http://semanticweb.cs.vu.nl/2009/11/sem/hasSubEvent',
):
    sample = df_relations_base_sem[df_relations_base_sem.predicate == predicate].head(1)
    for _, row in sample.iterrows():
        print('>'.join([row.subject, row.predicate, row.object, row.meta]))

<http://eventKG.l3s.uni-hannover.de/resource/entity_12435257> <http://semanticweb.cs.vu.nl/2009/11/sem/hasBeginTimeStamp> "1925-12-14"^^<http://www.w3.org/2001/XMLSchema#date> <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_en
<http://eventKG.l3s.uni-hannover.de/resource/event_4157> <http://semanticweb.cs.vu.nl/2009/11/sem/hasSubEvent> <http://eventKG.l3s.uni-hannover.de/resource/event_391150> .


In [24]:
# Write the kept rows back with `>` as field separator and without any quoting,
# so the fields are emitted exactly as they were read.
df_relations_base_sem.to_csv(
    'relations_base_filtered.nq',
    sep=">",
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
    escapechar="",
    line_terminator='\n',
)

In [25]:
def process(line):
    """Return `line` with its end-of-statement terminator restored if missing.

    A line whose second-to-last character is already `.` is returned
    unchanged. Otherwise the newline is replaced by `> .` plus a newline,
    which closes the last field and appends the final dot. Lines too short to
    hold a terminator (e.g. blank lines) are returned unchanged instead of
    raising IndexError.
    """
    if len(line) < 2 or line[-2] == '.':
        return line
    return line.replace("\n", "> .\n")


# Read everything first: the same file is rewritten in place just below.
# The encoding is explicit so it matches the UTF-8 default used by `to_csv`.
with open('relations_base_filtered.nq', encoding='utf-8') as f:
    lines = [process(line) for line in f]

with open('relations_base_filtered.nq', "w+", encoding='utf-8') as f:
    f.write("".join(lines))

## 2. events.nq

From this file, retrieve the mappings from EventKG to generic KGs (DBpedia, Wikidata, YAGO):
1. From `relations_base.nq`, retrieve the URIs of events that appear as ?subject or ?object in triples of the form `(?subject, sem:hasSubEvent, ?object)`.
2. From `events.nq`, retrieve the triples `(?s, owl:sameAs, ?o)` such that ?s is an event from step 1.

`PREFIX owl: <http://www.w3.org/2002/07/owl#>`



In [42]:
# Reload the filtered file and keep only the sem:hasSubEvent rows.
df_relations_base_only_events = read_nq(path='relations_base_filtered.nq', preprocess=False)
is_sub_event = (
    df_relations_base_only_events.predicate
    == ' <http://semanticweb.cs.vu.nl/2009/11/sem/hasSubEvent'
)
df_relations_base_only_events = df_relations_base_only_events[is_sub_event].compute()

# Distinct event URIs on each side of the relation.
events_subject = df_relations_base_only_events.subject.unique()
# Drop the first character of each object value (the space left after the `>`
# separator) so the values have the same shape as the subject column.
events_object = [uri[1:] for uri in df_relations_base_only_events.object.unique()]

  df = reader(bio, **kwargs)


In [46]:
# Load events.nq lazily, keeping the trailing "." column so that the rows can
# be written back in full later on.
events_path = os.path.join(EVENTKG_FOLDER, 'events.nq')
df_events = read_nq(
    path=events_path,
    preprocess=False,
    columns=["subject", "predicate", "object", "meta", "."],
)
df_events.head(2)

Unnamed: 0,subject,predicate,object,meta,.
0,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#type,<http://semanticweb.cs.vu.nl/2009/11/sem/Event,<http://eventKG.l3s.uni-hannover.de/graph/eve...,.
1,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#type,<http://eventKG.l3s.uni-hannover.de/schema/Ev...,<http://eventKG.l3s.uni-hannover.de/graph/eve...,.


In [47]:
# Rows whose subject is one of the events collected from the sub-event relations...
subject_is_known_event = (
    df_events.subject.isin(events_subject) | df_events.subject.isin(events_object)
)
# ...and whose predicate is owl:sameAs.
is_same_as = df_events.predicate == " <http://www.w3.org/2002/07/owl#sameAs"

df_events_filtered = df_events[subject_is_known_event & is_same_as]
df_events_filtered.head(2)

Unnamed: 0,subject,predicate,object,meta,.
2,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/2002/07/owl#sameAs,<http://www.wikidata.org/entity/Q1048601,<http://eventKG.l3s.uni-hannover.de/graph/wik...,.
3,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/2002/07/owl#sameAs,<http://yago-knowledge.org/resource/1981_Wimb...,<http://eventKG.l3s.uni-hannover.de/graph/yago,.


In [48]:
# Run the lazy Dask filter and keep the result under the same name.
# NOTE(review): presumably the result is an in-memory pandas DataFrame; if so,
# re-running this cell on its own would fail (no `.compute()`) — confirm.
df_events_filtered = df_events_filtered.compute()

In [49]:
# Source columns only. Excluding the helper columns created below keeps this
# cell re-runnable: otherwise a second run would try to join `concat` and the
# integer `to_keep` values as well and raise a TypeError.
columns = [col for col in df_events_filtered.columns if col not in ('concat', 'to_keep')]

# One string per row, made of all source fields separated by a space.
df_events_filtered['concat'] = df_events_filtered.apply(
    lambda row: " ".join([row[x] for x in columns]), axis=1
)

# Flag with 0 the rows containing a double quote, a backslash or a curly
# closing quote; these rows are left out when `events_filtered.nq` is written.
df_events_filtered['to_keep'] = df_events_filtered.concat.apply(
    lambda x: 0 if any(elt in x for elt in ['"', '\\', '”']) else 1
)
df_events_filtered.head(2)

Unnamed: 0,subject,predicate,object,meta,.,concat,to_keep
2,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/2002/07/owl#sameAs,<http://www.wikidata.org/entity/Q1048601,<http://eventKG.l3s.uni-hannover.de/graph/wik...,.,<http://eventKG.l3s.uni-hannover.de/resource/e...,1
3,<http://eventKG.l3s.uni-hannover.de/resource/e...,<http://www.w3.org/2002/07/owl#sameAs,<http://yago-knowledge.org/resource/1981_Wimb...,<http://eventKG.l3s.uni-hannover.de/graph/yago,.,<http://eventKG.l3s.uni-hannover.de/resource/e...,1


In [50]:
# Write only the rows flagged as safe, restricted to the five source columns.
is_kept = df_events_filtered.to_keep == 1
quad_columns = ["subject", "predicate", "object", "meta", "."]
df_events_filtered.loc[is_kept, quad_columns].to_csv(
    'events_filtered.nq', sep='>', index=False, header=False
)

In [55]:
# List each distinct value of the `meta` column on its own line, under a header.
print("Different KG used:", *df_events_filtered.meta.unique(), sep="\n")

Different KG used:
 <http://eventKG.l3s.uni-hannover.de/graph/wikidata
 <http://eventKG.l3s.uni-hannover.de/graph/yago
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_en
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_de
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_fr
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_it
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_nl
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_es
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_no
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_sl
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_pl
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_ru
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_pt
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_da
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_hr
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_bg
 <http://eventKG.l3s.uni-hannover.de/graph/dbpedia_ro


In [58]:
# Number of rows per `meta` value, largest first.
rows_per_graph = df_events_filtered.groupby('meta').agg({"subject": "count"})
rows_per_graph.sort_values(by="subject", ascending=False)

Unnamed: 0_level_0,subject
meta,Unnamed: 1_level_1
<http://eventKG.l3s.uni-hannover.de/graph/wikidata,268010
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_en,110001
<http://eventKG.l3s.uni-hannover.de/graph/yago,89503
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_it,72788
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_fr,48951
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_de,48156
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_es,40712
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_ru,34588
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_pl,30594
<http://eventKG.l3s.uni-hannover.de/graph/dbpedia_nl,26941
