## Event types across EventKG

In [1]:
import io
import requests
from tqdm import tqdm
import pandas as pd

### Number of sub-events per coarser event

In [4]:
import pandas as pd
import plotly.express as px

# Load the sub-event counts and show how they are distributed, as a percentage
# of all rows in the file.
df = pd.read_csv("dbpedia-sub-events.csv")
fig = px.histogram(data_frame=df, x="nbSubEvent", histnorm="percent")
fig.show()

In [2]:
import plotly.graph_objects as go

import numpy as np

# Same distribution as a cumulative histogram (in percent).
x = df["nbSubEvent"].values
fig = go.Figure(
    data=[
        go.Histogram(x=x, histnorm="percent", cumulative_enabled=True),
    ]
)

fig.show()

In [5]:
# Report the shape of the rows left after applying each strict lower bound
# on the number of sub-events.
for val in (1, 5, 10, 30, 50, 100):
    above = df.loc[df["nbSubEvent"] > val]
    print(f"# of events with strictly more than {val} sub events: {above.shape}")

# of events with strictly more than 1 sub events: (10905, 2)
# of events with strictly more than 5 sub events: (2355, 2)
# of events with strictly more than 10 sub events: (1333, 2)
# of events with strictly more than 30 sub events: (538, 2)
# of events with strictly more than 50 sub events: (208, 2)
# of events with strictly more than 100 sub events: (111, 2)


### Types of events from EventKG

In [7]:
# SPARQL endpoint that the type queries are sent to.
ENDPOINT = "http://eventkginterface.l3s.uni-hannover.de/sparql"
# Ask for CSV so the response body can be handed straight to pandas.read_csv.
HEADERS = {"Accept": "text/csv"}

In [8]:
# SPARQL template listing the distinct rdf:type values of the sem:Event node
# that is owl:sameAs a given resource, keeping only types whose IRI starts with
# "http://dbpedia". The literal text `event-to-replace` is a placeholder that
# must be substituted with the resource URI before the query is sent.
# NOTE(review): the owl:/rdf:/sem: prefixes are not declared here; presumably
# the endpoint predefines them - confirm.
QUERY_TYPE_TEMPLATE = """
SELECT DISTINCT ?eventType 
WHERE
{
?event owl:sameAs <event-to-replace> .
?event rdf:type sem:Event .
?event rdf:type ?eventType .
FILTER( strStarts( str(?eventType), "http://dbpedia") ) .
}
"""

In [43]:
def get_response_df(event, nb, timeout=60):
    """Query the endpoint for the DBpedia types of one event.

    Substitutes `event` for the ``event-to-replace`` placeholder in
    QUERY_TYPE_TEMPLATE, sends the query to ENDPOINT as a GET request with
    HEADERS, and parses the CSV response body into a DataFrame.

    Parameters
    ----------
    event : str
        Resource URI inserted into the query.
    nb : int
        Value written to the ``nbSubEvent`` column of every returned row
        (presumably the event's sub-event count; verify against the caller).
    timeout : float, optional
        Seconds to wait for the endpoint before giving up. Defaults to 60 so
        that a stalled server cannot hang the whole crawl.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV response with two added columns, ``event`` and
        ``nbSubEvent``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status code.
    requests.Timeout
        If the endpoint does not answer within `timeout` seconds.
    """
    response = requests.get(
        ENDPOINT, headers=HEADERS,
        params={"query": QUERY_TYPE_TEMPLATE.replace("event-to-replace", event)},
        timeout=timeout)
    # Without this check an HTTP error page would be parsed as CSV and its
    # content would silently end up in the results as bogus event types.
    response.raise_for_status()
    curr_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    curr_df["event"] = event
    curr_df["nbSubEvent"] = nb
    return curr_df

In [44]:
def get_all_event_types(df_events):
    """Fetch the types of every event in `df_events` and stack the results.

    Calls get_response_df once per row, passing the row's ``eventKG`` value as
    the event and its ``nbSubEvent`` value as `nb`, while showing a progress
    bar.

    Parameters
    ----------
    df_events : pandas.DataFrame
        Must have the columns ``eventKG`` and ``nbSubEvent``.

    Returns
    -------
    pandas.DataFrame
        All per-event frames concatenated row-wise. The columns start with
        ``eventType``, ``event`` and ``nbSubEvent``; the frame is empty when
        `df_events` has no rows. Each per-event index is kept as returned.
    """
    # Start from an empty frame with the expected columns so the result keeps
    # that layout even when there is nothing to fetch.
    frames = [pd.DataFrame(columns=['eventType', 'event', "nbSubEvent"])]
    for _, row in tqdm(df_events.iterrows(), total=df_events.shape[0]):
        frames.append(get_response_df(event=row.eventKG, nb=row.nbSubEvent))
    # Concatenate once at the end: re-concatenating inside the loop copies the
    # accumulated rows on every iteration, which is quadratic in the row count.
    return pd.concat(frames, axis=0)

In [5]:
threshold = 10
# Keep only the events with strictly more than `threshold` sub-events.
df_events = df[df.nbSubEvent > threshold]
# Querying the endpoint for every event is slow, so reuse the cached CSV when
# it exists and only fetch when it is missing. Either way `df_types` is defined
# after this cell, so the notebook runs from a fresh kernel.
try:
    # index_col=0 restores the index written by to_csv, so saving the frame
    # again does not add an extra "Unnamed" column.
    df_types = pd.read_csv("events_types.csv", index_col=0)
except FileNotFoundError:
    df_types = get_all_event_types(df_events)

In [6]:
# Write the event types to disk; a later cell reads this file back.
df_types.to_csv(path_or_buf="events_types.csv")

In [8]:
threshold = 10
# Read the saved event types and drop every column whose name contains
# "Unnamed" (the index column written by to_csv comes back under such a name).
df_types = pd.read_csv("events_types.csv")
df_types = df_types[[name for name in df_types.columns if not ("Unnamed" in name)]]
print(f"# of events with strictly more than {threshold} sub events: {len(df_events)}")
print(f"# of these events with a type: {len(df_types.event.unique())}")
print(f"# of unique event types: {len(df_types.eventType.unique())}")
df_types

# of events with strictly more than 10 sub events: 1333
# of these events with a type: 967
# of unique event types: 30


Unnamed: 0,eventType,event,nbSubEvent
0,http://dbpedia.org/ontology/MilitaryConflict,http://dbpedia.org/resource/World_War_II,1437
1,http://dbpedia.org/ontology/HistoricalPeriod,http://dbpedia.org/resource/World_War_II,1437
2,http://dbpedia.org/ontology/Activity,http://dbpedia.org/resource/World_War_II,1437
3,http://dbpedia.org/ontology/MilitaryConflict,http://dbpedia.org/resource/Coalition_Wars,996
4,http://dbpedia.org/ontology/Activity,http://dbpedia.org/resource/Coalition_Wars,996
...,...,...,...
2245,http://dbpedia.org/ontology/MilitaryConflict,http://dbpedia.org/resource/Western_Allied_inv...,11
2246,http://dbpedia.org/ontology/Work,http://dbpedia.org/resource/Western_Allied_inv...,11
2247,http://dbpedia.org/ontology/Award,http://dbpedia.org/resource/Western_Allied_inv...,11
2248,http://dbpedia.org/ontology/Activity,http://dbpedia.org/resource/Western_Allied_inv...,11


In [9]:
# For each event type: the number of distinct events carrying it, and the
# largest and smallest sub-event counts among them.
grouped = df_types.groupby("eventType").agg(
    {
        "event": "nunique",
        "nbSubEvent": ["max", "min"],
    }
)
# Show the most frequent types first.
grouped.sort_values(by=("event", "nunique"), ascending=False)

Unnamed: 0_level_0,event,nbSubEvent,nbSubEvent
Unnamed: 0_level_1,nunique,max,min
eventType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
http://dbpedia.org/ontology/Activity,592,1437,11
http://dbpedia.org/ontology/Contest,476,434,11
http://dbpedia.org/ontology/SportsEvent,471,178,11
http://dbpedia.org/ontology/MilitaryConflict,258,1437,11
http://dbpedia.org/ontology/Olympics,135,384,11
http://dbpedia.org/ontology/TennisTournament,73,29,11
http://dbpedia.org/ontology/OlympicEvent,55,47,11
http://dbpedia.org/ontology/CyclingRace,40,67,11
http://dbpedia.org/ontology/MotorsportSeason,32,37,11
http://dbpedia.org/ontology/Sport,21,67,11


In [10]:
# Save the per-type summary table.
grouped.to_csv(path_or_buf="grouped_event_types.csv")

In [18]:
# Event type inspected by the following cells.
elt = "http://dbpedia.org/ontology/Election"

In [20]:
# Events that carry the selected type.
df_types.loc[df_types["eventType"] == elt, "event"].values

array(['http://dbpedia.org/resource/2020_United_States_presidential_election',
       'http://dbpedia.org/resource/2014_Irish_local_elections'],
      dtype=object)

In [14]:
# Every type row of the events that carry the selected type.
events = df_types.loc[df_types["eventType"] == elt, "event"].values
df_types.loc[df_types["event"].isin(events)]

Unnamed: 0,eventType,event,nbSubEvent
20,http://dbpedia.org/ontology/Olympics,http://dbpedia.org/resource/2016_Summer_Olympics,384
21,http://dbpedia.org/ontology/Contest,http://dbpedia.org/resource/2016_Summer_Olympics,384
24,http://dbpedia.org/ontology/Olympics,http://dbpedia.org/resource/2012_Summer_Olympics,369
25,http://dbpedia.org/ontology/Contest,http://dbpedia.org/resource/2012_Summer_Olympics,369
26,http://dbpedia.org/ontology/Olympics,http://dbpedia.org/resource/2008_Summer_Olympics,351
...,...,...,...
2232,http://dbpedia.org/ontology/SportsEvent,http://dbpedia.org/resource/Speed_skating_at_t...,11
2233,http://dbpedia.org/ontology/Olympics,http://dbpedia.org/resource/Speed_skating_at_t...,11
2234,http://dbpedia.org/ontology/OlympicEvent,http://dbpedia.org/resource/Speed_skating_at_t...,11
2235,http://dbpedia.org/ontology/SportsEvent,http://dbpedia.org/resource/Speed_skating_at_t...,11


In [15]:
# Among the selected events, count the distinct events per type.
(
    df_types.loc[df_types["event"].isin(events)]
    .groupby("eventType")
    .agg({"event": "nunique"})
)

Unnamed: 0_level_0,event
eventType,Unnamed: 1_level_1
http://dbpedia.org/ontology/Activity,47
http://dbpedia.org/ontology/Article,1
http://dbpedia.org/ontology/Contest,76
http://dbpedia.org/ontology/OlympicEvent,7
http://dbpedia.org/ontology/Olympics,135
http://dbpedia.org/ontology/SportsEvent,66


In [16]:
# Hand-picked DBpedia ontology types that the cells below exclude from the
# event types.
manual_filter_out = [
    "http://dbpedia.org/ontology/WomensTennisAssociationTournament",
    "http://dbpedia.org/ontology/Building",
    "http://dbpedia.org/ontology/Settlement",
    "http://dbpedia.org/ontology/Rebellion",
    "http://dbpedia.org/ontology/Painting",
    "http://dbpedia.org/ontology/Media",
    "http://dbpedia.org/ontology/Profession",
    "http://dbpedia.org/ontology/OldTerritory",
    "http://dbpedia.org/ontology/HistoricalPeriod",
    "http://dbpedia.org/ontology/FootballMatch",
    "http://dbpedia.org/ontology/Award",
    "http://dbpedia.org/ontology/Work",
    "http://dbpedia.org/ontology/Name",
    "http://dbpedia.org/ontology/InternationalFootballLeagueEvent",
    "http://dbpedia.org/ontology/Article",
    "http://dbpedia.org/ontology/Sport",
    "http://dbpedia.org/ontology/Activity",
]

In [17]:
# Event types that are not in the manual exclusion list.
grouped.index[~grouped.index.isin(manual_filter_out)].unique()

Index(['http://dbpedia.org/ontology/Contest',
       'http://dbpedia.org/ontology/CyclingRace',
       'http://dbpedia.org/ontology/Election',
       'http://dbpedia.org/ontology/Event',
       'http://dbpedia.org/ontology/MilitaryConflict',
       'http://dbpedia.org/ontology/MotorsportSeason',
       'http://dbpedia.org/ontology/OlympicEvent',
       'http://dbpedia.org/ontology/Olympics',
       'http://dbpedia.org/ontology/Organisation',
       'http://dbpedia.org/ontology/SoccerTournament',
       'http://dbpedia.org/ontology/SportsEvent',
       'http://dbpedia.org/ontology/TennisTournament',
       'http://dbpedia.org/ontology/Tournament'],
      dtype='object', name='eventType')

In [141]:
# Number of distinct events that have at least one type outside the exclusion list.
df_types.loc[~df_types["eventType"].isin(manual_filter_out), "event"].unique().shape

(965,)

In [146]:
# Selected events that have no row at all in df_types.
events_no_label = set(df_events["eventKG"].unique()) - set(df_types["event"].unique())
df_events.loc[df_events["eventKG"].isin(events_no_label)]

Unnamed: 0,eventKG,nbSubEvent
3,http://dbpedia.org/resource/World_War_I,665
24,http://dbpedia.org/resource/Vietnam_War,290
36,http://dbpedia.org/resource/IndyCar_Series,205
47,http://dbpedia.org/resource/Miami_Open_(tennis),169
49,http://dbpedia.org/resource/Swimming_at_the_20...,167
...,...,...
1326,http://dbpedia.org/resource/Volleyball_at_the_...,11
1328,http://dbpedia.org/resource/Weightlifting_at_t...,11
1329,http://dbpedia.org/resource/Weightlifting_at_t...,11
1330,http://dbpedia.org/resource/Weightlifting_at_t...,11


In [147]:
# Print each event without a type, followed by its sub-event count.
unlabelled = df_events.loc[df_events["eventKG"].isin(events_no_label)]
for uri, count in zip(unlabelled["eventKG"], unlabelled["nbSubEvent"]):
    print(uri, count)

http://dbpedia.org/resource/World_War_I 665
http://dbpedia.org/resource/Vietnam_War 290
http://dbpedia.org/resource/IndyCar_Series 205
http://dbpedia.org/resource/Miami_Open_(tennis) 169
http://dbpedia.org/resource/Swimming_at_the_2004_Summer_Paralympics 167
http://dbpedia.org/resource/Russian_Civil_War 164
http://dbpedia.org/resource/Sixty_Years'_War 161
http://dbpedia.org/resource/Stockholm_Open 149
http://dbpedia.org/resource/Swimming_at_the_2012_Summer_Paralympics 149
http://dbpedia.org/resource/ATP_Finals 141
http://dbpedia.org/resource/Swimming_at_the_2008_Summer_Paralympics 141
http://dbpedia.org/resource/War_in_Afghanistan_(2001–present) 137
http://dbpedia.org/resource/Paris_Masters 135
http://dbpedia.org/resource/Iraq_War 120
http://dbpedia.org/resource/Barcelona_Open_(tennis) 104
http://dbpedia.org/resource/Iraqi_Civil_War_(2014–2017) 98
http://dbpedia.org/resource/Reconquista 90
http://dbpedia.org/resource/Women's_Stuttgart_Open 88
http://dbpedia.org/resource/Pacific_Coast_C