# Process Raw Knowledge Graphs in ClickHouse

In [1]:
from pandas import DataFrame
from IPython.display import HTML, display

from config import clickhouse_execute
from src.utils_clickhouse import load_from_ch_to_df

In [6]:
# Execution switches for the table-management cells of this notebook.
# Every switch is off by default, so the guarded DROP / CREATE / INSERT
# statements run only after a flag is explicitly set to True.
# Gate for the DROP TABLE IF EXISTS statements.
DROP_TABLE = False
# Gate for the CREATE TABLE IF NOT EXISTS statements.
CREATE_TABLE = False
# Gate for the INSERT INTO ... SELECT statements that populate the tables.
INSERT_INTO_TABLES = False

## Drop and Create the Processed Knowledge Graph Tables

In [3]:
# Remove every table this notebook derives from the raw graph.
if DROP_TABLE:
    for table_name in (
        'knowledge_graph_processed',
        'subject_processed_agg',
        'filtered_processed_nodes',
        'knowledge_graph_processed_filtered',
    ):
        clickhouse_execute(f'DROP TABLE IF EXISTS {table_name}')

# (Re)create the derived tables; the statements are issued in the listed order.
if CREATE_TABLE:
    create_statements = (
        # Triples with cleaned-up subject / predicate / object values.
        '''
            CREATE TABLE IF NOT EXISTS knowledge_graph_processed(
                source String,
                subject String,
                predicate String,
                object String)
            ENGINE = ReplacingMergeTree()
            ORDER BY (source, subject, predicate, object);
            ''',
        # Link count per subject.
        '''
            CREATE TABLE IF NOT EXISTS subject_processed_agg(
                subject String,
                number Int64)
            ENGINE = ReplacingMergeTree()
            ORDER BY (subject);
            ''',
        # Nodes retained by the filtering step.
        '''
            CREATE TABLE IF NOT EXISTS filtered_processed_nodes(
                node String)
            ENGINE = ReplacingMergeTree()
            ORDER BY (node);
            ''',
        # Triples whose endpoints are both retained nodes.
        '''
            CREATE TABLE IF NOT EXISTS knowledge_graph_processed_filtered(
                source String,
                subject String,
                predicate String,
                object String)
            ENGINE = ReplacingMergeTree()
            ORDER BY (source, subject, predicate, object);
            ''',
    )
    for ddl in create_statements:
        clickhouse_execute(ddl)

## Process Knowledge Graph

In [4]:
# Populate the derived tables. The four statements depend on each other and
# must run in this order: processed graph -> per-subject counts ->
# retained nodes -> filtered graph.
if INSERT_INTO_TABLES:
    # Step 1: copy knowledge_graph_raw into knowledge_graph_processed with
    # shortened values. For a value starting with '<', '>' is removed and the
    # last piece is kept after splitting on '/', then on '#', then on '='.
    # Any other value only has its double quotes removed.
    clickhouse_execute('''
            INSERT INTO knowledge_graph_processed
            SELECT
                source,
                startsWith(subject, '<')? splitByChar('=', splitByChar('#', splitByChar('/', replace(subject, '>', ''))[-1])[-1])[-1]: replace(subject, '"', '') AS subject,
                startsWith(predicate, '<')? splitByChar('=', splitByChar('#', splitByChar('/', replace(predicate, '>', ''))[-1])[-1])[-1]: replace(predicate, '"', '') AS predicate,
                startsWith(object, '<')? splitByChar('=', splitByChar('#', splitByChar('/', replace(object, '>', ''))[-1])[-1])[-1]: replace(object, '"', '') AS object
            FROM knowledge_graph_raw;
            ''')
    # Step 2: count the processed links of every subject.
    clickhouse_execute('''
            INSERT INTO subject_processed_agg
            SELECT
                subject,
                count() AS number
            FROM knowledge_graph_processed
            GROUP BY subject;
            ''')
    # Step 3: keep the nodes that occur as a subject and also as the object
    # of at least one link whose subject differs from its object.
    clickhouse_execute('''
            INSERT INTO filtered_processed_nodes
            SELECT t1.node AS node
            FROM (
                SELECT
                    subject AS node
                FROM subject_processed_agg) t1
            ALL INNER JOIN (
                SELECT
                    DISTINCT object AS node
                FROM knowledge_graph_processed
                PREWHERE subject != object) t2
            USING node;
            ''')
    # Step 4: keep only the links whose subject and object are both retained
    # nodes, dropping links that point from a node to itself.
    clickhouse_execute('''
            INSERT INTO knowledge_graph_processed_filtered
            SELECT *
            FROM knowledge_graph_processed
            PREWHERE subject IN (SELECT node FROM filtered_processed_nodes)
                 AND object IN (SELECT node FROM filtered_processed_nodes)
                 AND subject != object;
            ''')

## Knowledge Graph Parameters

In [5]:
def base_graph_parameters(knowledge_graph_table_name):
    """Query the headline size figures of a knowledge-graph table.

    The query yields four rows -- 'Links' (total row count), 'Subjects',
    'Predicates' and 'Objects' (distinct-value counts) -- ordered by count,
    largest first.

    Args:
        knowledge_graph_table_name: name of the ClickHouse table to measure.
            It is interpolated into the SQL text as-is, so pass trusted
            table names only.

    Returns:
        Whatever ``load_from_ch_to_df`` returns for the query, requested with
        the column titles 'Item types' and 'Number of unique items'.
    """
    query_template = '''
            SELECT item,
                   count_unique_items
            FROM (
                SELECT
                    'Links' AS item,
                    count() AS count_unique_items
                FROM {table}
                UNION ALL
                SELECT
                    'Subjects' AS item,
                    count(DISTINCT subject) AS count_unique_items
                FROM {table}
                UNION ALL
                SELECT
                    'Predicates' AS item,
                    count(DISTINCT predicate) AS count_unique_items
                FROM {table}
                UNION ALL
                SELECT
                    'Objects' AS item,
                    count(DISTINCT object) AS count_unique_items
                FROM {table})
                ORDER BY count_unique_items DESC;
               '''
    query = query_template.format(table=knowledge_graph_table_name)
    column_titles = ['Item types', 'Number of unique items']
    return load_from_ch_to_df(query, columns=column_titles)

def number_of_links_by_type_items(item_type, knowledge_graph_table_name, limit=10):
    """Query the most frequent values of one triple column.

    Args:
        item_type: column to group by; the notebook passes 'subject',
            'predicate' or 'object'. Interpolated into the SQL text as-is.
        knowledge_graph_table_name: ClickHouse table to read, also
            interpolated as-is.
        limit: maximum number of rows requested (default 10).

    Returns:
        Whatever ``load_from_ch_to_df`` returns for the query, requested with
        the column titles '<Item_type> item' and 'Number of Links'; rows are
        ordered by link count, largest first.
    """
    query_template = '''
                SELECT
                    {column},
                    count() AS number_of_links
                FROM {table}
                GROUP BY {column}
                ORDER BY number_of_links DESC
                LIMIT {row_limit};
                '''
    query = query_template.format(
        column=item_type,
        table=knowledge_graph_table_name,
        row_limit=limit,
    )
    column_titles = [item_type.capitalize()+' item', 'Number of Links']
    return load_from_ch_to_df(query, columns=column_titles)

def print_graph_parameters(knowledge_graph_table_name):
    """Render the summary tables of a knowledge-graph table in the notebook.

    Shows the base size figures first, then the top subjects, predicates and
    objects (in that order) by number of links, each under its own heading.
    Tables are rendered through ``to_html(index=False)``.

    Args:
        knowledge_graph_table_name: ClickHouse table to summarise.
    """
    def show(markup):
        # Every piece of output goes through the same HTML display path.
        display(HTML(markup))

    show('Base Graph Parameters')
    base_table = base_graph_parameters(knowledge_graph_table_name)
    show(base_table.to_html(index=False))

    for item_type in ('subject', 'predicate', 'object'):
        # Heading is shown before the query runs, as in the original flow.
        show(f'Top {item_type.capitalize()}s in the Graph')
        top_items = number_of_links_by_type_items(item_type, knowledge_graph_table_name)
        show(top_items.to_html(index=False))

### Raw Graph
The graph exactly as loaded from the source, with full URIs.

In [6]:
# Summary tables for the unprocessed graph table.
print_graph_parameters('knowledge_graph_raw')

Item types,Number of unique items
Links,125638794
Objects,15728434
Subjects,12072467
Predicates,554


Subject item,Number of Links
<http://caligraph.org/ontology/Person>,1151
<http://caligraph.org/ontology/Person_associated_with_places>,615
<http://caligraph.org/resource/Peregrine_falcon>,610
<http://caligraph.org/resource/Osprey>,597
<http://caligraph.org/resource/Barn_swallow>,595
<http://caligraph.org/resource/Forced_Labour_Convention>,590
<http://caligraph.org/resource/Ruddy_turnstone>,576
<http://caligraph.org/resource/Cattle_egret>,574
<http://caligraph.org/resource/Sanderling>,570
<http://caligraph.org/resource/Single_Convention_on_Narcotic_Drugs>,559


Predicate item,Number of Links
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,74980726
<http://www.w3.org/ns/prov#wasDerivedFrom>,21442564
<http://www.w3.org/2000/01/rdf-schema#label>,9213457
<http://www.w3.org/2002/07/owl#sameAs>,6306410
<http://caligraph.org/ontology/birthPlace>,1616471
<http://dbpedia.org/ontology/birthPlace>,1150300
<http://www.w3.org/2000/01/rdf-schema#subClassOf>,1146629
<http://caligraph.org/ontology/country>,775437
<http://caligraph.org/ontology/isPartOf>,651285
<http://caligraph.org/ontology/location>,607541


Object item,Number of Links
<http://www.w3.org/2002/07/owl#NamedIndividual>,8346439
<http://caligraph.org/ontology/Agent>,2511127
<http://caligraph.org/ontology/Person>,1855754
<http://caligraph.org/ontology/Place>,1383518
<http://caligraph.org/ontology/Time_period>,1140006
<http://dbpedia.org/ontology/Agent>,982775
<http://caligraph.org/ontology/Career_station>,977005
<http://en.wikipedia.org/wiki/Category:Living_people>,794362
<http://caligraph.org/ontology/Living_person>,793669
<http://www.w3.org/2002/07/owl#Class>,756236


### Processed Graph
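The raw graph with URI prefixes removed: each URI keeps only its last segment after splitting on `/`, then `#`, then `=`, and double quotes are stripped from all other values.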

In [7]:
# Summary tables for the graph after URI shortening.
print_graph_parameters('knowledge_graph_processed')

Item types,Number of unique items
Links,125617414
Objects,15079945
Subjects,8175366
Predicates,283


Subject item,Number of Links
Person,1163
Senior_High_School,675
Peregrine_falcon,619
Person_associated_with_places,615
Barn_swallow,601
Osprey,598
Forced_Labour_Convention,593
Ruddy_turnstone,577
Cattle_egret,575
Sanderling,570


Predicate item,Number of Links
type,75356823
wasDerivedFrom,21440327
label,9213389
sameAs,6303533
birthPlace,2766752
subClassOf,1146576
country,1111207
location,1077068
isPartOf,808021
team,648365


Object item,Number of Links
NamedIndividual,8342946
Agent,3493351
Person,2574223
Place,1929907
Time_period,1140006
Career_station,977005
Work,866273
Species,795959
Category:Living_people,794342
Living_person,793649


### Processed Filtered Graph
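The Processed Graph restricted to links whose subject and object are both nodes that occur as a subject and as the object of at least one non-self link; links from a node to itself are removed.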

In [8]:
# Summary tables for the graph after node filtering.
print_graph_parameters('knowledge_graph_processed_filtered')

Item types,Number of unique items
Links,3284488
Subjects,861781
Objects,360328
Predicates,206


Subject item,Number of Links
S,328
Osprey,314
Madonna_(entertainer),290
2,288
Winston_Churchill,273
World_Health_Organization,270
Mallard,265
Ruff,260
World_Heritage_Site,252
Michael_Jackson,236


Predicate item,Number of Links
type,1653595
subClassOf,1025896
onProperty,110457
hasValue,110179
birthPlace,76072
country,43819
genre,43109
occupation,41464
isPartOf,31106
hometown,30841


Object item,Number of Links
Person,102624
Agent,68336
Place,49607
Populated_place,41292
United_States,39524
Organisation,31093
Settlement,25113
Social_group,23027
birthPlace,21662
Organization,21034


In [8]:
# Count the distinct subjects of the raw Wikidata graph.
# The memory cap is raised first (the setting is expressed in bytes, so this
# is 200 GB) because an exact distinct count over this table is expensive.
# NOTE(review): the SET only affects the query below if clickhouse_execute
# and load_from_ch_to_df share one ClickHouse session -- confirm.
clickhouse_execute('SET max_memory_usage = 200000000000;')
# Pass an explicit column title: the SQL alias is not carried into the
# displayed frame, which otherwise shows an unlabelled column.
load_from_ch_to_df('''
    SELECT
        count(DISTINCT subject) as "Number of Subjects"
    FROM knowledge_graph_wikidata_raw;
''', columns=['Number of Subjects'])

Unnamed: 0,0
0,189553387


In [10]:
# Count the distinct objects of the raw Wikidata graph.
# The memory cap is raised first (the setting is expressed in bytes, so this
# is 200 GB) because an exact distinct count over this table is expensive.
# `clickhouse_execute` is the helper imported at the top of the notebook; the
# bare name `execute` is not defined anywhere in it and fails on a fresh kernel.
clickhouse_execute('SET max_memory_usage = 200000000000;')
res = clickhouse_execute('''
    SELECT
        count(DISTINCT object) as "Number of Objects"
    FROM knowledge_graph_wikidata_raw;
''')
# Display the raw query result as the cell output.
res

EOFError: Unexpected EOF while reading bytes

In [5]:
# Count the distinct predicates of the raw Wikidata graph.
# The memory cap is raised first (the setting is expressed in bytes, so this
# is 200 GB), mirroring the other distinct-count cells.
# NOTE(review): the SET only affects the query below if clickhouse_execute
# and load_from_ch_to_df share one ClickHouse session -- confirm.
clickhouse_execute('SET max_memory_usage = 200000000000;')
# Pass an explicit column title: the SQL alias is not carried into the
# displayed frame, which otherwise shows an unlabelled column.
load_from_ch_to_df('''
    SELECT
        count(DISTINCT predicate) as "Number of Predicate"
    FROM knowledge_graph_wikidata_raw;
''', columns=['Number of Predicates'])

Unnamed: 0,0
0,8193
