# Import knowledge graphs in RDF N-Triples format (`.nt` files) into ClickHouse

In [2]:
import os.path
import shlex
from os import system

from clickhouse_driver import Client
from pandas import DataFrame

In [19]:
# --- Connection and input location -------------------------------------------
CLICKHOUSE_HOST = 'localhost'
# NOTE(review): COMPRESSION is defined here but is not passed to Client in the
# visible code -- confirm whether it is meant to be used.
COMPRESSION = 'lz4'
# Directory prefix prepended to file names by upload_to_ch (its default dir_url).
DIR_URL = 'CaLiGraph/'

# --- Table management flags ---------------------------------------------------
# Each flag gates one DROP / CREATE statement in the table-setup cell.
DROP_TABLE_CALIGRAPH, CREATE_TABLE_CALIGRAPH = False, False
DROP_TABLE_WIKIDATA, CREATE_TABLE_WIKIDATA = True, True

# --- Shared ClickHouse handle -------------------------------------------------
client = Client(host=CLICKHOUSE_HOST)
# Short alias used by the rest of the notebook to run SQL.
execute = client.execute

## Drop and create the `knowledge_graph_raw` and `knowledge_graph_wikidata_raw` tables

In [20]:
def load_from_ch_to_df(sql_query, columns=None):
    """Run a query through the notebook-level ``execute`` and return a DataFrame.

    Parameters
    ----------
    sql_query
        Query text handed unchanged to ``execute``.
    columns
        Optional column labels forwarded to ``DataFrame``; when ``None`` the
        DataFrame default labels are used.

    Returns
    -------
    DataFrame
        One row per record returned by ``execute``.
    """
    fetched_rows = execute(sql_query)
    result_frame = DataFrame(fetched_rows, columns=columns)
    return result_frame

# Both raw tables share one schema, so the DDL lives in a single template
# instead of being copy-pasted once per table.
RAW_TABLE_DDL = '''
            CREATE TABLE IF NOT EXISTS {table}(
                    source String,
                    subject String,
                    predicate String,
                    object String
                )
            ENGINE = ReplacingMergeTree()
            ORDER BY (source, subject, predicate, object)
            '''


def recreate_raw_table(table_name, drop=False, create=False):
    """Optionally drop and/or create one raw triple table.

    Parameters
    ----------
    table_name
        Name of the table; interpolated verbatim into the DDL, so it must be
        a trusted identifier.
    drop
        When true, run ``DROP TABLE IF EXISTS`` for the table first.
    create
        When true, run ``CREATE TABLE IF NOT EXISTS`` with the shared schema
        (source, subject, predicate, object; ReplacingMergeTree ordered by all
        four columns).
    """
    if drop:
        execute('DROP TABLE IF EXISTS {table}'.format(table=table_name))
    if create:
        execute(RAW_TABLE_DDL.format(table=table_name))


# Same statements, in the same order, as the original per-table if-blocks.
recreate_raw_table('knowledge_graph_raw',
                   drop=DROP_TABLE_CALIGRAPH,
                   create=CREATE_TABLE_CALIGRAPH)
recreate_raw_table('knowledge_graph_wikidata_raw',
                   drop=DROP_TABLE_WIKIDATA,
                   create=CREATE_TABLE_WIKIDATA)


## Upload data to ClickHouse
### Bash scripts
#### Upload CSV
`cat test_insert.csv | clickhouse-client --query "
    INSERT INTO knowledge_graph_raw
    SELECT
        'test_insert.csv',
        subject,
        predicate,
        object
    FROM input('index String, subject String, predicate String, object String')
    FORMAT CSV" `
#### Upload NT
`cat test_insert.nt | clickhouse-client --query "
    INSERT INTO knowledge_graph_raw
    SELECT
        'test_insert.nt',
        subject,
        object = '' OR object is Null? Null: predicate,
        object = '' OR object is Null? predicate: object
    FROM (
        SELECT
            replace(subject_, 'http://', '') as subject,
            replace(predicate_, 'http://', '') as predicate,
            replace(object_, 'http://', '') as object
        FROM input('subject_ String, predicate_ String, object_ String')
        )
    FORMAT Regexp
    SETTINGS format_regexp='[<|\"](.+?)[>|\"] [<|\"](.+?)[>|\"] [<|\"](.+?)[>|\"]\s\.', format_regexp_escaping_rule='Escaped'" `
#### Upload NT without transformation
`cat test_insert.nt | clickhouse-client --query "
    INSERT INTO knowledge_graph_raw
    SELECT
        'test_insert.nt',
        subject_,
        object_ = '' OR object_ is Null? Null: predicate_,
        object_ = '' OR object_ is Null? predicate_: object_
    FROM input('subject_ String, predicate_ String, object_ String')
    FORMAT Regexp
    SETTINGS format_regexp='([<|\"].+?[>|\"])\s([<|\"].+?[>|\"])\s([<|\"].+?[>|\"])\s\.', format_regexp_escaping_rule='Escaped'" `
#### Upload NT from gz-archive
`zcat latest-truthy.nt.gz | clickhouse-client --query "
    INSERT INTO knowledge_graph_wikidata_raw
    SELECT
        'latest-truthy.nt.gz',
        subject,
        object = '' OR object is Null? Null: predicate,
        object = '' OR object is Null? predicate: object
    FROM (
        SELECT
        replace(subject_, 'http://', '') as subject,
        replace(predicate_, 'http://', '') as predicate,
        replace(object_, 'http://', '') as object
        FROM input('subject_ String, predicate_ String, object_ String') )
        FORMAT Regexp
        SETTINGS format_regexp='[<|\"|_](.+?)[\s][<|\"|_](.+?)[\s][<|\"|_](.+?)[\s\.]',
                 format_regexp_escaping_rule='Escaped'"
`

### File list for uploading

In [4]:
# N-Triples dumps to upload, in upload order; names are joined onto the
# upload directory by upload_to_ch.
file_names = [
    'caligraph-instances_dbpedia-mapping.nt',
    'caligraph-instances_relations.nt',
    'caligraph-instances_provenance.nt',
    'caligraph-instances_types.nt',
    'caligraph-instances_labels.nt',
    'caligraph-instances_transitive-types.nt',
    'caligraph-ontology.nt',
    'caligraph-ontology_provenance.nt',
    'caligraph-ontology_dbpedia-mapping.nt',
    'dbpedia_caligraph-relations.nt',
    'dbpedia_caligraph-types.nt',
    'dbpedia_caligraph-instances.nt',
]

### Code

In [17]:
# Patterns for ClickHouse's `Regexp` input format, written exactly as
# clickhouse-client must receive them. They are raw strings so that `\s` / `\.`
# are not treated as (invalid) Python escapes, and they carry no shell escaping
# because the whole query is shell-quoted in one step by upload_to_ch.
# Captures each term without its surrounding <...> or "..." delimiters.
_NT_REGEXP_STRIPPED = r'[<|"](.+?)[>|"] [<|"](.+?)[>|"] [<|"](.+?)[>|"]\s\.'
# Captures each term including its delimiters.
_NT_REGEXP_VERBATIM = r'([<|"].+?[>|"])\s([<|"].+?[>|"])\s([<|"].+?[>|"])\s\.'
# Variant used for gz archives: drops the opening delimiter and splits terms on
# whitespace.
# NOTE(review): terms containing spaces would be split by this pattern -- confirm
# this is acceptable for the archives being loaded.
_NT_REGEXP_GZ = r'[<|"|_](.+?)[\s][<|"|_](.+?)[\s][<|"|_](.+?)[\s\.]'


def _build_insert_query(file_name, transformation, gz, file_type, target_table):
    """Return the INSERT ... SELECT statement for one upload mode, or None if unsupported."""
    # Escape backslashes and single quotes so the file name is a valid SQL string literal.
    source_literal = "'" + file_name.replace('\\', '\\\\').replace("'", "\\'") + "'"

    if file_type == 'csv':
        return (
            f"INSERT INTO {target_table} "
            f"SELECT {source_literal}, subject, predicate, object "
            "FROM input('index String, subject String, predicate String, object String') "
            "FORMAT CSV"
        )

    if file_type == 'nt' and transformation:
        regexp = _NT_REGEXP_GZ if gz else _NT_REGEXP_STRIPPED
        return (
            f"INSERT INTO {target_table} "
            f"SELECT {source_literal}, subject, "
            "object = '' OR object is Null? Null: predicate, "
            "object = '' OR object is Null? predicate: object "
            "FROM ( SELECT "
            "replace(subject_, 'http://', '') as subject, "
            "replace(predicate_, 'http://', '') as predicate, "
            "replace(object_, 'http://', '') as object "
            "FROM input('subject_ String, predicate_ String, object_ String') ) "
            "FORMAT Regexp "
            f"SETTINGS format_regexp='{regexp}', format_regexp_escaping_rule='Escaped'"
        )

    # Untransformed N-Triples. The file-name suffix test is kept for backward
    # compatibility; checking file_type as well lets '*.nt.gz' archives through.
    if file_type == 'nt' or file_name.endswith('nt'):
        return (
            f"INSERT INTO {target_table} "
            f"SELECT {source_literal}, subject_, "
            "object_ = '' OR object_ is Null? Null: predicate_, "
            "object_ = '' OR object_ is Null? predicate_: object_ "
            "FROM input('subject_ String, predicate_ String, object_ String') "
            "FORMAT Regexp "
            f"SETTINGS format_regexp='{_NT_REGEXP_VERBATIM}', format_regexp_escaping_rule='Escaped'"
        )

    return None


def upload_to_ch(file_name, dir_url=DIR_URL, transformation=True, gz=False, file_type='nt', target_table='knowledge_graph_raw'):
    """Stream one file into ClickHouse by piping it to ``clickhouse-client``.

    Parameters
    ----------
    file_name
        Name of the file inside ``dir_url``; also stored in the ``source`` column.
    dir_url
        Path prefix concatenated directly in front of ``file_name``.
    transformation
        For N-Triples input: when true, ``http://`` is removed from every term
        and the surrounding delimiters are not captured; when false, terms are
        stored as matched, delimiters included.
    gz
        When true the file is read with ``zcat`` instead of ``cat`` and, for
        transformed N-Triples, the whitespace-splitting pattern is used.
    file_type
        ``'csv'`` or ``'nt'``.
    target_table
        Destination table. Interpolated verbatim into the SQL, so it must be a
        trusted identifier.

    Returns
    -------
    None
        The outcome is reported with ``print``.
    """
    insert_query = _build_insert_query(file_name, transformation, gz, file_type, target_table)
    if insert_query is None:
        print('Unsupported file type')
        return

    file_url = dir_url + file_name
    # The exit status of `cat f | clickhouse-client` is the client's alone, so a
    # missing file would otherwise be reported as a successful (empty) upload.
    if not os.path.exists(file_url):
        print(f'{file_name} not uploaded, file {file_url} does not exist')
        return

    reader = 'zcat' if gz else 'cat'
    # shlex.quote passes the path and the query to the shell verbatim, whatever
    # quotes, spaces or `$` they contain.
    bash_command = '{0} {1} | clickhouse-client --query {2}'.format(
        reader, shlex.quote(file_url), shlex.quote(insert_query))
    result = system(bash_command)
    if result == 0:
        print(f'{file_name} successfully uploaded')
    else:
        print(f'{file_name} not uploaded, error code {result}')

In [6]:
# Upload every file listed in `file_names` from the default directory into the
# default table, with transformation disabled.
for file_name in file_names:
    upload_to_ch(file_name, transformation=False)

caligraph-instances_dbpedia-mapping.nt successfully uploaded
caligraph-instances_relations.nt successfully uploaded
caligraph-instances_provenance.nt successfully uploaded
caligraph-instances_types.nt successfully uploaded
caligraph-instances_labels.nt successfully uploaded
caligraph-instances_transitive-types.nt successfully uploaded
caligraph-ontology.nt successfully uploaded
caligraph-ontology_provenance.nt successfully uploaded
caligraph-ontology_dbpedia-mapping.nt successfully uploaded
dbpedia_caligraph-relations.nt successfully uploaded
dbpedia_caligraph-types.nt successfully uploaded
dbpedia_caligraph-instances.nt successfully uploaded


In [7]:
# execute returns a list of row tuples; the single count is the first cell.
link_count = execute('SELECT count() FROM knowledge_graph_raw')[0][0]
print(f"Number of links: {link_count:>,}")

Number of links: 125,638,829


In [8]:
# Show a small sample of the loaded triples; the DataFrame is the cell's value,
# so the notebook renders it.
print('Example of data')
load_from_ch_to_df(
    'SELECT * FROM knowledge_graph_raw LIMIT 20',
    columns=['source', 'subject', 'predicate', 'object'],
)

Example of data


Unnamed: 0,source,subject,predicate,object
0,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!!!>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!!!>
1,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!!!_(album)>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!!!_(album)>
2,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!!Destroy-Oh-Bo...,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!!Destroy-Oh-Boy!!>
3,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Action_Pact!>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Action_Pact!>
4,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Arriba!_La_Pac...,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Arriba!_La_Pacha...
5,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Hero>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Hero>
6,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Hero_(album)>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Hero_(album)>
7,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Kung_language>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Kung_language>
8,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!Oka_Tokat>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!Oka_Tokat>
9,caligraph-instances_dbpedia-mapping.nt,<http://caligraph.org/resource/!PAUS3>,<http://www.w3.org/2002/07/owl#sameAs>,<http://dbpedia.org/resource/!PAUS3>


## Upload Wikidata from gz-archive

In [9]:
# Load the gzipped Wikidata dump into its own table, with transformation enabled.
upload_to_ch(
    file_name='latest-truthy.nt.gz',
    dir_url='Wikidata/',
    target_table='knowledge_graph_wikidata_raw',
    transformation=True,
    gz=True,
)