# Upload Knowledge Graph to Bostrom network

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import HTML, display

from config import ipfs_client
from src.utils_cyber import create_cls

/Users/sergenedashkovsky/Library/Python/3.9/lib/python/site-packages/ipfshttpclient/client/__init__.py:75: VersionMismatch: Unsupported daemon version '0.16.0' (not in range: 0.5.0 ≤ … < 0.9.0)


## Read and Preprocess Ontology Data

In [2]:
def read_and_preprocess_ontology(ontology_file_name: str, show_aggregations: bool = True):
    _ontology_df = pd.read_csv(ontology_file_name, sep=';')
    display(HTML(_ontology_df.head(50).to_html(index=False)))
    _ontology_df = _ontology_df[_ontology_df.subject != _ontology_df.object].drop_duplicates()
    _ontology_df["subject"] = _ontology_df["subject"].map(
        lambda x: x.replace('_', ' ').lower().split(' (')[0].split(',')[0])
    _ontology_df["object"] = _ontology_df["object"].map(
        lambda x: x.replace('_', ' ').lower().split(' (')[0].split(',')[0])
    _subject_object_list = set(list(_ontology_df["subject"]) + list(_ontology_df["object"]))
    print(f'Number of items: {len(_subject_object_list):>,}')
    print(f'Number of links: {len(_ontology_df):>,}')

    if show_aggregations:
        print('\nTop subjects')
        display(HTML(
            _ontology_df.groupby('subject')['object']
                .agg(np.count_nonzero).reset_index().sort_values('object', ascending=False).head(10)
                .to_html(index=False)))

        print('\nTop objects')
        display(HTML(
            _ontology_df.groupby('object')['subject']
                .agg(np.count_nonzero).reset_index().sort_values('subject', ascending=False).head(10)
                .to_html(index=False)))

    print('\nUpload CIDs to IPFS')
    ipfs_hashes_dict = {name: ipfs_client.add_str(name) for name in tqdm(_subject_object_list)}
    _ontology_df.loc[:, 'subject_ipfs_hash'] = _ontology_df['subject'].map(lambda x: ipfs_hashes_dict[x])
    _ontology_df.loc[:, 'object_ipfs_hash'] = _ontology_df['object'].map(lambda x: ipfs_hashes_dict[x])

    return _ontology_df

In [3]:
ontology_df = read_and_preprocess_ontology(ontology_file_name='data/caligraph-instances_relations.nt.csv')
ontology_df.to_csv('data/caligraph-instances_relations_for_upload.nt.csv')

subject,object
20th_Century_Studios,California
20th_Century_Studios,Entertainment
20th_Century_Studios,Film
20th_Century_Studios,Los_Angeles
20th_Century_Studios,Marketing
20th_Century_Studios,Mass_media
20th_Century_Studios,Subsidiary
20th_Century_Studios,The_Walt_Disney_Company
20th_Century_Studios,United_States
20th_Century_Studios,United_States


Number of items: 2,702
Number of links: 8,185

Top subjects


subject,object
world war ii,62
english language,60
history,60
french language,42
russian language,36
david bowie,34
arabic,33
world war i,33
bob dylan,31
john lennon,28



Top objects


object,subject
united states,518
united kingdom,326
city,302
england,232
japan,208
canada,201
entertainment,201
india,114
mass media,100
california,100



Upload CIDs to IPFS


  0%|          | 0/2702 [00:00<?, ?it/s]

## Create cyberlinks

In [4]:
link_candidates = ontology_df[['object_ipfs_hash', 'subject_ipfs_hash']].values.tolist()
link_candidates = link_candidates[0:]
link_candidates[:5]

[['QmSmYJeaQUMYMuAu6SASVsgf9etYnVJLx73rQNfuVL5vtk',
  'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7'],
 ['QmTL8q8XdwaQ8UgfiF1V8ioh77rAjjn39ggBi65jbxgDJS',
  'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7'],
 ['QmYqc8qLxBMsD4vsEEGcxE7nDgZUsRp62wNx1xgsXmuQqt',
  'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7'],
 ['Qmd7dBwwfb2UVfFur72xmTSTPx1AwS1AYxZc8MrUPcnKyu',
  'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7'],
 ['QmQhZNPQMyjmK6H4gqwRhKLK2QL5k8Epk4iXKXuDM7GKEi',
  'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7']]

In [None]:
NUMBER_CYBERLINK_IN_TX = 1000

link_candidates_chunks = [link_candidates[i: i+ NUMBER_CYBERLINK_IN_TX] for i in range(0, len(link_candidates), NUMBER_CYBERLINK_IN_TX)]

res = []
for link_candidates_chunk in tqdm(link_candidates_chunks):
    res.append(create_cls(link_candidates=link_candidates_chunk, print_message=True))


  0%|          | 0/164 [00:00<?, ?it/s]

{'code': 0,
 'codespace': '',
 'data': None,
 'gas_used': 1157565,
 'gas_wanted': 2200000,
 'height': 2702133,
 'info': None,
 'logs': [{'events': [{'attributes': [{'key': 'particleFrom',
                                       'value': 'QmSmYJeaQUMYMuAu6SASVsgf9etYnVJLx73rQNfuVL5vtk'},
                                      {'key': 'particleTo',
                                       'value': 'QmT4PYK545HFrBuntx4V6YvL61om2pXp1r7Kc12DVNm1m7'},
                                      {'key': 'neuron',
                                       'value': 'bostrom1cj8j6pc3nda8v708j3s4a6gq2jrnue7j857m9t'}],
                       'type': 'cyberlink'},
                      {'attributes': [{'key': 'action',
                                       'value': '/cyber.graph.v1beta1.MsgCyberlink'},
                                      {'key': 'module', 'value': 'graph'},
                                      {'key': 'sender',
                                       'value': 'bostrom1cj8j6pc3nda8v708j3s4a6g