In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from src.utils_ipfs import upload_text
from src.utils_cyber import create_cls

## Read CaLiGraph Ontology Preprocessing Data

In [2]:
# Load the relation pairs; the file is semicolon-delimited.
relations_csv = 'data/dbpedia_caligraph-relations.nt.csv'
caligraph_ontology_df = pd.read_csv(relations_csv, sep=';')
# Preview the first rows.
caligraph_ontology_df.head()

Unnamed: 0,subject,object
0,20th_Century_Studios,California
1,20th_Century_Studios,Entertainment
2,20th_Century_Studios,Los_Angeles
3,20th_Century_Studios,Marketing
4,20th_Century_Studios,Mass_media


In [3]:
# Both label columns must be fully populated; failing here keeps a missing label
# from silently flowing into the later upload and link-creation steps.
assert caligraph_ontology_df[["subject", "object"]].notna().all().all(), "missing subject/object label"

# Normalise both label columns the same way: underscores become spaces and the
# text is lower-cased, using pandas' vectorised string methods.
for label_column in ("subject", "object"):
    caligraph_ontology_df[label_column] = (
        caligraph_ontology_df[label_column]
        .str.replace('_', ' ', regex=False)
        .str.lower()
    )

# Every distinct label that occurs on either side of a relation. This is a set,
# despite the `_list` suffix, which other cells rely on.
subject_object_list = set(caligraph_ontology_df["subject"]) | set(caligraph_ontology_df["object"])
print(f'Number of items: {len(subject_object_list):>,}')
print(f'Number of links: {len(caligraph_ontology_df):>,}')

# Ten subjects with the most links. The built-in group count replaces
# `np.count_nonzero`, which tallies truthy values rather than rows.
(
    caligraph_ontology_df
    .groupby('subject')['object']
    .count()
    .reset_index()
    .sort_values('object', ascending=False)
    .head(10)
)

Number of items: 1,841
Number of links: 5,824


Unnamed: 0,subject,object
586,history,89
1534,world war ii,65
426,english language,59
478,french language,42
1533,world war i,34
58,arabic,33
155,bob dylan,31
1475,video game,30
1167,russian language,29
1090,prince (musician,27


In [4]:
# Ten objects that the most subjects link to. Uses the built-in group count
# instead of `np.count_nonzero`, which tallies truthy values rather than rows.
(
    caligraph_ontology_df
    .groupby('object')['subject']
    .count()
    .reset_index()
    .sort_values('subject', ascending=False)
    .head(10)
)

Unnamed: 0,object,subject
936,united states,500
177,city,204
419,japan,201
142,canada,179
935,united kingdom,169
264,entertainment,168
261,england,147
810,singapore,76
527,mass media,72
392,india,70


## Upload to IPFS

In [5]:
# Upload every distinct label and keep whatever upload_text returns, keyed by
# the label itself. The progress bar wraps the iteration over the label set.
ipfs_hashes_dict = dict(
    (label, upload_text(label, print_message=False))
    for label in tqdm(subject_object_list)
)

  0%|          | 0/1841 [00:00<?, ?it/s]

In [6]:
def _ipfs_hash_of(label):
    """Return the first element of the upload result stored for `label`."""
    return ipfs_hashes_dict[label][0]


# Add one hash column per label column, subject first and then object.
for source_column, hash_column in (
    ('subject', 'subject_ipfs_hash'),
    ('object', 'object_ipfs_hash'),
):
    caligraph_ontology_df[hash_column] = caligraph_ontology_df[source_column].map(_ipfs_hash_of)

In [7]:
# Persist the relations together with their IPFS hashes.
# index=False: the frame already carries the source file's row numbers in its
# "Unnamed: 0" column, so writing the RangeIndex as well would add a second,
# redundant unnamed column to the file.
# NOTE(review): the doubled ".csv.csv" suffix looks accidental; the path is kept
# unchanged so anything already reading this file keeps working.
caligraph_ontology_df.to_csv('dbpedia_caligraph_relations.nt.csv.csv', index=False)
# To resume from the saved file instead of re-uploading:
# caligraph_ontology_df = pd.read_csv('dbpedia_caligraph_relations.nt.csv.csv')

## Upload to cyber

In [10]:
# Each link candidate is a [subject hash, object hash] pair, one per relation row.
hash_columns = ['subject_ipfs_hash', 'object_ipfs_hash']
link_candidates = caligraph_ontology_df[hash_columns].values.tolist()
create_cls(link_candidates)

  0%|          | 0/5824 [00:00<?, ?it/s]

In [9]:
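# Disabled variant: builds the pairs in the reverse order (object hash first,
# then subject hash) and passes account_name='groovybear' to create_cls.
# link_candidates = caligraph_ontology_df[['object_ipfs_hash', 'subject_ipfs_hash']].values.tolist()
# create_cls(link_candidates, account_name='groovybear')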
