In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from src.utils_ipfs import upload_text
from src.utils_cyber import create_cls

## Read CaLiGraph Ontology Preprocessing Data

In [None]:
# Load the CaLiGraph ontology export into a DataFrame. The file is
# semicolon-delimited; later cells read its "subject" and "object" columns.
caligraph_ontology_df = pd.read_csv(
    'data/caligraph-ontology.nt.csv',
    sep=';',
)

# Preview the first rows as a quick sanity check of the parsed columns.
caligraph_ontology_df.head()

In [3]:
# Normalise the labels on both sides of every link: underscores become spaces
# and the text is lower-cased. The vectorised `.str` accessor replaces the
# per-element lambda; `regex=False` makes the replacement a literal one.
caligraph_ontology_df["subject"] = (
    caligraph_ontology_df["subject"].str.replace('_', ' ', regex=False).str.lower()
)
caligraph_ontology_df["object"] = (
    caligraph_ontology_df["object"].str.replace('_', ' ', regex=False).str.lower()
)

# `.str` methods propagate missing values instead of raising, so fail fast
# here rather than letting NaN labels flow into the upload steps below.
assert caligraph_ontology_df[["subject", "object"]].notna().all().all(), \
    "subject/object columns must not contain missing or non-string values"

# Despite its name this is a *set*: the distinct labels that occur as either
# the subject or the object of a link.
subject_object_list = set(caligraph_ontology_df["subject"]).union(caligraph_ontology_df["object"])
print(f'Number of items: {len(subject_object_list):>,}')
print(f'Number of links: {len(caligraph_ontology_df):>,}')

# Ten subjects with the most links. `count()` counts the rows per subject
# directly (the previous `np.count_nonzero` counted truthy values, which only
# happened to coincide with the row count for non-empty strings). After the
# aggregation the "object" column holds that count, not a label.
(
    caligraph_ontology_df
    .groupby('subject')['object']
    .count()
    .reset_index()
    .sort_values('object', ascending=False)
    .head(10)
)

Number of items: 1,350
Number of links: 6,130


Unnamed: 0,subject,object
104,artist,50
142,badminton player,47
53,ambassador,45
22,academic journal,43
41,airport,42
164,body of water,42
39,airline,41
144,bank,40
42,album,39
558,formula one racer,38


In [4]:
# Ten objects that the most links point to. `count()` counts the rows per
# object directly (the previous `np.count_nonzero` counted truthy values, which
# only happened to coincide with the row count for non-empty strings). After
# the aggregation the "subject" column holds that count, not a label.
(
    caligraph_ontology_df
    .groupby('object')['subject']
    .count()
    .reset_index()
    .sort_values('subject', ascending=False)
    .head(10)
)

Unnamed: 0,object,subject
117,class,1206
609,work,189
584,unit of work,187
566,topical concept,183
564,time period,176
520,specie,157
535,sports season,157
563,thing,127
467,results of a sport competition,112
388,person function,111


## Upload to IPFS

In [5]:
# Call upload_text once for every distinct label and remember what it returns,
# keyed by the label itself; tqdm renders a progress bar while iterating.
# NOTE(review): the next cell takes element [0] of each stored value as the
# IPFS hash -- confirm the exact return shape against src.utils_ipfs.
ipfs_hashes_dict = dict(
    (label, upload_text(label, print_message=False))
    for label in tqdm(subject_object_list)
)

In [7]:
def _ipfs_hash_for(label):
    """Return element 0 of the upload result stored for `label`.

    Raises KeyError if `label` has no entry in `ipfs_hashes_dict`.
    """
    upload_result = ipfs_hashes_dict[label]
    return upload_result[0]


# Add one hash column per side of the link, looked up from the label columns.
caligraph_ontology_df['subject_ipfs_hash'] = caligraph_ontology_df['subject'].map(_ipfs_hash_for)
caligraph_ontology_df['object_ipfs_hash'] = caligraph_ontology_df['object'].map(_ipfs_hash_for)

In [8]:
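# Optional checkpoint, left disabled: save the frame with its IPFS hash columns
# and reload it later instead of repeating the upload. Note that to_csv() also
# writes the index, so the reloaded frame gains an extra "Unnamed: 0" column
# unless index=False is passed on write (or index_col=0 on read).
# caligraph_ontology_df.to_csv('caligraph_ontology.csv')
# caligraph_ontology_df = pd.read_csv('caligraph_ontology.csv')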

## Upload to cyber

In [None]:
# Build one [subject_hash, object_hash] pair per row of the frame and hand the
# resulting list of pairs to create_cls.
link_candidates = (
    caligraph_ontology_df
    .loc[:, ['subject_ipfs_hash', 'object_ipfs_hash']]
    .to_numpy()
    .tolist()
)
create_cls(link_candidates)