# Training a TransE model in AmpliGraph
07/08/20. HC contains ~30% people/orgs connected to Wikidata.

## 1. Import data

In [1]:
from ampligraph.datasets import load_from_csv, load_from_ntriples
import numpy as np

In [2]:
import os

# Folder holding the collection dump. Set the SMG_DATA_DIR environment variable
# to run on another machine; without it the original local volume is used.
data_folder = os.environ.get("SMG_DATA_DIR", "/Volumes/Kalyan_SSD/SMG/")
csv_name = "hc_dump_100820.csv"
nt_name = "hc_dump_100820.nt"

# Load the N-Triples dump. The CSV file can be loaded instead with:
#   X = load_from_csv(data_folder, csv_name, sep='\t')
X = load_from_ntriples(data_folder, nt_name)

# Preview the first five (subject, predicate, object) rows.
X[:5]

array([['<https://collection.sciencemuseumgroup.org.uk/objects/co8437506>',
        '<http://xmlns.com/foaf/0.1/maker>',
        '<https://collection.sciencemuseumgroup.org.uk/people/cp127630>'],
       ['<https://collection.sciencemuseumgroup.org.uk/objects/co58651>',
        '<http://www.w3.org/2001/XMLSchema#additionalType>',
        'polariscope'],
       ['<https://collection.sciencemuseumgroup.org.uk/objects/co415797>',
        '<http://xmlns.com/foaf/0.1/maker>',
        '<https://collection.sciencemuseumgroup.org.uk/people/cp1390>'],
       ['<https://collection.sciencemuseumgroup.org.uk/objects/co86084>',
        '<http://www.w3.org/2001/XMLSchema#material>',
        'earthenware (tin glazed)'],
       ['<https://collection.sciencemuseumgroup.org.uk/objects/co192556>',
        '<http://www.w3.org/2001/XMLSchema#name>',
        'Two packets of Cephos powders, England, 1955-1975']],
      dtype=object)

In [3]:
# Cast every cell to str: np.unique below doesn't work with a mix of str & float.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it was only ever an alias for `str`.
X = X.astype(str)

# Distinct entities (subject and object columns pooled) and distinct predicates.
entities = np.unique(np.concatenate([X[:, 0], X[:, 2]]))
relations = np.unique(X[:, 1])
relations[:5]

array(['<http://www.w3.org/2001/XMLSchema#additionalType>',
       '<http://www.w3.org/2001/XMLSchema#birthDate>',
       '<http://www.w3.org/2001/XMLSchema#birthPlace>',
       '<http://www.w3.org/2001/XMLSchema#deathDate>',
       '<http://www.w3.org/2001/XMLSchema#deathPlace>'], dtype='<U495')

## 2. Train-test split

In [4]:
from ampligraph.evaluation import train_test_split_no_unseen

In [5]:
# Hold out 1000 triples for testing; the remainder becomes the training set.
X_train, X_test = train_test_split_no_unseen(
    X,
    test_size=1000,
)

# Report the (rows, columns) shape of each split.
print('Train set size: ', X_train.shape)
print('Test set size: ', X_test.shape)

Train set size:  (1258813, 3)
Test set size:  (1000, 3)


## 3. Train TransE

In [6]:
from ampligraph.latent_features import TransE

In [7]:
# TransE configuration, grouped as: model size, training schedule,
# optimisation/loss, then run control. Keyword order does not matter.
model = TransE(
    k=50,
    epochs=20,
    batches_count=500,
    optimizer='sgd',
    loss='pairwise',
    loss_params={'margin': 5},
    seed=42,
    verbose=True,
)

# Alias (not a copy) of the full triple array; passed to evaluate_performance
# as `filter_triples` in the evaluation section.
positives_filter = X

In [18]:
# Fit the embeddings on the training split only; X_test stays held out for evaluation.
model.fit(X_train)



Average Loss:   2.360491: 100%|██████████| 20/20 [06:15<00:00, 18.76s/epoch]


## 4. Evaluate

In [16]:
from ampligraph.evaluation import evaluate_performance

In [19]:
# 50k entities used for synthetic negative creation as recommended in https://github.com/Accenture/AmpliGraph/issues/186
# Drawn from a seeded generator so the evaluation is reproducible, and without
# replacement so the subset holds distinct entities (np.random.choice defaults
# to replace=True, which produces duplicates). The size is capped at
# len(entities) so the draw also works on dumps with fewer than 50k entities.
subset = np.random.default_rng(42).choice(entities, size=min(50000, len(entities)), replace=False)

In [None]:
# Rank each test triple against corruptions built from `subset`, filtering out
# corruptions that are known true triples in `positives_filter`.
ranks = evaluate_performance(X_test,
                             model=model,
                             filter_triples=positives_filter,   # all known triples (train + test)
                             entities_subset=subset,
                             # 's,o' is a single string: corrupt subject and object separately.
                             # A ("s", "o") tuple is not one of the documented values
                             # ('s', 'o', 's+o', 's,o').
                             corrupt_side="s,o",
                             verbose=True)

In [18]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute every ranking metric first...
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

# ...then report them, each rounded to two decimals.
print("MRR: %.2f" % mrr)
print("Hits@10: %.2f" % hits_10)
print("Hits@3: %.2f" % hits_3)
print("Hits@1: %.2f" % hits_1)


MRR: 0.00
Hits@10: 0.00
Hits@3: 0.00
Hits@1: 0.00


## 5. Inference

In [22]:
from ampligraph.discovery import query_topn

In [23]:
# Query: birth date of Charles Babbage (expected answer: 1791).
subj = "<https://collection.sciencemuseumgroup.org.uk/people/cp36993>"  # Charles Babbage
pred = "<http://www.w3.org/2001/XMLSchema#birthDate>"

# Leave the tail open and ask for the 10 best-scoring completions.
query_topn(
    model,
    top_n=10,
    head=subj,
    relation=pred,
    tail=None,
    ents_to_consider=None,
    rels_to_consider=None,
)



100%|██████████| 616233/616233 [04:13<00:00, 2431.99it/s]


(array([['<https://collection.sciencemuseumgroup.org.uk/people/cp36993>',
         '<http://www.w3.org/2001/XMLSchema#birthDate>',
         '1917.0^^<http://www.w3.org/2001/XMLSchema#double>'],
        ['<https://collection.sciencemuseumgroup.org.uk/people/cp36993>',
         '<http://www.w3.org/2001/XMLSchema#birthDate>',
         '1820.0^^<http://www.w3.org/2001/XMLSchema#double>'],
        ['<https://collection.sciencemuseumgroup.org.uk/people/cp36993>',
         '<http://www.w3.org/2001/XMLSchema#birthDate>',
         '1810.0^^<http://www.w3.org/2001/XMLSchema#double>'],
        ['<https://collection.sciencemuseumgroup.org.uk/people/cp36993>',
         '<http://www.w3.org/2001/XMLSchema#birthDate>',
         '1788.0^^<http://www.w3.org/2001/XMLSchema#double>'],
        ['<https://collection.sciencemuseumgroup.org.uk/people/cp36993>',
         '<http://www.w3.org/2001/XMLSchema#birthDate>',
         '1872.0^^<http://www.w3.org/2001/XMLSchema#double>'],
        ['<https://collection.

In [24]:
# Query: which materials does the model predict for this metal perfume bottle?
subj = "<https://collection.sciencemuseumgroup.org.uk/objects/co133199>"
pred = "<http://www.w3.org/2001/XMLSchema#material>"

# Leave the tail open and ask for the 10 best-scoring completions.
query_topn(
    model,
    top_n=10,
    head=subj,
    relation=pred,
    tail=None,
    ents_to_consider=None,
    rels_to_consider=None,
)



100%|██████████| 616233/616233 [04:11<00:00, 2454.13it/s]


(array([['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'glass'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'steel'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'paper'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'wood'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'brass'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'metal'],
        ['<https://collection.sciencemuseumgroup.org.uk/objects/co133199>',
         '<http://www.w3.org/2001/XMLSchema#material>', 'metal (unknown)'

## Appendix: memory consumption

In [20]:
import sys

# Names that IPython puts in the namespace, plus this list itself; all are
# left out of the report below.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Divisor that converts the byte counts from sys.getsizeof into MB.
unit = 1e6

# (name, size in MB) for each global that is not underscore-prefixed, not the
# name of a loaded module and not an IPython built-in; largest first.
var_list = sorted(
    (
        (name, sys.getsizeof(globals().get(name)) / unit)
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda entry: entry[1],
    reverse=True,
)
var_list

[('X', 7483.289332),
 ('positives_filter', 7483.289332),
 ('X_train', 7477.349332),
 ('entities', 1220.141436),
 ('subset', 99.000096),
 ('X_test', 5.940112),
 ('relations', 0.029796),
 ('TransE', 0.002008),
 ('var_list', 0.000272),
 ('load_from_csv', 0.000144),
 ('load_from_ntriples', 0.000144),
 ('train_test_split_no_unseen', 0.000144),
 ('np', 8.8e-05),
 ('data_folder', 7.3e-05),
 ('csv_name', 6.7e-05),
 ('nt_name', 6.6e-05),
 ('model', 6.4e-05),
 ('unit', 2.4e-05)]

In [21]:
# Total of the per-variable sizes listed in var_list (MB).
sum(size_mb for _, size_mb in var_list)

23769.042530000006