In [None]:
import requests
import numpy as np
!pip install "tensorflow>=1.14.0,<2.0"
!pip install "tensorflow-gpu>=1.14.0,<2.0"
!pip install  ampligraph

In [2]:
# Core libraries used by the rest of the notebook.
import ampligraph
import numpy as np
import pandas as pd
from ampligraph.datasets import load_from_ntriples
from ampligraph.datasets import load_from_csv

# Keep the version lookup as the cell's final expression: Jupyter only echoes
# the value of the last expression, so in the middle of the cell the result
# was computed and silently thrown away.
ampligraph.__version__

In [4]:
# Read the comma-separated triples file 1500data.csv from the /content
# directory; reciprocal relations are explicitly not requested.
Y = load_from_csv(
    '/content',
    '1500data.csv',
    sep=',',
    add_reciprocal_rels=False,
)

In [5]:
# Quick look at the loaded triples; the recorded output shows three-column
# rows with the middle of the array elided as "...".
print(Y)

[['Common symptoms' 'include' 'fever']
 ['Management' 'involves' 'treatment']
 ['The World Health Organization' 'declared' 'outbreak']
 ...
 ['Helmand' 'had' 'two']
 ['the Ministry of Public Health' 'confirmed' '106 new cases']
 ['Baghlan' 'had' 'six']]


In [6]:
# Dimensions of the loaded array (recorded run: 1459 rows, 3 columns).
Y.shape

(1459, 3)

In [7]:
# Pool the first and third columns of Y, then keep each distinct value once.
subjects_and_objects = np.concatenate((Y[:, 0], Y[:, 2]))
entities = np.unique(subjects_and_objects)

# Number of distinct values found.
entities.size

2370

In [8]:
# Distinct values appearing in the middle column of Y.
predicate_column = Y[:, 1]
relations = np.unique(predicate_column)

# Number of distinct values found.
relations.size

616

In [15]:
from ampligraph.evaluation import train_test_split_no_unseen

# Size of the held-out set, named so the split call reads on its own.
n_test_triples = 55

# Split the loaded triples into a training part and a test part.
Y_train, Y_test = train_test_split_no_unseen(Y, test_size=n_test_triples)

In [16]:
# Report the shape of each split, training part first.
for split_label, split_array in (
    ('Train set size: ', Y_train),
    ('Test set size: ', Y_test),
):
    print(split_label, split_array.shape)

Train set size:  (1404, 3)
Test set size:  (55, 3)


In [17]:
# Print the first 200 rows of the training split for a visual sanity check.
print(Y_train[:200])

[['Common symptoms' 'include' 'fever']
 ['Management' 'involves' 'treatment']
 ['The World Health Organization' 'declared' 'outbreak']
 ['minority' 'develop' 'noticeable symptoms']
 ['Cardiovascular complications' 'include' 'heart failure']
 ['Sputum' 'carry' 'large amounts']
 ['indicative' 'suggest' 'underlying immunopathology']
 ['people' 'have' 'classical serum biomarkers']
 ['The US Food and Drug Administration' 'approved' 'test']
 ['guidelines' 'recommend' 'medication']
 ['Face coverings' 'limit' 'volume']
 ['the United States Environmental Protection Agency' 'maintains' 'list']
 ['Intensivists' 'compiled' 'treatment recommendations']
 ['availability' 'affect' 'mortality']
 ['study' 'reported' 'earliest date']
 ['Official publications' 'reported' 'earliest onset']
 ['Wuhan Central Hospital' 'sent'
  'bronchoalveolar lavage fluid BAL sample']
 ['the National Health Commission of China' 'issued' 'notice']
 ['China' 'reported' 'nearly 140 new cases']
 ['Italy' 'overtook' 'China']
 ['

In [18]:
from ampligraph.latent_features import DistMult

In [19]:
# All hyperparameters gathered in one mapping so they can be read (and
# tweaked) in a single place before the model object is built.
distmult_params = dict(
    batches_count=10,
    seed=0,
    epochs=20,
    k=10,
    eta=10,
    optimizer='adam',
    optimizer_params={'lr': 1e-3},
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    verbose=True,
)

model = DistMult(**distmult_params)

In [20]:
import tensorflow as tf

# Only let TensorFlow log messages at ERROR level so the training progress
# bar is not buried under lower-severity output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Train on the training split, with early stopping switched off.
model.fit(
    Y_train,
    early_stopping=False,
)

Average Loss:   2.373535: 100%|██████████| 20/20 [00:00<00:00, 32.16epoch/s]


In [21]:
# Fetch the entity vectors for two labels from the fitted model.
model.get_embeddings(
    ['The World Health Organization', 'outbreak'],
    embedding_type='entity',
)

array([[ 0.02188077,  0.16830604, -0.16845378,  0.14712408,  0.0920139 ,
         0.14495565,  0.16449702,  0.16962336, -0.12333542, -0.12468348],
       [-0.02864343, -0.1320078 ,  0.09050266, -0.08716419, -0.08889943,
         0.12126279,  0.10065684, -0.1495471 ,  0.06595367, -0.12121323]],
      dtype=float32)

In [22]:
# Fetch the entity vectors for two more labels from the fitted model.
# NOTE(review): the triples printed earlier spell the entity
# 'Common symptoms' (capital C) - confirm that the lowercase label used here
# refers to the entity that was intended.
model.get_embeddings(
    ['common symptoms', 'cough'],
    embedding_type='entity',
)

array([[ 0.03686009,  0.12477501,  0.15571064, -0.08261333,  0.0190071 ,
         0.1030189 , -0.05289743,  0.13939205, -0.12855546, -0.06819809],
       [-0.1344412 ,  0.05155814,  0.09203613,  0.09430119,  0.15649106,
        -0.09254248, -0.11649413, -0.10214522,  0.0510114 , -0.13438725]],
      dtype=float32)

In [23]:
# Alias the full loaded triple array (the one the train/test split was drawn
# from); it is passed as `filter_triples` to the evaluation calls below.
positives_filter = Y

In [24]:
from ampligraph.latent_features import save_model, restore_model

In [25]:
# Write the fitted model to covid_model.pkl, relative to the current working
# directory.
save_model(model, './covid_model.pkl')

In [26]:
from ampligraph.evaluation import evaluate_performance

In [27]:
from ampligraph.discovery import find_duplicates

In [28]:
# Distinct values in the third column of the training triples.
# Bound to its own name rather than `entities`: that name already holds the
# distinct subjects *and* objects of the whole dataset, and overwriting it
# here would make the earlier `entities.size` cell give a different answer
# when re-run after this one.
tail_entities = np.unique(Y_train[:, 2])

# Look for near-duplicate entities in embedding space; the second returned
# value is not needed here and is discarded.
dups, _ = find_duplicates(tail_entities, model, mode='entity', tolerance=0.4)

# Show at most three of the reported duplicate groups.
print(list(dups)[:3])



In [29]:
from ampligraph.discovery import query_topn

In [30]:
# Ask the model for the five best-scoring completions of
# (?, 'include', 'cough'): the head is left open, and no restriction is placed
# on the candidate entities or relations.
query_topn(
    model,
    top_n=5,
    head=None,
    relation='include',
    tail='cough',
    ents_to_consider=None,
    rels_to_consider=None,
)

(array([['people', 'include', 'cough'],
        ['place', 'include', 'cough'],
        ['aerosolized tobramycin', 'include', 'cough'],
        ['Surgery', 'include', 'cough'],
        ['Symptom', 'include', 'cough']], dtype='<U153'),
 array([0.03273855, 0.03041483, 0.02913952, 0.02884051, 0.02759999],
       dtype=float32))

In [31]:
# Rank every held-out triple with the fitted model.
#   filter_triples       - the known-positives array bound to
#                          `positives_filter` in an earlier cell
#   use_default_protocol - enabled
ranks = evaluate_performance(
    Y_test,
    model=model,
    filter_triples=positives_filter,
    use_default_protocol=True,
    verbose=True,
)



100%|██████████| 55/55 [00:00<00:00, 441.47it/s]


In [32]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute every summary metric from the test ranks up front ...
mr = mr_score(ranks)
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

# ... then report them: MRR, MR, and Hits@N from the widest cutoff to the
# narrowest.
print("MRR: %.2f" % (mrr))
print("MR: %.2f" % (mr))
print("Hits@10: %.2f" % (hits_10))
print("Hits@3: %.2f" % (hits_3))
print("Hits@1: %.2f" % (hits_1))

MRR: 0.02
MR: 1278.19
Hits@10: 0.04
Hits@3: 0.01
Hits@1: 0.01


In [33]:
from ampligraph.utils import create_tensorboard_visualizations

In [34]:
# Generate TensorBoard visualization files for the fitted model; "./" is given
# as the target location, i.e. the current working directory.
create_tensorboard_visualizations(model,r"./")

In [35]:
# Hand-written candidate statements to rank and score with the fitted model.
# NOTE(review): ('guidelines', 'recommend', 'medication') also appears among
# the training rows printed earlier, so it may not be truly unseen - confirm.
X_unseen = np.array([
    ['guidelines', 'recommend', 'medication'],
    ['black musician', 'wears', 'royal livery'],
    ['Common effects', 'include', 'chest infections'],
    # Disabled candidate:
    # ['pandemic', 'include', 'global social economic disruption'],
])

In [36]:
# Stack the known positives on top of the candidate statements, collapse
# repeated rows by passing them through a set of tuples, and turn the
# survivors back into an array.
stacked_triples = np.vstack((positives_filter, X_unseen))
distinct_triples = {tuple(row) for row in stacked_triples}
unseen_filter = np.array(list(distinct_triples))

In [37]:
# Rank the hand-written candidate statements.
#   filter_triples       - known positives plus the candidates themselves
#   corrupt_side         - 's+o'
#   use_default_protocol - disabled, so the explicit corrupt_side above is the
#                          setting in effect
# The results table built later in the notebook shows one rank per statement.
# NOTE(review): see the AmpliGraph `evaluate_performance` docs for the exact
# meaning of 's+o' versus 's,o' in the installed version.
ranks_unseen = evaluate_performance(
    X_unseen,
    model=model,
    filter_triples=unseen_filter,
    corrupt_side='s+o',
    use_default_protocol=False,
    verbose=True,
)

100%|██████████| 3/3 [00:00<00:00, 38.24it/s]


In [38]:
# Raw model scores for the candidate statements in X_unseen.
scores = model.predict(X_unseen)

In [39]:
from scipy.special import expit
# Map each raw score into (0, 1) with the logistic sigmoid (expit).
# NOTE(review): these values are not calibrated probabilities unless the model
# has been calibrated separately - treat them as a monotone rescaling.
probs = expit(scores)

In [40]:
# One readable sentence per candidate triple.
statements = [' '.join(triple) for triple in X_unseen]

# Line the sentences up with their rank, raw score and sigmoid-squashed score.
result_rows = list(zip(
    statements,
    ranks_unseen,
    np.squeeze(scores),
    np.squeeze(probs),
))

results_table = pd.DataFrame(
    result_rows,
    columns=['statement', 'rank', 'score', 'prob'],
)

# Display the table ordered from the lowest score to the highest.
results_table.sort_values("score")

Unnamed: 0,statement,rank,score,prob
2,Common effects include chest infections,4381,-0.014421,0.496395
1,black musician wears royal livery,11,0.008373,0.502093
0,guidelines recommend medication,7,0.018678,0.504669
