In [1]:
# Fetch the pre-computed artefacts this notebook needs.
# The three Google Drive downloads produce (per the captured output):
#   CitationSimilarityVectors106Epochs.npy, AbstractSimVectors.npy and
#   TitlesIdsAbstractsEmbedIdsCOMPLETE_12-30-19.json.gzip
!gdown --id "10LV9QbZOkUyOzR4nh8hxesoKJhpmvpM9"   # citation vectors
!gdown --id "1-23aNm7j0bnycvyd_OaQfofVYPTewgOI"   # abstract vectors
!gdown --id "1NyUQwgUNj9bFsiCnZ2TfKmWn5r-Y6wav"   # TitlesIdAbstractsEmbedIds
# SciBERT archive; only its vocab.txt is read later, by the tokenizer.
!wget 'https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar'
!tar -xvf 'scibert_scivocab_uncased.tar'

Downloading...
From: https://drive.google.com/uc?id=10LV9QbZOkUyOzR4nh8hxesoKJhpmvpM9
To: /content/CitationSimilarityVectors106Epochs.npy
2.59GB [00:40, 63.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-23aNm7j0bnycvyd_OaQfofVYPTewgOI
To: /content/AbstractSimVectors.npy
2.59GB [01:01, 42.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NyUQwgUNj9bFsiCnZ2TfKmWn5r-Y6wav
To: /content/TitlesIdsAbstractsEmbedIdsCOMPLETE_12-30-19.json.gzip
432MB [00:06, 68.7MB/s]
--2019-12-30 23:29:18--  https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.252.32
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.252.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 442460160 (422M) [application/x-tar]
Saving to: ‘scibert_scivocab_uncased.tar’


2019-12-30 23:29:31 (33.8 MB/s) - ‘scibert_scivocab_uncase

In [2]:
!pip install transformers --quiet

[K     |████████████████████████████████| 450kB 2.8MB/s 
[K     |████████████████████████████████| 860kB 62.3MB/s 
[K     |████████████████████████████████| 1.0MB 52.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
%tensorflow_version 2.x
import numpy as np
import tensorflow as tf
from time import time
from tqdm import tqdm_notebook as tqdm
from transformers import BertTokenizer
import pandas as pd
from pprint import pprint

print('TensorFlow:', tf.__version__)

TensorFlow 2.x selected.
TensorFlow: 2.1.0-rc1


In [4]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Probe for a TPU. A ValueError from the resolver is treated as
# "no TPU available" and leaves `tpu` set to None.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

# Choose the distribution strategy that matches the hardware found above.
if not tpu:
    # No TPU: fall back to mirroring across the locally visible devices.
    strategy = tf.distribute.MirroredStrategy()
else:
    # The TPU system has to be connected and initialised before the
    # strategy object is created.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
REPLICAS:  1


In [6]:
# Device strings on which the search indices are built; the index-building
# cell creates one citation Index and one abstract Index per entry.
workers = ['/GPU:0']
workers

['/GPU:0']

In [0]:
class Index:
    """Brute-force cosine-distance index over a fixed embedding matrix.

    The rows of `embeddings` are L2-normalised once at construction time, so
    a search only needs a dot product against the query to obtain cosine
    similarity.

    Args:
        embeddings: 2-D array-like with one vector per row.
        worker: TensorFlow device string (e.g. '/GPU:0') on which the
            embeddings are stored and on which searches run.
    """

    def __init__(self, embeddings, worker):
        self.worker = worker
        # Pin the normalised matrix to this index's own device so placement
        # does not depend on whatever device scope the caller is in.
        with tf.device(worker):
            self.embeddings = tf.math.l2_normalize(embeddings, axis=1)

    @tf.function
    def search(self, query_vector, top_k=20):
        """Return the `top_k` nearest rows to `query_vector`.

        Args:
            query_vector: tensor that broadcasts against the embedding matrix
                row-wise (the notebook passes a single L2-normalised row).
            top_k: number of nearest neighbours to return. Defaults to 20,
                the value that used to be hard-coded.

        Returns:
            `(distances, indices)`: cosine distances (1 - dot product) in
            ascending order and the matching row indices into the matrix.
        """
        # Use this instance's device. The previous code read a global named
        # `worker`, which only existed as a leftover loop variable.
        with tf.device(self.worker):
            dot_product = tf.reduce_sum(tf.multiply(self.embeddings, query_vector), axis=1)
            distances = 1 - dot_product
            sorted_indices = tf.argsort(distances)
            nearest_distances = tf.gather(distances, sorted_indices)
            return nearest_distances[:top_k], sorted_indices[:top_k]

In [0]:
# Load the two pre-computed embedding matrices from the files downloaded in
# the first cell.
citations_embeddings = np.load('CitationSimilarityVectors106Epochs.npy')
abstract_embeddings = np.load('AbstractSimVectors.npy')
# Both matrices must have the same shape: the search code applies one row
# offset (taken from the citation matrix) and one id -> paper lookup to both.
assert citations_embeddings.shape == abstract_embeddings.shape

In [9]:
# Row count of an embedding matrix. search() multiplies it by the index
# position to turn index-local row ids into global ids.
vecs_per_index = citations_embeddings.shape[0]
print('Vectors per index :', vecs_per_index)

Vectors per index : 1262996


In [10]:
## Build one citation index and one abstract index on every worker device.
## NOTE: each worker receives the *full* embedding matrices; nothing is
## sharded here, so every index can answer a query on its own.
citation_indices = []
abstract_indices = []
for worker in workers:
  with tf.device(worker):
    print('Building index with {} vectors on {}'.format(citations_embeddings.shape[0], worker))
    citation_indices.append(Index(citations_embeddings, worker))
    abstract_indices.append(Index(abstract_embeddings, worker))

Building index with 1262996 vectors on /GPU:0


In [0]:
def search(xq, top_k=10):
  """Query the citation and abstract indices and keep the best `top_k` hits.

  Args:
    xq: query embedding, passed unchanged to each `Index.search` call.
    top_k: number of candidates to keep for each of the two index families.

  Returns:
    Tuple `(cD, aD, cI, aI)` of numpy arrays: citation distances, abstract
    distances, citation ids and abstract ids, each ordered by ascending
    distance.
  """
  citation_dists, citation_ids = [], []
  abstract_dists, abstract_ids = [], []

  # Only the first index of each family is consulted (range(1)).
  for shard in range(1):
    citation_index = citation_indices[shard]
    print('Search running on {}'.format(citation_index.worker))
    c_dist, c_idx = citation_index.search(xq)
    a_dist, a_idx = abstract_indices[shard].search(xq)

    citation_dists.extend(c_dist.numpy())
    abstract_dists.extend(a_dist.numpy())

    # Shift index-local row ids into the global id space.
    offset = shard * vecs_per_index
    citation_ids.extend(offset + c_idx.numpy())
    abstract_ids.extend(offset + a_idx.numpy())

  def _keep_best(dists, ids):
    # Order candidates by ascending distance and truncate to top_k.
    order = np.argsort(dists)[:top_k]
    return np.array(dists)[order], np.array(ids)[order]

  cD, cI = _keep_best(citation_dists, citation_ids)
  aD, aI = _keep_best(abstract_dists, abstract_ids)
  return cD, aD, cI, aI

In [0]:
# Encoder exported as a SavedModel, plus the SciBERT vocabulary unpacked
# in the download cell.
model = tf.saved_model.load('gs://tfworld/saved_models')
tokenizer = BertTokenizer(vocab_file='scibert_scivocab_uncased/vocab.txt')

# Paper metadata table; the lookups below are all keyed by its EmbeddingID column.
df = pd.read_json('/content/TitlesIdsAbstractsEmbedIdsCOMPLETE_12-30-19.json.gzip', compression = 'gzip')

# EmbeddingID -> title / abstract / paper id, built the same way for each column.
embed2Title, embed2Abstract, embed2Paper = (
    pd.Series(df[column].values, index=df['EmbeddingID']).to_dict()
    for column in ('title', 'paperAbstract', 'id')
)

In [14]:
# Query definition. The commented-out lines are the alternative of querying
# with a paper already in the corpus, looked up by its embedding id.
# embed_id = 70
title ='Money'  # display only; alternative: embed2Title[embed_id]
# abstract = embed2Abstract[embed_id]
abstract = '''Financial modelling for assets'''

# Only the abstract text is encoded; max_length=512 with pad_to_max_length=True
# asks the tokenizer for a fixed-length id sequence.
abstract_encoded = tokenizer.encode(abstract, max_length=512, pad_to_max_length=True)
# [None, :] adds a leading axis so the ids form a batch of one.
abstract_encoded = tf.constant(abstract_encoded, dtype=tf.int32)[None, :]
print('Title : ')
pprint(title)
print('\nAbstract : ')
pprint(abstract)

Title : 
'Money'

Abstract : 
'Financial modelling for assets'


In [16]:
# --- Embed the query ---------------------------------------------------------
# s -> e_p brackets the model call plus L2 normalisation of its output.
# NOTE(review): if the GPU ops run asynchronously, part of this work may only
# be waited for inside search() (which calls .numpy()) - confirm before
# trusting the split between the two timings.
s = time()
bert_output = model(abstract_encoded)
xq = tf.nn.l2_normalize(bert_output, axis=1)
e_p = time()

# --- Nearest-neighbour search ------------------------------------------------
# e_p -> e_s brackets the search over both the citation and abstract indices.
cD, aD, cI, aI = search(xq, top_k=5)
e_s = time()
print('\n'*2)
print('Prediction time  :', np.round(e_p-s, 3), 'secs')
print('Search time      :', np.round(e_s-e_p, 3), 'secs')
print('Total time       :', np.round(e_s - s, 3), 'secs')

# --- Report: neighbours from the citation index (shown by title) ------------
print('\n'*2)
print('*'*80)
for i in range(len(cI)):
  print('Title : ')
  pprint(embed2Title[cI[i]])
  print('\n')
  pprint('Link: semanticscholar.org/paper/'+embed2Paper[cI[i]])
  print('*'*80, )
print('\nNeighbours       :', cI )
print('Distances        :', np.round(cD, 4))

# --- Report: neighbours from the abstract index (shown by abstract) ---------
print('\n'*4)
print('*'*80)
for i in range(len(aI)):
  print('Abstract : ')
  pprint(embed2Abstract[aI[i]])
  print('\n')
  pprint('Link: semanticscholar.org/paper/'+embed2Paper[aI[i]])
  print('*'*80)
print('\nNeighbours       :', aI )
print('Distances        :', np.round(aD, 4))

Search running on /GPU:0



Prediction time  : 0.048 secs
Search time      : 0.066 secs
Total time       : 0.114 secs



********************************************************************************
Title : 
('Approach for improving receiver performance in loss-free handovers in DVB-H '
 'networks')


'Link: semanticscholar.org/paper/85fabe4a29a5eea59d78e3e5005c43837ad61837'
********************************************************************************
Title : 
'Qualification of the Joints for the ITER Central Solenoid'


'Link: semanticscholar.org/paper/b2c88eb44c152fb189edf2113466621a2d54dcee'
********************************************************************************
Title : 
('Solar energy as alternative power supply for communication system IEEE '
 '802.15.4 standard')


'Link: semanticscholar.org/paper/dcb499aefe2182be7f25033579f0bf17ef8bf20c'
********************************************************************************
Title : 
('Combinatorial invariance of Kazhdan–