<a href="https://colab.research.google.com/github/Santosh-Gupta/NaturalLanguageRecommendations/blob/master/notebooks/inference_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers --quiet

In [28]:
!wget 'https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar'
!tar -xvf 'scibert_scivocab_uncased.tar'

--2019-12-28 08:15:15--  https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.232.232
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.232.232|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 442460160 (422M) [application/x-tar]
Saving to: ‘scibert_scivocab_uncased.tar.1’


2019-12-28 08:15:39 (18.2 MB/s) - ‘scibert_scivocab_uncased.tar.1’ saved [442460160/442460160]

scibert_scivocab_uncased/
scibert_scivocab_uncased/vocab.txt
scibert_scivocab_uncased/pytorch_model.bin
scibert_scivocab_uncased/config.json


In [0]:
from google.colab import drive
drive.mount('/gdrive')
drive_base_path = '/gdrive/My Drive/'

In [30]:
%tensorflow_version 2.x
import os
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda, Dense, Activation, Concatenate, Dropout
from transformers import TFBertModel
from time import time
print('TensorFlow:', tf.__version__)

TensorFlow: 2.1.0-rc1


In [0]:
embedding_dim = 512
model_files_pattern = 'gs://tfworld/model_files_2/*'

In [0]:
def create_model(drop_out):
    textIds = tf.keras.Input(shape=(512,), dtype=tf.int32)    # from bert tokenizer
    citation = tf.keras.Input(shape=(512,))                   # normalized word2vec outputs
    
    bert_model = TFBertModel.from_pretrained('scibert_scivocab_uncased', from_pt=True)
    
    textOut = bert_model(textIds)
    textOutMean = tf.reduce_mean(textOut[0], axis=1)
    textOutSim = Dense(units=embedding_dim, activation='tanh', name='DenseTitle')(textOutMean)
    textOutSim = Dropout(drop_out)(textOutSim)
    
    citationSim = Dense(units=embedding_dim, activation='tanh', name='DenseCitation')(citation)
    citationSim = Dropout(drop_out)(citationSim)

    # Get dot product of each of title x citation combinations
    dotProduct = tf.reduce_sum(tf.multiply(textOutSim[:, None, :], citationSim), axis=-1)
    
    # Softmax to make sure each row has sum == 1.0
    probs = tf.nn.softmax(dotProduct, axis=-1)

    model = tf.keras.Model(inputs=[textIds, citation], outputs=[probs])
    return model

In [0]:
model = create_model(drop_out=.2)
model.load_weights('gs://tfworld/model_files_2/epoch_06_1.96')

In [36]:
inference_model = tf.keras.Model(inputs=[model.inputs[0]],
                                 outputs=[model.get_layer('DenseTitle').output])


([<tf.Tensor 'input_4:0' shape=(None, 512) dtype=int32>],
 [<tf.Tensor 'DenseTitle_1/Identity:0' shape=(None, 512) dtype=float32>])

In [0]:
citation_projection_model = tf.keras.Sequential([tf.keras.Input(shape=(512,), dtype=tf.float32),
                                                 model.get_layer('DenseCitation')])

In [39]:
inference_model.summary(), citation_projection_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
tf_bert_model_1 (TFBertModel ((None, 512, 768), (None, 109918464 
_________________________________________________________________
tf_op_layer_Mean_1 (TensorFl [(None, 768)]             0         
_________________________________________________________________
DenseTitle (Dense)           (None, 512)               393728    
Total params: 110,312,192
Trainable params: 110,312,192
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
DenseCitation (Dense)        (None, 512)               262656    
Total params: 262,656
T

(None, None)

In [0]:
model_output_dir = drive_base_path+'tfworld/'
os.mkdir(model_output_dir)

In [46]:
inference_model.save(model_output_dir + 'inference_model', save_format='tf')
citation_projection_model.save(model_output_dir + 'citations_projection_model', save_format='tf')

INFO:tensorflow:Assets written to: /gdrive/My Drive/tfworld/inference_model/assets
INFO:tensorflow:Assets written to: /gdrive/My Drive/tfworld/citations_projection_model/assets


In [48]:
!zip -r "/gdrive/My Drive/tfworld.zip" "/gdrive/My Drive/tfworld"

  adding: gdrive/My Drive/tfworld/ (stored 0%)
  adding: gdrive/My Drive/tfworld/inference_model/ (stored 0%)
  adding: gdrive/My Drive/tfworld/inference_model/variables/ (stored 0%)
  adding: gdrive/My Drive/tfworld/inference_model/variables/variables.data-00000-of-00002 (deflated 13%)
  adding: gdrive/My Drive/tfworld/inference_model/variables/variables.data-00001-of-00002 (deflated 7%)
  adding: gdrive/My Drive/tfworld/inference_model/variables/variables.index (deflated 78%)
  adding: gdrive/My Drive/tfworld/inference_model/assets/ (stored 0%)
  adding: gdrive/My Drive/tfworld/inference_model/saved_model.pb (deflated 92%)
  adding: gdrive/My Drive/tfworld/citations_projection_model/ (stored 0%)
  adding: gdrive/My Drive/tfworld/citations_projection_model/variables/ (stored 0%)
  adding: gdrive/My Drive/tfworld/citations_projection_model/variables/variables.data-00000-of-00002 (deflated 61%)
  adding: gdrive/My Drive/tfworld/citations_projection_model/variables/variables.data-00001-o

In [0]:
abstract_model = tf.saved_model.load(model_output_dir + 'inference_model')
citations_model = tf.saved_model.load(model_output_dir + 'citations_projection_model')

In [0]:
abstractIds = tf.random.uniform(shape=(1, 512), maxval=500, dtype=tf.int32).numpy()
citation_vector = tf.random.uniform(shape=(1, 512), minval=-1, maxval=1, dtype=tf.float32).numpy()

In [51]:
abstract_model(abstractIds), citations_model(citation_vector)

(<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
 array([[ 0.09522425, -0.39225134, -0.07253961,  0.41595373,  0.02025263,
         -0.18624307, -0.03280202,  0.26274082, -0.29510048,  0.34520406,
          0.17984556, -0.5795249 , -0.41815358, -0.00525578,  0.13193037,
         -0.1338952 ,  0.30299622,  0.06318647,  0.366908  , -0.03760731,
         -0.1498311 ,  0.23948044,  0.26345256, -0.06667762,  0.03558221,
         -0.02908579, -0.13135633,  0.3858654 , -0.61231977,  0.4223364 ,
          0.21263365,  0.13307944, -0.25682324,  0.18017827,  0.14683186,
          0.12212363, -0.3591733 ,  0.09848753,  0.07014494, -0.00300246,
          0.25513083, -0.28837565, -0.13576348,  0.15486088,  0.02679499,
         -0.02881716,  0.01973225, -0.37663153, -0.07709761,  0.00374446,
         -0.239358  , -0.00438497,  0.0910695 ,  0.76998264, -0.16129208,
          0.07133216, -0.27428988,  0.47261432,  0.14465924, -0.24612027,
         -0.04334822, -0.31286556, -0.23582418, -0.33774894, 