In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
!pip install transformers

In [0]:
import numpy as np
import pandas as pd 
import transformers
from transformers import BertTokenizer
import urllib
from random import randrange
import tarfile
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm
import os

print('TensorFlow:', tf.__version__)

TensorFlow: 2.1.0-rc1


In [0]:
url = 'https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar'
urllib.request.urlretrieve(url, 'scibert_scivocab_uncased.tar')
tar = tarfile.open("scibert_scivocab_uncased.tar")
tar.extractall()
tar.close()

In [0]:
embeds = np.load('/content/gdrive/My Drive/NaturalLanguageRecommendations/CS/CombAfterNormalizaton106Epochs.npy')
df = pd.read_json( '/content/gdrive/My Drive/NaturalLanguageRecommendations/CS/TitlesAbstractsEmbedIds.json.gzip', compression = 'gzip' )
embed2Abstract = pd.Series(df['paperAbstract'].values,index=df['EmbeddingID']).to_dict()
embed2Abstract

{235092: 'This paper proposes simple analytic formulas for proportional-integral-derivative (PID) controller tuning for typical process models. The formulas are obtained in a similar way to the simple internal model control (SIMC) tuning rules, while the leading analysis is more delicate. Compared to SIMC counterparts, the new tuning formulas lead to better load disturbance rejection while giving similar setpoint response and peak sensitivity.',
 868823: 'Due to the advantages of high efficiency and high torque density, the Permanent Magnet Synchronous Machine (PMSM) has been widely used in direct-drive applications such as all-electric propulsion system. The loss and efficiency of PMSM are the main restriction for performance improvement. This paper focuses on analyzing the influence of structure parameters on the loss and efficiency of PMSM, including the split-ratio, the thickness of PMs, the pole-slot combination and the utilization of Halbach PM array, based on the Magnetic Circui

In [0]:
tokenizer = BertTokenizer.from_pretrained('scibert_scivocab_uncased')
tokenizer.encode("Let's see all hidden-states and attentions on this text")

[102,
 1293,
 2505,
 112,
 1461,
 355,
 8033,
 579,
 1898,
 137,
 3577,
 30113,
 191,
 238,
 3267,
 103]

In [0]:
class TFrecordWriter:
    def __init__(self,
                 n_samples,
                 n_shards,
                 output_dir='',
                 prefix=''):
        self.n_samples = n_samples
        self.n_shards = n_shards
        self.step_size = self.n_samples//self.n_shards + 1
        self.prefix = prefix
        self.output_dir = output_dir
        self.buffer = []
        self.file_count = 1
        
    def make_example(self, title, vector):
        feature = {
            'title': tf.train.Feature(int64_list=tf.train.Int64List(value=title)),
            'citation': tf.train.Feature(float_list=tf.train.FloatList(value=vector))
        }
        return tf.train.Example(features=tf.train.Features(feature=feature))
        
    def write_tfrecord(self, tfrecord_path):
        print('writing {} samples in {}'.format(len(self.buffer), tfrecord_path))
        with tf.io.TFRecordWriter(tfrecord_path) as writer:
            for (title, vector) in tqdm(self.buffer):
                example = self.make_example(title, vector)
                writer.write(example.SerializeToString())
    
    def push(self, title, vector):
        self.buffer.append([title, vector])
        if len(self.buffer) == self.step_size:
            fname = self.prefix + '_000' + str(self.file_count) + '.tfrecord'
            tfrecord_path = os.path.join(self.output_dir, fname)
            self.write_tfrecord(tfrecord_path)
            self.clear_buffer()
            self.file_count += 1
            
    def flush_last(self):
        if len(self.buffer):
            fname = self.prefix + '_000' + str(self.file_count) + '.tfrecord'
            tfrecord_path = os.path.join(self.output_dir, fname)
            self.write_tfrecord(tfrecord_path)
            
    def clear_buffer(self):
        self.buffer = []

In [0]:
path_to_saveTrain = '/content/gdrive/My Drive/NaturalLanguageRecommendations/CS/TfRecordsTrain/'
tfrecord_writerTrain = TFrecordWriter(int(1262996*.80)+1 , 512, path_to_saveTrain, 'train')

path_to_saveEval = '/content/gdrive/My Drive/NaturalLanguageRecommendations/CS/TfRecordsEval/'
tfrecord_writerEval = TFrecordWriter(int(1262996*.20)+1 , 512, path_to_saveEval, 'eval')

for key, value in embed2Abstract.items():
    if key%5000 == 0:
        print(key)

    vector = embeds[key]
    input_ids = tokenizer.encode(value,  max_length=512, pad_to_max_length = True)

    #Generate random ints that range from 0 to 9
    splitter = randrange(10)
    if splitter < 8:
        tfrecord_writerTrain.push(input_ids, vector)
    else:
        tfrecord_writerEval.push(input_ids, vector)

tfrecord_writerTrain.flush_last()
tfrecord_writerEval.flush_last()