In [None]:
!pip install transformers

In [None]:
# Constants

# Tokenizer / batching settings: every text is padded or truncated to
# MAX_SEQUENCE_LENGTH tokens, and BATCH_SIZE texts are encoded per model call.
BATCH_SIZE = 16
MAX_SEQUENCE_LENGTH = 512

# Local directory holding the unpacked SciBERT tokenizer and weights.
BERT_PATH = './scibert_scivocab_uncased'

# Local copies of the dataset files.
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'

# Output file for the training-set representations (written with scipy's save_npz).
TRAINING_REPRESENTATIONS_PATH = './scibert_training_representations.npz'

In [None]:
## Mount Google Drive into the Colab runtime and copy the dataset files locally
from google.colab import drive
drive.mount('/content/drive')

# Copy the training and test sets to the local paths held in the constants.
# NOTE(review): the source directory below looks like a placeholder — replace it
# with the real Drive folder before running.
!cp '/path_to_your_directory_on_google_drive/training_set.json' $TRAINING_SET_PATH
!cp '/path_to_your_directory_on_google_drive/test_set.json' $TEST_SET_PATH

In [None]:
# Downloading the scibert model
!wget -qO- https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/pytorch_models/scibert_scivocab_uncased.tar | tar --transform 's/^dbt2-0.37.50.3/dbt2/' -xv
!tar -xzf ./scibert_scivocab_uncased/weights.tar.gz -C ./scibert_scivocab_uncased/
!mv ./scibert_scivocab_uncased/bert_config.json ./scibert_scivocab_uncased/config.json

In [None]:
import json

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at `input_path` and return its content.

    Args:
        input_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever `json.load` produces for it).
    """
    with open(input_path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from scipy.sparse import csr_matrix
from torch.utils.data import Dataset

class ClusteringDataset(Dataset):
    """Torch dataset that serves the entries of a DataFrame's 'text' column.

    Items are addressed by position (0 .. len - 1), independent of the
    DataFrame's index labels.
    """

    def __init__(self, df):
        """Keep a reference to `df`; it must provide a 'text' column."""
        self.data = df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the 'text' value stored at row position `idx`."""
        text_column = self.data['text']
        return text_column.iloc[idx]

class SciBERT:
    """Namespace of loaders for the SciBERT model and its tokenizer."""

    # Identifier passed to `from_pretrained` when the caller gives no path.
    DEFAULT_PATH = 'allenai/scibert_scivocab_uncased'

    @staticmethod
    def model(path=DEFAULT_PATH):
        """Return the `BertModel` loaded with `from_pretrained(path)`."""
        bert_model = BertModel.from_pretrained(path)
        return bert_model

    @staticmethod
    def tokenizer(path=DEFAULT_PATH):
        """Return the `BertTokenizer` loaded with `from_pretrained(path)`."""
        bert_tokenizer = BertTokenizer.from_pretrained(path)
        return bert_tokenizer

class SciBERTVectorizer:
  """Turns raw text into per-token hidden states of a BERT-style model.

  Usage: `SciBERTVectorizer().fit(tokenizer, model).transform(texts)`.
  """

  def __init__(self):
    pass

  def fit(self, tokenizer, model):
    """Store the tokenizer and model used by `transform`.

    Nothing is trained here; the method only records its arguments.

    Returns:
      self, so the call can be chained.
    """
    self.model = model
    self.tokenizer = tokenizer

    return self

  def transform(self, text_batch, max_length=None):
    """Encode text and return the model's last hidden states.

    Args:
      text_batch: A single string, or a list of strings.
      max_length: Number of tokens every text is padded/truncated to.
        Defaults to the notebook-level MAX_SEQUENCE_LENGTH.

    Returns:
      For a list of texts: tensor of shape (len(text_batch), max_length, hidden),
      including for a one-element list. For a single string: tensor of shape
      (max_length, hidden).
    """
    if max_length is None:
      max_length = MAX_SEQUENCE_LENGTH

    single_text = isinstance(text_batch, str)

    text_tokenized = self.tokenizer(text=text_batch, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')

    # Inference only: skip autograd bookkeeping so activations are not kept alive.
    with torch.no_grad():
      # Pass the attention mask so the padding added by padding='max_length'
      # is not attended to; without it the model treats pad tokens as real input.
      outputs = self.model(
        input_ids=text_tokenized['input_ids'],
        attention_mask=text_tokenized['attention_mask'],
      )

    hidden_states = outputs['last_hidden_state']

    # Drop the batch axis only for a bare string. Squeezing unconditionally would
    # also collapse a one-element batch and shift every axis for the caller.
    if single_text:
      return hidden_states.squeeze(0)

    return hidden_states

In [None]:
# Load the tokenizer and model from the local SciBERT directory.
tokenizer = SciBERT.tokenizer(path=BERT_PATH)
model = SciBERT.model(path=BERT_PATH)

# fit() only stores the two objects on the vectorizer and returns the vectorizer itself.
vectorizer = SciBERTVectorizer()
vectorizer.fit(tokenizer, model)

In [None]:
# Read the training set and flatten the records under its 'instances' key into a DataFrame
import pandas as pd

train_json = read_json(input_path=TRAINING_SET_PATH)
train_df = pd.json_normalize(data=train_json['instances'])

In [None]:
from torch.utils.data import DataLoader
from scipy import sparse

training_data = ClusteringDataset(train_df)
training_data_loader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=False)

# Collect the per-batch matrices in a list created fresh on every run of this cell,
# so re-running never appends to a previous run's result and no bare `except` is
# needed to detect the first iteration.
batch_matrices = []
n_batches = len(training_data_loader)

for i, batch in enumerate(training_data_loader):
  print('batch {}/{}'.format(i + 1, n_batches))
  token_states = vectorizer.transform(batch).detach().numpy()

  # A one-document batch can come back without its leading batch axis; restore it
  # so that axis 1 is always the token axis that gets averaged below.
  if token_states.ndim == 2:
    token_states = token_states[None, ...]

  # Mean over the token axis gives one vector per document.
  batch_vector_representations = token_states.mean(1)
  batch_matrices.append(sparse.csr_matrix(batch_vector_representations))

if not batch_matrices:
  raise ValueError('No training instances found; nothing to vectorize.')

# Stack once at the end instead of re-copying the growing matrix on every batch.
scibert_representations = sparse.vstack(batch_matrices, format='csr')
print(scibert_representations.shape)

sparse.save_npz(TRAINING_REPRESENTATIONS_PATH, scibert_representations)
# Copy the saved representations file to Google Drive.
# NOTE(review): the destination directory looks like a placeholder — replace it with the real Drive folder.
!cp $TRAINING_REPRESENTATIONS_PATH '/path_to_your_directory_on_google_drive/'$TRAINING_REPRESENTATIONS_PATH 