# SciBERT Embeddings
With this notebook you can compute SciBERT embeddings for your dataset, store them in `.npz` format, and download them locally.

Please store your data files in a Google Drive directory of your own (``MAIN_DRIVE_DIR``).


|       Variable       | Description |
|:--------------------:|:--------------------------------------------------------:|
|``MAIN_DRIVE_DIR`` | Name of your main directory in your Google Drive |
|  `TRAINING_SET_PATH` | Path to your training set inside the `MAIN_DRIVE_DIR` |
|  `TEST_SET_PATH` | Path to your test set inside the `MAIN_DRIVE_DIR` |
|  `TRAINING_REPRESENTATIONS_PATH` | Path to your training set representations inside the ``MAIN_DRIVE_DIR`` |
|  `TEST_REPRESENTATIONS_PATH` | Path to your test set representations inside the ``MAIN_DRIVE_DIR`` |





In [None]:
# Install sentence-transformers, which provides the SentenceTransformer class imported below.
!pip install sentence-transformers

In [None]:
# Constants
# Input datasets: each is looked up under MAIN_DRIVE_DIR on Drive and copied
# to this same relative path in the working directory.
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
# Output files that receive the computed representations (.npz format).
TRAINING_REPRESENTATIONS_PATH = './scibert_training_representations.npz'
TEST_REPRESENTATIONS_PATH = './scibert_test_representations.npz'
# Name of the folder under /content/drive/MyDrive that holds the data files;
# 'TODO' is a placeholder to be replaced before running.
MAIN_DRIVE_DIR = 'TODO'

In [None]:
## Mount Drive into Colab
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy both datasets from the Drive folder into the working directory.
# shutil.copy is used instead of a shell `cp` so that folder names containing
# spaces are handled correctly and a missing file raises immediately rather
# than surfacing later as a confusing read error.
for set_path in (TRAINING_SET_PATH, TEST_SET_PATH):
    shutil.copy('/content/drive/MyDrive/' + MAIN_DRIVE_DIR + '/' + set_path, set_path)

In [None]:
import json

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return its content."""
    with open(input_path, encoding='utf-8') as source:
        return json.load(source)

In [None]:
import torch
from scipy.sparse import csr_matrix
from torch.utils.data import Dataset

class ClusteringDataset(Dataset):
    """Dataset view over the ``text`` column of a DataFrame.

    Items are addressed by row position and returned as the row's text
    wrapped in ``'[CLS] '`` and ``' [SEP]'`` markers.
    """

    def __init__(self, df):
        # The frame is stored as-is; no copy is made.
        self.data = df

    def __len__(self):
        # Number of rows in the wrapped frame.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # NOTE(review): the tokenizer used downstream may insert [CLS]/[SEP]
        # on its own, which would duplicate these markers -- confirm before
        # changing, since it affects the resulting embeddings.
        selected_row = self.data['text'].iloc[[idx]]
        text = selected_row.to_numpy()[0]
        return '[CLS] ' + text + ' [SEP]'

In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'allenai/scibert_scivocab_uncased' checkpoint as a SentenceTransformer;
# the encoding loop below calls vectorizer.encode on each batch of texts.
vectorizer = SentenceTransformer('allenai/scibert_scivocab_uncased')

In [None]:
import pandas as pd

from torch.utils.data import DataLoader
from scipy import sparse

import os
import shutil

# Encode each dataset with SciBERT and persist the result: one pass per
# (output .npz path, input JSON path) pair.
for representation_path, set_path in zip([TRAINING_REPRESENTATIONS_PATH, TEST_REPRESENTATIONS_PATH], [TRAINING_SET_PATH, TEST_SET_PATH]):

    print('Processing {}. Will be saved in {}'.format(set_path, representation_path))

    data_json = read_json(set_path)
    data_df = pd.json_normalize(data_json['instances'])

    dataset = ClusteringDataset(data_df)
    # shuffle=False keeps the rows of the saved matrix in dataset order.
    data_loader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Collect per-batch matrices and stack them once at the end. Stacking
    # inside the loop re-copies the growing matrix on every batch, and the
    # previous bare `except:` used to seed the accumulator hid real errors.
    batch_matrices = []
    for i, batch in enumerate(data_loader):
        print('batch {}/{}'.format(i + 1, len(data_loader)))
        batch_matrices.append(sparse.csr_matrix(vectorizer.encode(batch)))

    if not batch_matrices:
        # Fail with a clear message instead of passing None to save_npz.
        raise ValueError('No instances found in {}; nothing to encode.'.format(set_path))

    scibert_representations = sparse.vstack(batch_matrices, format='csr')
    print(scibert_representations.shape)

    sparse.save_npz(representation_path, scibert_representations)

    # Copy the saved file to the 'scibert' sub-folder on Drive. The folder is
    # created if missing, and shutil is used instead of a shell `cp` so that
    # Drive folder names containing spaces work and failures raise.
    drive_copy_path = '/content/drive/MyDrive/' + MAIN_DRIVE_DIR + '/scibert/' + representation_path
    os.makedirs(os.path.dirname(drive_copy_path), exist_ok=True)
    shutil.copy(representation_path, drive_copy_path)

In [None]:
from google.colab import files

# Pass each saved representation file to the Colab files helper for download.
for representation_path in [TRAINING_REPRESENTATIONS_PATH, TEST_REPRESENTATIONS_PATH]:
    files.download(representation_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>