## Import libraries

In [None]:
!pip install transformers

In [None]:
import os
import re
import csv
import gzip
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import randint
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModel

import networkx as nx

## SciBERT

In [None]:
# Load the citation graph as an undirected networkx graph. Each line of the
# edge list is a comma-separated pair of integer node ids.
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

# Map node id -> abstract text. Each line of the file has the form
# "<node_id>|--|<abstract>"; the abstract keeps whatever trailing newline the
# line carried.
abstracts = dict()
with open('data/abstracts.txt', 'r', encoding="utf8") as f:
    for line in f:
        # Split on the FIRST delimiter only: an abstract that happens to
        # contain '|--|' would otherwise yield more than two fields and make
        # the tuple unpacking raise ValueError.
        node, abstract = line.split('|--|', 1)
        abstracts[int(node)] = abstract

In [None]:
# Load the pretrained SciBERT tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
model.eval()  # evaluation mode; we only run inference here

# node id -> numpy array of per-token embeddings for that node's abstract.
text2vec = dict()

# Gradients are never needed for feature extraction. Disabling autograd keeps
# PyTorch from building (and holding on to) a computation graph for every
# forward pass, which lowers memory use and speeds up the loop.
with torch.no_grad():
    for node_id in tqdm(nodes):
        abstract = abstracts[node_id]

        # Encode a single abstract, truncated to at most 16 token ids
        # (special tokens included), as a batch of size 1.
        # NOTE(review): with a single sequence, padding=True presumably pads
        # to the sequence's own length, so abstracts shorter than 16 tokens
        # would yield shorter arrays -- confirm whether a fixed length
        # (padding='max_length') is wanted downstream.
        tokens = tokenizer.encode(abstract, padding=True, truncation=True, max_length=16, add_special_tokens=True, return_tensors='pt').to(device)

        # Element 0 of the model output is taken as the per-token embeddings;
        # squeeze(0) drops the leading batch dimension of size 1.
        token_embeddings = model(tokens)[0].cpu().numpy()
        token_embeddings = token_embeddings.squeeze(0)
        text2vec[node_id] = token_embeddings

In [None]:
# Persist the node id -> embedding mapping as a gzip-compressed pickle.
# Make sure the output directory exists so the open() below cannot fail with
# FileNotFoundError after the (expensive) embedding computation.
os.makedirs('embeddings', exist_ok=True)

# The context manager closes (and flushes) the gzip stream even if pickling
# raises; pickle.dump streams into the file instead of first materializing the
# whole serialized payload in memory as pickle.dumps would.
with gzip.GzipFile('embeddings/abstract_embeddings.emb', 'wb') as file:
    pickle.dump(text2vec, file)