In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, pipeline

### Create Embeddings

This notebook creates embeddings from article titles and abstracts for downstream analysis. It currently uses [SPECTER](https://github.com/allenai/specter) as the feature extractor.

In [2]:
# Locations of the cleaned article table (input) and of the JSON file that
# will receive the embeddings (output), both relative to this notebook.
input_path = "../data/target1_cleaned.csv"
save_path = "../embeddings/specter_embeddings_target1.json"

# Load the article table; later cells read its Title, Abstract and Refid columns.
df = pd.read_csv(input_path)

In [3]:
# Load the pretrained tokenizer and model published under the "allenai/specter"
# identifier. The recorded output below shows a ~440 MB weights file being
# fetched by this cell.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
model = AutoModel.from_pretrained("allenai/specter")

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# One {"Title": ..., "Abstract": ...} dict per article; missing values are
# replaced by empty strings before the conversion.
papers = df[["Title", "Abstract"]].fillna("").to_dict("records")

# Build the model input for each article: title, then the tokenizer's
# separator token, then the abstract (empty string when absent).
title_abs = [
    "".join((paper["Title"], tokenizer.sep_token, paper.get("Abstract") or ""))
    for paper in papers
]

def create_embeddings(tokenizer, model, title_abs, chunk_size=50):
    """Embed each text in ``title_abs`` and return the vectors as nested lists.

    The texts are processed in chunks of ``chunk_size`` so that only one
    chunk is tokenized and passed through the model at a time. For every text
    the hidden state of the first token is taken as its embedding.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(chunk, padding=True, truncation=True,
        return_tensors="pt", max_length=512)``; its result is unpacked into
        ``model`` as keyword arguments.
    model : callable
        Must return an object exposing a ``last_hidden_state`` tensor that can
        be indexed as ``[:, 0, :]``.
    title_abs : list of str
        Texts to embed, one per article.
    chunk_size : int, default 50
        Number of texts per forward pass; must be at least 1.

    Returns
    -------
    list of list
        One embedding (a list of numbers) per input text, in input order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Separate title_abs into article chunks to prevent kernel crashes
    chunks = [
        title_abs[start : start + chunk_size]
        for start in range(0, len(title_abs), chunk_size)
    ]

    embed = []

    # Inference only: without no_grad every forward pass records an autograd
    # graph, which costs memory and time and is never used here.
    with torch.no_grad():
        for chunk in tqdm(chunks):

            # Preprocess the input
            inputs = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            result = model(**inputs)

            # Take the first token of every sequence in the batch as its embedding
            embeddings = result.last_hidden_state[:, 0, :]

            # extend() appends in place instead of copying the whole list per chunk
            embed.extend(embeddings.tolist())

    return embed

# Embed every title+abstract string using the default chunk size of 50.
# The recorded run below took about 32 minutes for 19 chunks.
specter_embed = create_embeddings(tokenizer, model, title_abs)

100%|██████████████████████████████████████████| 19/19 [32:30<00:00, 102.66s/it]


In [5]:
def save_embedding_json(df, file_name, embedding, id_type="refid"):
    """Write one JSON record per article, pairing its identifier with its embedding.

    Each record has the keys ``"id"`` (the row's index label in ``df``),
    either ``"Refid"`` or ``"PMID"`` (the article identifier cast to ``int``),
    and ``"embedding"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Article table. Must contain a ``"Refid"`` column when
        ``id_type == "refid"`` and a ``"pmid"`` column otherwise.
    file_name : str or path-like
        Destination JSON file; missing parent directories are created.
    embedding : iterable
        One JSON-serialisable embedding per row of ``df``, in row order.
    id_type : str, default "refid"
        ``"refid"`` stores the ``Refid`` column under ``"Refid"``; any other
        value stores the ``pmid`` column under ``"PMID"``.

    Raises
    ------
    ValueError
        If the number of embeddings differs from the number of rows in ``df``.
    """
    # zip() would silently drop the surplus rows or vectors on a length
    # mismatch, so check explicitly before anything is written.
    embedding = list(embedding)
    if len(embedding) != len(df):
        raise ValueError(
            f"got {len(embedding)} embeddings for {len(df)} rows; "
            "they must match one-to-one"
        )

    if id_type == "refid":
        id_key, id_values = "Refid", df["Refid"]
    else:
        id_key, id_values = "PMID", df["pmid"]

    # Pair rows positionally so repeated index labels cannot make the
    # identifier lookup return more than one value.
    embed_json = [
        {"id": label, id_key: int(article_id), "embedding": vector}
        for label, article_id, vector in zip(df.index, id_values, embedding)
    ]

    # Create the output folder up front so a missing directory does not
    # discard embeddings that were expensive to compute.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_name, "w") as fp:
        json.dump(embed_json, fp)


# Write the embeddings to save_path, identifying each article by its Refid
# (the default id_type).
save_embedding_json(df, save_path, specter_embed)