# Build Abstract Database

Builds a Pinecone database of vector embeddings from a list of abstracts.

## Setup

In [None]:
!pip install numpy==1.23.2
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install torch
!pip install tensorflow
!pip install -qU pinecone-client[grpc]
!pip install Cython

### Imports

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import pinecone
from tqdm.auto import tqdm

# Pinecone credentials: fill in your own API key and environment name
# before running this cell.
PINECONE_API_KEY = ""
PINECONE_ENV = ""

# Configure the Pinecone client with the credentials above.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [None]:
import locale
# Force UTF-8 as the reported preferred encoding for the rest of the session.
# The replacement keeps the optional `do_setlocale` parameter of the real
# `locale.getpreferredencoding`, so callers that pass it (for example
# `locale.getpreferredencoding(False)`) do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

## Index Initialization

In [None]:
# Name of the Pinecone index that receives the abstract embeddings.
index_name = "reviewer-assignment"

# Create the index only when it is not listed yet; an existing index is reused.
# NOTE(review): dimension=768 presumably matches the width of the vectors
# produced by the embedding model used later - confirm before changing models.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=768, metric="cosine")

# gRPC handle used by the following cells to talk to the index.
index = pinecone.GRPCIndex(index_name)

In [None]:
# Query the index statistics; as the last expression of the cell, the result
# is shown as the cell output when run in a notebook.
index.describe_index_stats()

## Model Initialization

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch

# Load the MIReAD-Neuro tokenizer and model (in that order) and keep them
# together as the (tokenizer, model) pair expected by the embedding helper.
miread_bundle = (
    AutoTokenizer.from_pretrained("biodatlab/MIReAD-Neuro"),
    BertForSequenceClassification.from_pretrained("biodatlab/MIReAD-Neuro"),
)
m_tokenizer, m_model = miread_bundle

### Helper Functions

In [None]:
def create_miread_embed(text, bundle, device=None):
  """
  Embed `text` and return the hidden state of each sequence's first token.

  text : string (or list of strings) handed to the tokenizer; inputs longer
         than 512 tokens are truncated
  bundle : (tokenizer, model) pair; the model must expose its encoder as
           `model.bert`
  device : optional torch device (or device string) to run on; when omitted,
           CUDA is used if available and the CPU otherwise

  Returns a CPU tensor with one row per input sequence, taken from position 0
  of the encoder's last hidden state (the [CLS] position for BERT-style
  tokenizers).
  """
  tokenizer = bundle[0]
  model = bundle[1]
  if device is None:
    # Fall back to the CPU instead of crashing on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
  device = torch.device(device)
  # Moving the model is a no-op when it already lives on `device`.
  model.to(device)
  tokens = tokenizer(
      text,
      max_length=512,
      padding=True,
      truncation=True,
      return_tensors="pt",
  ).to(device)
  # Inference only: skip gradient tracking.
  with torch.no_grad():
    output = model.bert(**tokens)
    embeddings = output.last_hidden_state[:, 0, :]
  return embeddings.cpu()

In [None]:
def _metadata_text(row, key):
  """Return row[key], or the string 'None' when the key is missing or null."""
  value = row.get(key)
  return 'None' if value is None else value


def upsert(data,index,create_embed,bundle,name='',batch_size=128):
  """
  Embed every abstract in `data` and upsert the vectors into `index` in batches.

  data : list of rows (dicts) with an 'abstract' and an 'identifier' field;
         'journal' and 'author' are optional and stored as the string 'None'
         when missing or null
  index : pinecone Index object
  create_embed : function that creates the embedding given an abstract; it is
                 called as create_embed(abstract, bundle) and its result must
                 provide .tolist() whose first item is the vector
  bundle : passed through unchanged to create_embed
  name : prefix of the record IDs, which look like '<name>-<row position>'
  batch_size : number of rows sent per upsert request. If your RAM is crashing,
               it might help to decrease the batch_size

  Returns the index statistics queried after the last batch.
  """
  for start in tqdm(range(0, len(data), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    batch = data[start:start + batch_size]
    # create IDs batch (global row positions, not positions inside the batch)
    ids = [f'{name}-{start + offset}' for offset in range(len(batch))]
    # create metadata batch; null values are replaced because Pinecone does
    # not support null metadata values
    metadatas = [{
        'journal': _metadata_text(row, 'journal'),
        'identifier': row['identifier'],
        'abstract': row['abstract'],
        'author': _metadata_text(row, 'author'),
    } for row in batch]
    # create embeddings (one call per abstract, first row of each result)
    embeddings = [create_embed(row['abstract'], bundle).tolist()[0] for row in batch]
    # create records list for upsert (a real list rather than a lazy iterator)
    records = list(zip(ids, embeddings, metadatas))
    # upsert to Pinecone
    index.upsert(vectors=records)

  # report the number of records in the index back to the caller
  return index.describe_index_stats()

## Data Loading

Your data must be in a CSV file with columns named 'title', 'abstract', 'journal' and 'author'.
Pinecone doesn't allow the use of `np.nan`, so we replace NaN values in the dataframe with `None`.

In [None]:
# Load the abstracts; 'your_data.csv' is a placeholder for your own file.
data = pd.read_csv('your_data.csv')
# Swap every missing value for None. Casting to object first keeps None from
# being coerced back to NaN in numeric columns, and `where` avoids
# `DataFrame.replace(np.nan, None)`, which some older pandas releases treat as
# "no replacement value given" and answer with a forward fill.
data = data.astype(object).where(data.notna(), None)
data.info()

In [None]:
# Keep only the four columns used for the upload, expose the title under the
# 'identifier' key and convert the frame into a list with one dict per row.
data = (
    data[['title', 'abstract', 'author', 'journal']]
    .rename(columns={'title': 'identifier'})
    .to_dict('records')
)

## Upsert To Database

In [None]:
# Embed all abstracts with the MIReAD helper and upload them; the record IDs
# get the prefix 'id'.
upsert(
    data,
    index,
    create_miread_embed,
    miread_bundle,
    name='id',
)