# Fetch Recommendations

Notebook to access and fetch recommendations from the database

## Setup

Restart the runtime after running this cell, because it changes the installed numpy version.

In [None]:
!pip install numpy==1.23.2
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install torch
!pip install -qU pinecone-client[grpc]
!pip install Cython

### Imports

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import pinecone
from tqdm.auto import tqdm

# Credentials are read from the environment so they never have to be saved in
# the notebook. Export PINECONE_API_KEY and PINECONE_ENV before starting the
# kernel; when they are unset the values fall back to empty strings.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "")

# Configure the pinecone client with the credentials resolved above.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
    )

In [None]:
import locale
# Force every caller of locale.getpreferredencoding() to see UTF-8.
# The replacement keeps the standard `do_setlocale` parameter so callers that
# pass it (positionally or by keyword) do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

### Connecting to the Index

In [None]:
# Name of the Pinecone index that the rest of the notebook queries.
index_name = "reviewer-assignment"    # Replace with your index name
# GRPCIndex handle for that index; get_matches() calls index.query() on it.
# NOTE(review): presumably requires pinecone.init(...) to have run first - confirm.
index = pinecone.GRPCIndex(index_name)

## Helper Functions

### Model Initialization

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
MIREAD_CHECKPOINT = "biodatlab/MIReAD-Neuro"
m_tokenizer = AutoTokenizer.from_pretrained(MIREAD_CHECKPOINT)
m_model = BertForSequenceClassification.from_pretrained(MIREAD_CHECKPOINT)
# (tokenizer, model) pair, in the order create_miread_embed() reads it.
miread_bundle = m_tokenizer, m_model

### Create Embedding from Query

In [None]:
def create_miread_embed(text,bundle):
  """
  Embeds the given text with the model in `bundle` and returns the result on the CPU.

          Parameters:
                text : input handed straight to the tokenizer (the query abstract in this notebook)
                bundle (tuple) : (tokenizer, model) pair; the model must expose a `.bert` sub-module

          Returns:
                feature (torch.Tensor) : `last_hidden_state[:, 0, :]` of the `.bert` output, moved to the CPU
  """
  tokenizer, model = bundle[0], bundle[1]
  # Use the GPU when one is present, otherwise fall back to the CPU so the
  # notebook also runs on CPU-only runtimes.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  tokens = tokenizer(text,
                   max_length=512,
                   padding=True,
                   truncation=True,
                   return_tensors="pt"
                  )
  # Inputs must live on the same device as the model.
  tokens = tokens.to(device)
  # Inference only: no gradients are needed for the embedding.
  with torch.no_grad():
    out = model.bert(**tokens)
    # Keep only position 0 of the sequence axis (presumably the [CLS] token - confirm).
    feature = out.last_hidden_state[:, 0, :]
  return feature.cpu()

### Function to query the database

In [None]:
def get_matches(query,k=10,include_metadata=True,mode='a'):
  """
  Queries the index to get matches from the database and prints them.

          Parameters:
                query (str) : containing the abstract
                k (int) : fetches best 'k' results
                include_metadata (bool) : fetches the title and abstract of the match as well.
                                          Every mode reads match['metadata'], so this should stay True.
                mode (string) : what to recommend. 'a' for abstracts, 'j' for journals, 'n' for author names.
                                Letters may be combined (e.g. 'jn'); each requested ranking is printed.

          Returns:
                matches : The actual object returned by the index.query() method, if you require any additional data about the matches.
  """
  encoded_query = create_miread_embed(query,miread_bundle)
  # Get matches from the pinecone database
  matches = index.query(encoded_query.tolist()[0],top_k=k,include_metadata=include_metadata)
  # Buckets accumulate the summed match score per journal / per first author.
  # The 'None' key collects matches whose metadata lacks the required value.
  j_bucket = {'None':0}
  n_bucket = {'None':0}
  for i,match in enumerate(matches['matches']):
    if 'j' in mode:
      journal = match['metadata']['journal']
      j_bucket[journal] = j_bucket.get(journal, 0) + match['score']
    if 'n' in mode:
      # NOTE(review): eval() executes whatever string is stored in the database.
      # If the author field is always a plain list literal, ast.literal_eval
      # would be the safe replacement - confirm the stored format first.
      authors = eval(match['metadata']['author'])
      first_author = authors[0] if authors else 'None'
      n_bucket[first_author] = n_bucket.get(first_author, 0) + match['score']
    if 'a' in mode:
      print(f"Match {i+1}")
      print(f"id : {match['id']}")
      print(f"score : {match['score']}")
      print(f"title : {match['metadata']['identifier']}")
      print(f"abstract: {match['metadata']['abstract']}")
      print(f"journal: {match['metadata']['journal']}")
      print(f"author: {match['metadata']['author']}")
      print('----------------------------------------------------------------------------------------------------------------------------')
  # Collect one ranking per requested mode. With mode='a' alone this list is
  # empty and nothing more is printed.
  rankings = []
  if 'j' in mode:
    rankings.append(('J', j_bucket))
  if 'n' in mode:
    rankings.append(('N', n_bucket))
  for label, bucket in rankings:
    print(f"{label} Results :")
    # Highest accumulated score first.
    results = sorted(bucket.items(),key= lambda x : x[1],reverse=True)
    for i, item in enumerate(results):
      if item[0] != 'None':
        print(f"{i}) '{item[0]}' with score {item[1]}")
      elif item[1]:
        # Only mention missing data when some match actually fell into the 'None' bucket.
        print(f"Some similar matches didn't have the required data. The score was {item[1]} ")
  return matches

## Sample Usage

In [None]:
query = "Insert your abstract here."

In [None]:
matches = get_matches(query,k=30,include_metadata=True,mode='j')