<a href="https://colab.research.google.com/github/biodatlab/nbdt-llm/blob/main/Build_VecStore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Builds a LangChain FAISS vector store from a list of paper abstracts embedded with the MIReAD model.

## Some Setup

In [None]:
# Install the notebook's dependencies into the current runtime.
# Only transformers is pinned to an exact version; the other packages resolve
# to whatever release is current when this cell runs.
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install langchain
!pip install torch
!pip install faiss-cpu

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.28.0
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 k

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch

In [None]:
import locale
# Replace the locale lookup so it always reports UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for `!shell` commands failing in Colab
# when the runtime reports a non-UTF-8 locale -- confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Interactive Hugging Face login: prompts for an access token and caches it on the runtime.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification

# Fetch the MIReAD tokenizer and sequence-classification model from the Hugging Face Hub.
m_tokenizer = AutoTokenizer.from_pretrained("biodatlab/MIReAD-Neuro-Large")
m_model = BertForSequenceClassification.from_pretrained("biodatlab/MIReAD-Neuro-Large")
# (tokenizer, model) pair, in the order create_miread_embed reads it (index 0, then 1).
miread_bundle = (m_tokenizer,m_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/415 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/231k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

In [None]:
def create_miread_embed(sents,bundle):
  """
  Embed text with the BERT encoder of a MIReAD classification model.

  sents  : a string or list of strings, passed straight to the tokenizer
  bundle : (tokenizer, model) pair; the model must expose its encoder as `.bert`

  Returns a CPU tensor holding, for each input sequence, the last-layer hidden
  state of the first token (position 0).

  The model is moved to the GPU when one is available and otherwise stays on
  the CPU, so the function also works on runtimes without CUDA.
  NOTE(review): no `model.eval()` call is made here; the caller is expected to
  pass a model that is already in inference mode.
  """
  tokenizer, model = bundle[0], bundle[1]
  # Pick the device once and use it for both the weights and the inputs.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  tokens = tokenizer(sents,
                   max_length=512,
                   padding=True,
                   truncation=True,
                   return_tensors="pt"
                  )
  tokens = tokens.to(device)
  # Inference only: skip autograd bookkeeping.
  with torch.no_grad():
    out = model.bert(**tokens)
    feature = out.last_hidden_state[:, 0, :]
  return feature.cpu()

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding object handed to FAISS.from_embeddings in add_to_db.
# NOTE(review): the stored vectors come from create_miread_embed (first-token
# hidden state), while this object loads the same checkpoint through
# sentence-transformers. Confirm that its pooling produces comparable vectors
# before using it to embed search queries.
model_name = "biodatlab/MIReAD-Neuro-Large"
# Use the GPU when present, otherwise fall back to CPU instead of failing.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
faiss_embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

def add_to_db(data,create_embed,bundle,name='',batch_size=128):
  """
  Build a FAISS vector store from paper records, one batch at a time.

  data         : list of dict records. Each must have 'title' and 'abstract';
                 'journal', 'authors', 'link', 'date' and 'submitter' are
                 optional and stored as the string 'None' when absent.
  create_embed : callable (abstract, bundle) -> object whose `.tolist()[0]`
                 is the embedding vector for that abstract
  bundle       : passed through unchanged to create_embed
  name         : prefix for document ids, which have the form
                 '<name>-<position of the record in data>'
  batch_size   : number of records embedded and merged per step

  Returns the merged FAISS store, or None when data is empty.
  Raises ValueError if batch_size is not positive.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")
  vecdb = None
  for i in tqdm(range(0, len(data), batch_size)):
    # Slicing clips at the end of the list, so the last batch may be shorter.
    batch = data[i:i+batch_size]
    # Ids follow the record's position in the full list, not within the batch.
    ids = [name + '-' + str(x) for x in range(i, i+len(batch))]
    metadatas = [{
                  'journal':row.get('journal','None'),
                  'title':row['title'],
                  'abstract': row['abstract'],
                  'authors':row.get('authors','None'),
                  'link':row.get('link','None'),
                  'date':row.get('date','None'),
                  'submitter':row.get('submitter','None'),
                  } for row in batch]
    # Abstracts are embedded one by one; each call yields a single-row result.
    em = [create_embed(row['abstract'],bundle).tolist()[0] for row in batch]
    texts = [row['abstract'] for row in batch]
    records = list(zip(texts, em))
    vecdb_batch = FAISS.from_embeddings(records,faiss_embedder,metadatas=metadatas,ids=ids)
    if vecdb is None:
      # The first batch becomes the store that later batches are merged into.
      vecdb = vecdb_batch
    else:
      vecdb.merge_from(vecdb_batch)
  return vecdb

Downloading (…)e5c4b/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)6e3e5c4b/config.json:   0%|          | 0.00/231k [00:00<?, ?B/s]

Downloading optimizer.pt:   0%|          | 0.00/894M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Downloading rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Downloading scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)e5c4b/tokenizer.json:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/415 [00:00<?, ?B/s]

Downloading (…)b/trainer_state.json:   0%|          | 0.00/27.0k [00:00<?, ?B/s]

Downloading training_args.bin:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

Downloading (…)976e3e5c4b/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/biodatlab_MIReAD-Neuro-Large were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading...
From: https://drive.google.com/uc?id=1-123xEqdY9uNhgoYjayroHr70kRaP3rj
To: /content/data_final.json
100% 58.1M/58.1M [00:00<00:00, 77.8MB/s]


In [None]:
# Per-author-id table; load_nbdt reads its 'authorId' and 'papers' columns.
nbdt_data = pd.read_json('data_final.json')
# One row per researcher with the list of author ids that belong to them.
# NOTE(review): no visible cell fetches id_list.csv -- it presumably has to be
# uploaded to the runtime before this cell runs; confirm.
aliases = pd.read_csv('id_list.csv')

In [None]:
# data = pd.read_csv("MEDLINE_COMPLETE.csv",parse_dates=['P_Date'])

In [None]:
# data = data[['PMID','Title','Abstract','Author','Journal','P_Date']]
# data['PMID'] = data['PMID'].apply(lambda x: "http://www.ncbi.nlm.nih.gov/pubmed/" + str(x))
# data = data.rename(columns={'Title':'title','Abstract':'abstract','Journal':'journal','Author':'authors','PMID':'link','P_Date':'date'})
# data['submitter'] = 'None'

In [None]:
aliases

Unnamed: 0,Full Name,Author IDs
0,konrad kording,"['150174214', '117802858', '3282030', '2125917..."
1,jonathan pillow,"['122413326', '2066071995', '1791723', '104134..."
2,jakob macke,"['81550838', '1748468', '120371668', '18257937..."
3,alex gomez-marin,[]
4,roozbeh kiani,"['2050606', '50018291', '81827077', '147248542']"
...,...,...
133,timothy west,"['46464140', '1886101288', '47973851', '205975..."
134,josue nassar,['49350812']
135,matthew schlegel,"['145304454', '40369965']"
136,matthew perich,['5228640']


In [None]:
# Keep a single row per 'Full Name' so no researcher is processed twice.
aliases = aliases.drop_duplicates('Full Name')
aliases

Unnamed: 0,Full Name,Author IDs
0,konrad kording,"['150174214', '117802858', '3282030', '2125917..."
1,jonathan pillow,"['122413326', '2066071995', '1791723', '104134..."
2,jakob macke,"['81550838', '1748468', '120371668', '18257937..."
3,alex gomez-marin,[]
4,roozbeh kiani,"['2050606', '50018291', '81827077', '147248542']"
...,...,...
133,timothy west,"['46464140', '1886101288', '47973851', '205975..."
134,josue nassar,['49350812']
135,matthew schlegel,"['145304454', '40369965']"
136,matthew perich,['5228640']


In [None]:
nbdt_data

Unnamed: 0,authorId,name,paperCount,papers
0,150174214,K. Kording,34,[{'paperId': '3d1de4ece70d3b47202a39963e2b09a1...
1,117802858,Kording Konrad,4,[{'paperId': 'bfecc78ea37dd1ce1950a95204abafdf...
2,3282030,Konrad Paul Kording,364,[{'paperId': '5cba4b2a4d0b74c8aad0c94b6f468f6c...
3,2125917123,Konrad P. Kording,1,[{'paperId': 'c78c99bed12d9e0915e34260f485c87d...
4,122413326,J. Pillow,3,[{'paperId': '50920b36cca05c524a658a453fd5484e...
...,...,...,...,...
1516,2150616033,Ryota Kobayashi,9,[{'paperId': '8ee2cc3dfa4e72ba5ebdb3ea35a17aa0...
1517,2070509747,Kobayashi Ryota,5,[{'paperId': '6716a26c8ba15f00781180e0ae53bf17...
1518,2180111021,Ryota Kobayashi,1,[{'paperId': 'f37e0281abf075fcd6faf78f8948eedd...
1519,2177233469,Ryota Kobayashi,1,[{'paperId': '1d13f9c2238eea124c304bdbd74d0b4f...


In [None]:
def load_nbdt(data,aliases):
  """
  Flatten the per-author paper listings into one de-duplicated list of records.

  data    : DataFrame with an 'authorId' column and a 'papers' column holding,
            per row, a list of paper dicts with 'url', 'title', 'abstract' and
            'year' keys and an optional 'journal' dict
  aliases : DataFrame whose first column is the researcher's name and whose
            second column is the string form of a list of author ids,
            e.g. "['123', '456']"

  Returns (records, (no_abst_count, no_journal_count)).
  Each record has the keys 'journal', 'title', 'abstract', 'link', 'date',
  'authors' and 'submitter'. A paper is kept once, keyed by its url; the
  'authors' field names only the researcher under whom it was first seen.
  Both counters are incremented for every paper occurrence examined, including
  occurrences later dropped as duplicates.
  """
  # Local import so the function carries its own dependency.
  import ast

  nbdt_records = []
  seen_urls = set()  # set membership keeps de-duplication linear
  no_abst_count = 0
  no_journal_count = 0
  for row in aliases.itertuples():
    # row[0] is the index; the name and id list are the first two columns.
    name = row[1]
    # The id list is stored as text; literal_eval parses it without executing
    # arbitrary code the way eval would.
    auth_ids = [int(x) for x in ast.literal_eval(row[2])]
    # Use the frame passed in rather than a notebook-level global.
    papers = data.loc[data['authorId'].isin(auth_ids), 'papers']
    for paper_set in papers:
      for paper in paper_set:
        url = paper['url']
        abst = paper['abstract']
        journal_info = paper.get('journal')
        journal = journal_info.get('name') if journal_info else None
        if not journal:
          # Covers both a missing journal entry and an entry without a name,
          # so the metadata always carries the 'None' sentinel string.
          journal = 'None'
          no_journal_count += 1
        if not(abst):
          abst = ''
          no_abst_count += 1
        if url in seen_urls:
          continue
        seen_urls.add(url)
        nbdt_records.append({'journal':journal,'title':paper['title'],'abstract':abst,'link':url,'date':paper['year'],'authors':[name],'submitter':'None'})
  return nbdt_records, (no_abst_count,no_journal_count)
# Build the paper records; no_counts is (missing-abstract count, missing-journal count).
nbdt_recs, no_counts = load_nbdt(nbdt_data,aliases)

In [None]:
# Sanity check: the two numbers are equal when every record has a distinct link.
links = [r['link'] for r in nbdt_recs]
len(set(links)),len(links)

(35360, 35360)

In [None]:
from dateutil.parser import parse

def parse_date(date):
  """
  Parse a loosely formatted publication date string.

  The string is first given to dateutil's `parse`. If that fails, a few known
  irregular forms are rewritten and parsed again:
    - a season word (Summer, Winter, Fall, Autumn, Spring) is replaced by a
      month abbreviation (Jul, Jan, Oct, Oct, Apr)
    - '<first token> ... Quarter' with First/1st .. Fourth/4th becomes the
      first token followed by Jan, Apr, Jul or Oct
    - ranges written with '-' or '/' keep only the part before the separator
      in each whitespace-separated token ('Ene' is rewritten to 'Jan' first
      in the '-' case)

  Returns whatever `parse` returns, or None when no rewrite rule applies.
  Errors raised by the second parse attempt are not caught.
  """
  try:
    return parse(date)
  except (ValueError, OverflowError, TypeError):
    # Not directly parseable: fall through to the rewrite rules below.
    pass

  tokens = date.split()

  # Checked in this order; the season must appear as a whole token.
  season_months = (("Summer","Jul"),("Winter","Jan"),("Fall","Oct"),("Autumn","Oct"),("Spring","Apr"))
  for season, month in season_months:
    if season in tokens:
      return parse(date.replace(season, month))

  if 'Quarter' in tokens:
    # The first token is combined with the quarter's first month.
    if (('First' in tokens) or ('1st' in tokens)):
      date = tokens[0] + ' Jan'
    elif (('Second' in tokens) or ('2nd' in tokens)):
      date = tokens[0] + ' Apr'
    elif (('Third' in tokens) or ('3rd' in tokens)):
      date = tokens[0] + ' Jul'
    elif (('Fourth' in tokens) or ('4th' in tokens)):
      date = tokens[0] + '  Oct'
    return parse(date)

  if '-' in date:
    date = date.replace('Ene','Jan')
    # Keep the start of each range, e.g. 'Jan-Feb' -> 'Jan'.
    return parse(' '.join([part.split('-')[0] for part in date.split()]))

  if '/' in date:
    return parse(' '.join([part.split('/')[0] for part in date.split()]))

  # No rule matched; the caller receives None.
  return None
# data['date'] = data['date'].apply(parse_date)

In [None]:
# data['date'] = data['date'].dt.strftime('%b %Y')
# data_dict = data.to_dict('records')

In [None]:
# data_dict[0]

In [None]:
# small_data = data_dict[:200]

In [None]:
# faissdb = add_to_db(small_data,create_miread_embed,miread_bundle,'doc')

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# faissdb.save_local("faiss")

In [None]:
# Embed every NBDT record and build the FAISS store; document ids are prefixed with 'nbdt'.
faissdb2 = add_to_db(nbdt_recs,create_miread_embed,miread_bundle,'nbdt')
# Persist the store under the nbdt_miread_large/ directory.
faissdb2.save_local("nbdt_miread_large")

  0%|          | 0/277 [00:00<?, ?it/s]

In [None]:
from google.colab import files
# files.download('/content/nbdt_miread/index.faiss')
# files.download('/content/nbdt_miread/index.pkl')
# Send the saved store's two files (index.faiss and index.pkl) to the browser as downloads.
files.download('/content/nbdt_miread_large/index.faiss')
files.download('/content/nbdt_miread_large/index.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# query = small_data[5]['abstract']

In [None]:
# query[:100]

'Molecular neurobiological insight into human nervous tissues is needed to generate next-generation t'

In [None]:
# docs_and_scores = faissdb.similarity_search_with_score(query)

In [None]:
# for d in docs_and_scores:
#   print(d[0].page_content[:100],d[1])

Molecular neurobiological insight into human nervous tissues is needed to generate next-generation t 73.296104
The formation of the Society for Neuroscience in 1969 was a scientific landmark, remarkable for the  725.89545
Parkinson's disease (PD) is the second most prevalent neurodegenerative disease among the elderly. T 778.618
To identify novel genes associated with ALS, we undertook two lines of investigation. We carried out 786.0128
