In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
  !pip install transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 44.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 44.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.7.15


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# The CSV, pickle and checkpoint files opened later in this notebook are
# referenced by bare file name, so they are resolved against this directory.
%cd /content/drive/MyDrive/AI5 MLOPS/

/content/drive/.shortcut-targets-by-id/1IBzU3wncSMdMkz8e2pEnfntllTMVQ3O0/AI5 MLOPS


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import datetime
import random
import json

import torch
from keras_preprocessing.sequence import pad_sequences
import torch.optim as optim
from transformers import BertTokenizer,  BertForMaskedLM ,AdamW ,BertConfig 
from sklearn.metrics.pairwise import cosine_similarity


from sklearn.metrics.pairwise import cosine_similarity

def load_data():
  """Read the papers table from disk and return a lightly cleaned copy.

  Loads ``papers_with_abstract.csv`` from the current working directory,
  prints the shape of the raw table, removes the ``Unnamed: 0`` and
  ``source_id`` columns, and deletes every newline character found in the
  ``abstract`` and ``full_text`` columns.

  Returns:
    pandas.DataFrame: the table without the two dropped columns and with
    newline-free ``abstract`` / ``full_text`` text.
  """
  papers = pd.read_csv("papers_with_abstract.csv")
  print("Data Shape :" , papers.shape)

  papers = papers.drop(columns=["Unnamed: 0", "source_id"])

  # Newlines are removed outright; they are not replaced by a space.
  for text_col in ("abstract", "full_text"):
    papers[text_col] = papers[text_col].replace(r'\n', '', regex=True)

  return papers

def model_load(FILE = None):
  """Build the SciBERT masked-LM model and tokenizer, optionally restoring a checkpoint.

  Args:
    FILE: Path (or file-like object) of a ``torch.save`` checkpoint whose
      ``'model_state'`` entry holds the fine-tuned weights.  When ``None``
      (the default) no checkpoint is read and the stock
      ``allenai/scibert_scivocab_uncased`` weights are returned unchanged.

  Returns:
    tuple: ``(device, sciBERT_tokenizer, model)``.  ``device`` is the CUDA
    device when one is available and the CPU otherwise; ``model`` itself is
    always returned on the CPU and is configured with
    ``output_hidden_states=True``.

  Raises:
    KeyError: if the checkpoint has no ``'model_state'`` entry.
  """
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  # SciBERT pretrained model identifier from the Allen AI repo.
  pretrained_model = 'allenai/scibert_scivocab_uncased'

  sciBERT_tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                    do_lower_case=True)

  # Masked-LM head on top of SciBERT; hidden states are requested so callers
  # can read per-layer token representations from the forward pass.
  model = BertForMaskedLM.from_pretrained(pretrained_model,
                                          output_attentions=False,
                                          output_hidden_states=True)

  if FILE is not None:
    # The model is handed back on the CPU, so the weights are mapped straight
    # to the CPU instead of taking a detour through the GPU.  Only the model
    # weights are restored: optimizer state is irrelevant for inference.
    # NOTE(review): torch.load unpickles the file -- only open checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(FILE, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state'])

  model.to(torch.device('cpu'))
  return device , sciBERT_tokenizer , model


def convert_single_abstract_to_embedding(in_text, MAX_LEN = 150):
    """Embed one text as the last-layer hidden state of its first token.

    Uses the module-level ``sciBERT_tokenizer`` and ``model`` objects, which
    must already exist when this function is called.

    Args:
        in_text: The text to embed.
        MAX_LEN: Number of token ids fed to the model.  Longer inputs are
            truncated, shorter ones are right-padded with id 0.

    Returns:
        numpy.ndarray: 1-D vector taken from the final hidden layer at the
        first token position (the ``[CLS]`` token added by the tokenizer).
    """
    # truncation=True makes the cut-off at MAX_LEN explicit; without it the
    # tokenizer warns and falls back to an implicit default strategy.
    token_ids = sciBERT_tokenizer.encode(
                        in_text,
                        add_special_tokens = True,
                        max_length = MAX_LEN,
                        truncation = True,
                   )

    # Right-pad (with zeros) to exactly MAX_LEN and drop the outer batch list.
    padded_ids = pad_sequences([token_ids], maxlen=MAX_LEN, dtype="long",
                               truncating="post", padding="post")[0]

    # Add the batch dimension: shape (1, MAX_LEN).
    input_ids = torch.tensor(padded_ids).unsqueeze(0)
    # Padding positions hold id 0, every real token has a positive id.
    attention_mask = (input_ids > 0).long()

    # Run the inputs on whatever device the model currently lives on.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

    # Evaluation mode: plain feed-forward pass, no dropout.
    model.eval()

    with torch.no_grad():
        outputs = model(
                        input_ids = input_ids,
                        token_type_ids = None,
                        attention_mask = attention_mask)

    # With no labels passed, element 1 of the output holds the per-layer
    # hidden states (this relies on the model having been created with
    # output_hidden_states=True).  The last entry is the final encoder layer.
    hidden_states = outputs[1]
    batch_i = 0  # Only one input in the batch.
    token_i = 0  # The first token, corresponding to [CLS].
    embedding = hidden_states[-1][batch_i][token_i]

    # Move to the CPU and convert to a numpy ndarray.
    return embedding.detach().cpu().numpy()

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second and then formatted the
    way ``datetime.timedelta`` prints itself, so durations of a day or more
    come out as e.g. ``'1 day, 1:00:00'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def process_query(query_text):
    """Embed ``query_text`` and return the vector as a single-row 2-D array.

    The embedding comes from ``convert_single_abstract_to_embedding``; it is
    reshaped to ``(1, n_features)`` so it can be passed to the cosine
    similarity search.
    """
    embedding = convert_single_abstract_to_embedding(query_text)
    return np.array(embedding).reshape(1, -1)


def get_top_N_articles_cosine(query_text, data, top_N=5, skip_self_match=None):
    """Return the ``top_N`` rows of ``data`` most similar to ``query_text``.

    Args:
        query_text: Free text to search for.
        data: DataFrame with ``title``, ``abstract`` and ``embeddings``
            columns.  Each ``embeddings`` cell must be a 2-D array that
            ``cosine_similarity`` accepts.  As a side effect, the column
            ``cos_sim`` is written to (or overwritten in) ``data``.
        top_N: Number of articles to return (default 5).
        skip_self_match: Whether to discard the best-scoring row.
            ``True`` always discards it, which suits a query that is itself
            a document of ``data``.  ``False`` never discards it.  ``None``
            (default) discards it only when its similarity is numerically
            1, i.e. when the query really is an exact match of a stored
            document.  A free-text query therefore keeps its best hit
            instead of silently losing it.

    Returns:
        pandas.DataFrame: up to ``top_N`` rows with the columns ``title``,
        ``abstract`` and ``cos_sim``, sorted by descending similarity.
    """
    query_vect = process_query(query_text)
    relevant_cols = ["title", "abstract", "cos_sim"]

    # cosine_similarity returns a 2-D array per row; keep its first entry.
    data["cos_sim"] = data["embeddings"].apply(
        lambda emb: cosine_similarity(query_vect, emb)[0][0])

    ranked = data.sort_values(by='cos_sim', ascending=False)

    if skip_self_match is None:
        # A document compared with itself scores 1 (up to rounding error).
        skip_self_match = bool(
            len(ranked) > 0
            and np.isclose(ranked["cos_sim"].iloc[0], 1.0, rtol=0.0, atol=1e-6))

    first = 1 if skip_self_match else 0
    return ranked.iloc[first:first + top_N][relevant_cols]



if __name__ == "__main__":
  # The papers table is loaded (and its shape printed) here; the searches
  # below run on the two pickled frames, not on this one.
  data = load_data()

  # Assigned at module level on purpose: convert_single_abstract_to_embedding
  # reads `sciBERT_tokenizer` and `model` as globals.
  device , sciBERT_tokenizer , model  = model_load("finalcheckpoint19.pth")

  # NOTE(review): unpickling can execute arbitrary code -- only read pickle
  # files that were produced by a trusted source.
  a_data = pd.read_pickle("data_abstract.pkl")
  t_data = pd.read_pickle("data_title.pkl")

  t0 = time.time()
  query_text_test = ["HUMAN BRAIN MIMICS" , "VLSI neural network"]
  for query in query_text_test:
    # Results from t_data are stacked above those from a_data.
    per_frame = [get_top_N_articles_cosine(query, frame)
                 for frame in (t_data, a_data)]
    output = pd.concat(per_frame, axis=0)

    print("Results for \" "+query+ " \"query")
    display(output)
    # Elapsed time is measured from the start of the whole loop, so the
    # figure is cumulative across queries.
    print("Time taken :",format_time(time.time() - t0))

Data Shape : (9313, 6)


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strat

Results for " HUMAN BRAIN MIMICS "query


Unnamed: 0,title,abstract,cos_sim
6077,Stochastic Online AUC Maximization,Area under ROC (AUC) is a metric which is wide...,0.991502
8458,Levenshtein Transformer,Modern neural sequence generation models are b...,0.991375
3042,Supervised Topic Models,We introduce supervised latent Dirichlet alloc...,0.991162
6556,Triple Generative Adversarial Nets,Generative Adversarial Nets (GANs) have shown ...,0.99111
1968,Hyperkernels,We consider the problem of choosing a kernel s...,0.990959
2475,Fast Gaussian Process Regression using KD-Trees,1,0.961937
37,SPONTANEOUS AND INFORMATION-TRIGGERED SEGMENT...,he brain works in a state-dependent manner: pr...,0.949969
38,Simulations Suggest Information Processing Rol...,computer model of the hippocampal pyramidal c...,0.929786
60,HIGH DENSITY ASSOCIATIVE MEMORIES,rom a description of desired properties,0.900976
41,"Discovering Structure from Motion in Monkey, M...",he ability to obtain three-dimensional structu...,0.866808


Time taken : 0:00:07
Results for " VLSI neural network "query


Unnamed: 0,title,abstract,cos_sim
8819,Variational Graph Recurrent Neural Networks,Representation learning over graph structured ...,0.994412
7209,Reversible Recurrent Neural Networks,Recurrent neural networks (RNNs) provide state...,0.994219
7911,Hyperbolic Graph Convolutional Neural Networks,Graph convolutional neural networks (GCNs) emb...,0.994154
7978,Hyperbolic Graph Neural Networks,Learning from graph-structured data is an impo...,0.994024
6054,Doubly Convolutional Neural Networks,Building large models with parameter sharing a...,0.993757
2475,Fast Gaussian Process Regression using KD-Trees,1,0.958352
37,SPONTANEOUS AND INFORMATION-TRIGGERED SEGMENT...,he brain works in a state-dependent manner: pr...,0.947613
38,Simulations Suggest Information Processing Rol...,computer model of the hippocampal pyramidal c...,0.934392
60,HIGH DENSITY ASSOCIATIVE MEMORIES,rom a description of desired properties,0.901424
41,"Discovering Structure from Motion in Monkey, M...",he ability to obtain three-dimensional structu...,0.871801


Time taken : 0:00:13
