# Cosine Similarity | Abstracts | Pretrained Model

 ⠀

 ⠀

# Loading Abstracts from File

### Import Required Libraries

In [1]:
from utility import Corpus, Document, SearchEngine, Evaluator

In [2]:
corp = Corpus('search_list_txt/')

abstracts = corp.split_sections('abstract')

search_list_txt/Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres.txt
search_list_txt/Asymptomatic Carrier State  Acute Respiratory Disease.txt
search_list_txt/Phospolipids And Application.txt
search_list_txt/Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss.txt
search_list_txt/Assembling Structurally Customizable Synthetic Carriers.txt
search_list_txt/Molecular Immune Pathogenesis And Diagnosis Of Covid 19.txt
search_list_txt/Redox Sensitive  Peg Shielded Carboxymethyl Pei.txt
search_list_txt/Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder.txt
search_list_txt/Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis.txt
search_list_txt/The Effect Of Pramlintide, An Antidiabetic Amylin Analogue On Angiogenesis Related Markers In Vitro.txt
search_list_txt/Pharmaceutical Care In A Patie

In [3]:
abstracts

{'Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres': 'transarterial chemoembolization tace has been widely introduced to treat hepatocellularcarcinoma hcc especially for unresectable patients for decades. however tace evokesan angiogenic response due to the secretion of vascular endothelial growth factor vegfresulting in the formation of new blood vessels and eventually tumor recurrence. thus weaimed to develop regorafenib regoloaded poly lactidecoglycolide plga microspheresthat enabled localized and sustained drug delivery to limit proangiogenic responses',
 'Asymptomatic Carrier State  Acute Respiratory Disease': 'since the emergence of coronavirus disease 2019 covid19 formerly known as the 2019 novel coronavirus 2019ncov in wuhan china in december 2019 which is caused by severe acute respiratory syndrome coronavirus 2 sarscov2 more than 75000 cases have been reported in 32 countriesregions resulting in more than 2000 deaths worldwide. despite the fact that most covid19 ca

 ⠀

# Preprocess Abstracts

### Import Required Libraries

In [4]:
# numpy imports
import numpy as np
from numpy import dot
from numpy.linalg import norm

# nltk imports
import nltk
from nltk import punkt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# punctuation list
puncs="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"

### Remove Punctuations and Stopwords from Abstracts

In [5]:
stop_words = set(stopwords.words('english')) 

In [6]:
abstract_filtered = dict()

for key, abstract in abstracts.items():
  temp_list = []
  print(key,"\n")
  for word in word_tokenize(abstract):
    if word not in stop_words:
      if word not in puncs:
        temp_list.append(word)
  abstract_filtered[key] = temp_list

Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres 

Asymptomatic Carrier State  Acute Respiratory Disease 

Phospolipids And Application 

Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss 

Assembling Structurally Customizable Synthetic Carriers 

Molecular Immune Pathogenesis And Diagnosis Of Covid 19 

Redox Sensitive  Peg Shielded Carboxymethyl Pei 

Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder 

Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis 

The Effect Of Pramlintide, An Antidiabetic Amylin Analogue On Angiogenesis Related Markers In Vitro 

Pharmaceutical Care In A Patient With Recurrent Infection After Liver Transplant 

Nanotechnology And Vaccine Development 

Hollow Fiber Based Liquid Phase Microextraction 

Pharmacological Options Beyond Proton Pump Inhibitor

 ⠀

# Word2Vec Search

### Import Required Libraries

In [7]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
import gensim.downloader as api
from pprint import pprint

### Load Model from Memory

In [8]:
model = api.load("glove-twitter-50")

### Convert Abstracts to Vectors

In [9]:
#Abstract to vector
abstract2vec=dict()
for key, abstract in abstract_filtered.items():
  centroid = np.zeros((50,))
  for word in abstract:
    try:
      centroid += model[word.lower()]
    except:
      pass
  centroid /= len(abstract)
  abstract2vec[key] = centroid

abstract2vec

{'Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres': array([-0.12479275,  0.17013659, -0.25284041,  0.07199952,  0.11043473,
         0.04493812,  0.10610927, -0.36099259, -0.09909542, -0.29467104,
        -0.03400764, -0.33096792, -1.50815028,  0.12759448,  0.24886575,
         0.13988896,  0.13908802, -0.08152196,  0.13063436, -0.21846889,
        -0.08374892,  0.15320835,  0.01138815, -0.02396688, -0.20624868,
         0.55014025,  0.0260206 ,  0.13125911, -0.09416927,  0.37362843,
         0.17869767, -0.18835398,  0.1152775 , -0.1499756 ,  0.15868358,
        -0.12736281, -0.15712371,  0.25324392,  0.00533474, -0.05771021,
         0.17947104, -0.01338884,  0.11007594, -0.03567727, -0.02510219,
        -0.14890528, -0.0126895 ,  0.18132062, -0.16524269,  0.14843898]),
 'Asymptomatic Carrier State  Acute Respiratory Disease': array([-0.08006624,  0.31573784, -0.39160539,  0.0543032 ,  0.052363  ,
         0.00500647,  0.15511289, -0.30982707, -0.07494067, -0.45312202,
  

### Search for a Query

In [10]:
class Engine(SearchEngine):
    def search(query):
        results = dict()
        query = query.lower()
        #Abstract to vector
        query2vec=np.zeros((50,))
        query_tokenized=word_tokenize(query)
        for word in query_tokenized:
          if word not in stop_words:
            if word not in puncs:
              try:
                query2vec+=model[word]
              except:
                pass
        query2vec/=len(query_tokenized)
        
        for key, abstract_vec in abstract2vec.items():
            #round to 4th decimal
            cos_sim = round(dot(abstract_vec, query2vec)/(norm(abstract_vec)*norm(query2vec)), 6)*100
            results[key]=cos_sim
            
        return results

In [11]:
query = 'the structures phospholipids'

pprint(Engine.search(query))

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 63.2729,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 64.43480000000001,
 'Assembling Structurally Customizable Synthetic Carriers': 68.0523,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 53.5597,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 54.85,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 55.335,
 'Hollow Fiber Based Liquid Phase Microextraction': 62.502,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 56.2859,
 'Molecular Pharmaceuticals': 56.2779,
 'Multi Dimensional Visualization For The Morphology': 54.019099999999995,
 'Nanotechnology And Vaccine Development': 62.91949999999999,
 'Pegylation In Anti Cancer Therapy': 59.1595,
 'Pharmaceutical Care In A Patien

In [12]:
eva = Evaluator(corp, Engine)

In [13]:
pprint(eva.evaluate(method='top-N-results', key='abstract', num_queries=200, num_docs=50))

Input n: 3


  cos_sim = round(dot(abstract_vec, query2vec)/(norm(abstract_vec)*norm(query2vec)), 6)*100


0.5871052631578947


In [14]:
pprint(eva.evaluate(method='top-N-results', key='all', num_queries=200, num_docs=50))

Input n: 3


  cos_sim = round(dot(abstract_vec, query2vec)/(norm(abstract_vec)*norm(query2vec)), 6)*100


0.27175


In [17]:
Engine.search('Melanoma Treatment')

{'Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres': 73.81269999999999,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 76.52590000000001,
 'Phospolipids And Application': 59.61070000000001,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 63.37049999999999,
 'Assembling Structurally Customizable Synthetic Carriers': 62.585800000000006,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 75.82419999999999,
 'Redox Sensitive  Peg Shielded Carboxymethyl Pei': 67.4353,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 62.4396,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 68.78659999999999,
 'The Effect Of Pramlintide, An Antidiabetic Amylin Analogue On Angiogenesis Related Markers In Vitro': 72.394,
 'Pharmaceutical Care In A Patient With Recurr