# Cosine Similarity | Abstracts | Pretrained Model

 ⠀

 ⠀

# Loading Abstracts from File

### Import Required Libraries

In [1]:
import glob
import os
import re

In [2]:
working_dir = "search_list/Abstracts/"

In [3]:
def filebrowser(curr_dir):
    "Returns files with an extension"
    return [f for f in glob.glob(curr_dir + "*.txt")]

In [4]:
file_list = filebrowser(working_dir)
print(file_list)

['search_list/Abstracts/Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives.txt', 'search_list/Abstracts/pharmaceutical-care-in-a-patient-with-recurrent-infection-after-liver-transplant.txt', 'search_list/Abstracts/The effect of pramlintide, an antidiabetic amylin analogue on angiogenesis-related markers in vitro.txt', 'search_list/Abstracts/pharmacological-options-beyond-proton-pump-inhibitors-in-childrenwith-gastroesophageal-reflux-diseas.txt']


In [5]:
name_list = [i.replace(working_dir, '').replace('.txt', '') for i in file_list]
name_list

['Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives',
 'pharmaceutical-care-in-a-patient-with-recurrent-infection-after-liver-transplant',
 'The effect of pramlintide, an antidiabetic amylin analogue on angiogenesis-related markers in vitro',
 'pharmacological-options-beyond-proton-pump-inhibitors-in-childrenwith-gastroesophageal-reflux-diseas']

In [6]:
abstracts_list = []
contents = ''

for filepath in file_list:
    with open(filepath, 'r') as f:
        for line in f.readlines():
            line = line.replace('\n', ' ')
            contents += ''.join(e for e in line if (e.isalnum() or e==' '))
        abstracts_list.append(contents)
        f.close()

In [7]:
abstracts = dict()
for i in range(len(abstracts_list)):
    abstracts[name_list[i]] = abstracts_list[i]
    
abstracts

{'Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives': 'Abstract Background and purpose The aim of this study was to evaluate the in vitro and in vivo antiToxoplasma gondii T gondii effect of 5oxohexahydroquinoline compounds Moreover molecular docking study of the compounds into the active site of enoylacyl carrier protein reductase ENR as a necessary enzyme for the vitality of apicoplast was carried out Experimental approach A number of 5oxohexahydoquinoline derivatives Z1Z4 were synthesized The T gondii tachyzoites of RH strain were treated by different concentrations 164 μgmL of the compounds The viability of the encountered parasites with compounds was assessed using flow cytometry and propidium iodide PI staining Due to the high mortality effect of Z3 and Z4 in vitro their chemotherapy effect was assessed by inoculation of tachyzoites to four BALBc mice groups n  5 followed by the gavage of various concentrations of the compounds to the mice Molecular docking 

 ⠀

# Preprocess Abstracts

### Import Required Libraries

In [8]:
# numpy imports
import numpy as np
from numpy import dot
from numpy.linalg import norm

# nltk imports
import nltk
from nltk import punkt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# punctuation list
puncs="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"

### Remove Punctuations and Stopwords from Abstracts

In [9]:
stop_words = set(stopwords.words('english')) 

In [10]:
abstract_filtered = dict()

In [11]:
for key, abstract in abstracts.items():
  temp_list = []
  print(key,"\n")
  for word in word_tokenize(abstract):
    if word not in stop_words:
      if word not in puncs:
        temp_list.append(word)
  abstract_filtered[key] = temp_list

Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives 

pharmaceutical-care-in-a-patient-with-recurrent-infection-after-liver-transplant 

The effect of pramlintide, an antidiabetic amylin analogue on angiogenesis-related markers in vitro 

pharmacological-options-beyond-proton-pump-inhibitors-in-childrenwith-gastroesophageal-reflux-diseas 



In [12]:
abstract_filtered

{'Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives': ['Abstract',
  'Background',
  'purpose',
  'The',
  'aim',
  'study',
  'evaluate',
  'vitro',
  'vivo',
  'antiToxoplasma',
  'gondii',
  'T',
  'gondii',
  'effect',
  '5oxohexahydroquinoline',
  'compounds',
  'Moreover',
  'molecular',
  'docking',
  'study',
  'compounds',
  'active',
  'site',
  'enoylacyl',
  'carrier',
  'protein',
  'reductase',
  'ENR',
  'necessary',
  'enzyme',
  'vitality',
  'apicoplast',
  'carried',
  'Experimental',
  'approach',
  'A',
  'number',
  '5oxohexahydoquinoline',
  'derivatives',
  'Z1Z4',
  'synthesized',
  'The',
  'T',
  'gondii',
  'tachyzoites',
  'RH',
  'strain',
  'treated',
  'different',
  'concentrations',
  '164',
  'μgmL',
  'compounds',
  'The',
  'viability',
  'encountered',
  'parasites',
  'compounds',
  'assessed',
  'using',
  'flow',
  'cytometry',
  'propidium',
  'iodide',
  'PI',
  'staining',
  'Due',
  'high',
  'mortality',
  'effect',
  

 ⠀

# Word2Vec Search

### Import Required Libraries

In [13]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
import gensim.downloader as api
from pprint import pprint

### Load Model from Memory

In [14]:
model = api.load("glove-twitter-50")

### Convert Abstracts to Vectors

In [15]:
#Abstract to vector
abstract2vec=dict()
for key, abstract in abstract_filtered.items():
  centroid = np.zeros((50,))
  for word in abstract:
    try:
      centroid += model[word.lower()]
    except:
      pass
  centroid /= len(abstract)
  abstract2vec[key] = centroid

abstract2vec

{'Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives': array([-0.07997122,  0.1322542 , -0.28960934,  0.10028079,  0.22969557,
         0.16735991,  0.09946941, -0.48924301, -0.03658194, -0.3161375 ,
        -0.00695457, -0.19977862, -1.64909231,  0.23055069,  0.21412665,
        -0.08681255,  0.32984696, -0.01174029,  0.21621427, -0.12933005,
        -0.0781037 ,  0.15181748,  0.0524821 , -0.12120208, -0.32243147,
         0.48045451, -0.00462152,  0.24578823,  0.00185973,  0.32224417,
         0.29715275, -0.11596642,  0.13682656, -0.21628157,  0.09412573,
        -0.25494326, -0.06640304,  0.20673274,  0.0900704 , -0.10836769,
         0.28670992, -0.06118384,  0.05629857, -0.19056601,  0.03675359,
        -0.18041929, -0.12160913,  0.17665429, -0.09784039, -0.04430752]),
 'pharmaceutical-care-in-a-patient-with-recurrent-infection-after-liver-transplant': array([-0.11085061,  0.1674317 , -0.38085551,  0.08403651,  0.19086554,
         0.12107865,  0.13377114, -0

### Search for a Query

In [16]:
def search(query):
    results = dict()
    query = query.lower()
    #Abstract to vector
    query2vec=np.zeros((50,))
    query_tokenized=word_tokenize(query)
    for word in query_tokenized:
      if word not in stop_words:
        if word not in puncs:
          try:
            query2vec+=model[word]
          except:
            pass
    query2vec/=len(query_tokenized)
    return results, query2vec

In [17]:
query = 'the structures phospholipids'

results, query2vec = search(query)

In [18]:
for key, abstract_vec in abstract2vec.items():
  #round to 4th decimal
  cos_sim = round(dot(abstract_vec, query2vec)/(norm(abstract_vec)*norm(query2vec)), 6)*100
  results[key]=cos_sim
    
pprint(results)

{'Anti-Toxoplasma gondii activity of 5-oxo-hexahydroquinoline derivatives': 65.2364,
 'The effect of pramlintide, an antidiabetic amylin analogue on angiogenesis-related markers in vitro': 62.3069,
 'pharmaceutical-care-in-a-patient-with-recurrent-infection-after-liver-transplant': 61.968999999999994,
 'pharmacological-options-beyond-proton-pump-inhibitors-in-childrenwith-gastroesophageal-reflux-diseas': 62.164500000000004}
