# Cosine Similarity | Sentences of entire Document | Pubmed Model

  ⠀

  ⠀

# Loading the Documents

### Load Utility Libraries

In [1]:
from utility import Corpus, Document

In [2]:
corp = Corpus('search_list_txt/')

search_list_txt/Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres.txt
search_list_txt/Asymptomatic Carrier State  Acute Respiratory Disease.txt
search_list_txt/Phospolipids And Application.txt
search_list_txt/Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss.txt
search_list_txt/Assembling Structurally Customizable Synthetic Carriers.txt
search_list_txt/Molecular Immune Pathogenesis And Diagnosis Of Covid 19.txt
search_list_txt/Redox Sensitive  Peg Shielded Carboxymethyl Pei.txt
search_list_txt/Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder.txt
search_list_txt/Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis.txt
search_list_txt/The Effect Of Pramlintide, An Antidiabetic Amylin Analogue On Angiogenesis Related Markers In Vitro.txt
search_list_txt/Pharmaceutical Care In A Patie

In [3]:
documents = corp.split_sentences()
print(documents)

{'Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres': ['', 'transarterial chemoembolization tace has been widely introduced to treat hepatocellularcarcinoma hcc especially for unresectable patients for decades', ' however tace evokesan angiogenic response due to the secretion of vascular endothelial growth factor vegfresulting in the formation of new blood vessels and eventually tumor recurrence', ' thus weaimed to develop regorafenib regoloaded poly lactidecoglycolide plga microspheresthat enabled localized and sustained drug delivery to limit proangiogenic responses', 'microspheres transarterial chemoembolization', 'transarterial chemoembolization tace are broadly usedagainst hepatocellular carcinoma hcc especially forinoperable patients for decades 12', ' conventional taceusually involves intraarterial administration of contrastmedium and chemotherapeutic drugs via a catheter underimage guidance followed by embolization of the tumorfeeding artery with embolic agents e', 'g

  ⠀

# Model

In [4]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [5]:
model = KeyedVectors.load_word2vec_format('pubmed_model/pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin', binary=True)

  ⠀

# Preprocessing and Querying

### Library Imports

In [6]:
# numpy imports
import numpy as np
from numpy import dot
from numpy.linalg import norm

import re

# nltk imports
import nltk
from nltk import punkt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# punctuation list
puncs="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"

### Create Word2vec representation for all sentences in documents

In [7]:
doc2vec=dict()

stop_words = set(stopwords.words('english')) 

for title, sentences in documents.items():
    doc2vec[title]=[]
    for sentence in sentences:
        sentence=re.sub('[^A-Za-z]+', ' ', sentence)
        sentence_centroid=np.zeros((200,))
        for word in word_tokenize(sentence.lower()):
            if word not in stop_words:
                try:
                    sentence_centroid+=model[word]
                except:
                    pass
        sentence_centroid/=len(word_tokenize(sentence))
        doc2vec[title].append(sentence_centroid)

  sentence_centroid/=len(word_tokenize(sentence))


## Create Engine

### Import Utilities

In [8]:
from utility import SearchEngine, Evaluator
from pprint import pprint

In [9]:
class Engine(SearchEngine):
    
    def search(query):
        
        query_len=0
        query_vec=np.zeros((200,))
        result_list=dict()

        #preprocess the query
        query=re.sub('[^A-Za-z]+', ' ', query).lower()

        for word in word_tokenize(query):
            if word not in stop_words:
                try:
                    query_vec+=model[word]
                    query_len+=1
                except:
                    pass
                
        query_vec/=query_len
        
        for key, vectors in doc2vec.items():
            result_list[key]=0
            for vector in vectors:
                if np.isnan(vector).any():
                    pass
                else:
                    cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))
                    if result_list[key]<cos_sim:
                        result_list[key]=cos_sim
            
        return result_list

In [10]:
query = 'the structures phospholipids'

pprint(Engine.search(query))

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 0.7387905746822652,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 0.6220698997991283,
 'Assembling Structurally Customizable Synthetic Carriers': 0.6462379131063613,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 0.47532684149702054,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 0.6388677553366081,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 0.666865154903978,
 'Hollow Fiber Based Liquid Phase Microextraction': 0.6316439155642614,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 0.6174888947568168,
 'Molecular Pharmaceuticals': 0.5002605603677398,
 'Multi Dimensional Visualization For The Morphology': 0.5836445294191333,
 'Nanotechnology And Vaccine Development': 0.7594

  cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))


# Evaluation

In [11]:
eva = Evaluator(corp, Engine)

In [12]:
pprint(eva.evaluate(method='top-N-results', key='abstract', num_queries=50, num_docs=20))

Input n: 3


  cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))


KeyboardInterrupt: 

In [None]:
pprint(eva.evaluate(method='top-N-results', key='all', num_queries=50, num_docs=20))

In [14]:
Engine.search('Amalgamation of Polyethelene in Melanoma Treatments')

  cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))


{'Regorafenib Loaded Poly  Lactide Co Glycolide  Microspheres': 0.6922746800861954,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 0.6317137505293444,
 'Phospolipids And Application': 0.677385025386919,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 0.6571864569116052,
 'Assembling Structurally Customizable Synthetic Carriers': 0.6598964716500801,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 0.641591825775432,
 'Redox Sensitive  Peg Shielded Carboxymethyl Pei': 0.6638550285676842,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 0.6516655114135439,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 0.6184133903187177,
 'The Effect Of Pramlintide, An Antidiabetic Amylin Analogue On Angiogenesis Related Markers In Vitro': 0.6432585736794533,
 'Pharma