# PubMed Pretrained Model

In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
#Import numpy for array calculation
import numpy as np

import re

#import nltk for string operations
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [2]:
model = KeyedVectors.load_word2vec_format('pubmed_model/pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin', binary=True)

In [3]:
model.most_similar("toxoplasma") #sanity check

[('gondii', 0.9499291181564331),
 ('neospora', 0.871677041053772),
 ('tgondii', 0.8004791736602783),
 ('caninum', 0.7926305532455444),
 ('trypanosoma', 0.7545153498649597),
 ('giardia', 0.7445007562637329),
 ('toxoplasmosis', 0.7416958808898926),
 ('encephalitozoon', 0.738080620765686),
 ('cruzi', 0.7333229780197144),
 ('besnoitia', 0.7209460735321045)]

In [4]:
from utility import Document, Corpus

In [5]:
corp=Corpus("search_list_txt/")

search_list_txt\A Comprehensive Review Article On Isoprostanes As Biological Markers.txt
search_list_txt\Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis.txt
search_list_txt\Assembling Structurally Customizable Synthetic Carriers.txt
search_list_txt\Asymptomatic Carrier State  Acute Respiratory Disease.txt
search_list_txt\Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews.txt
search_list_txt\Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss.txt
search_list_txt\Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder.txt
search_list_txt\Hollow Fiber Based Liquid Phase Microextraction.txt
search_list_txt\Molecular Immune Pathogenesis And Diagnosis Of Covid 19.txt
search_list_txt\Molecular Pharmaceuticals.txt
search_list_txt\Multi Dimensional Visualization For The Morphology.

In [6]:
#divide the document into sentences
document_sentences=corp.split_sentences("abstract")
document_sentences

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': ['various obsessive procedures include free radical intervened oxidative anxiety',
  ' the elaboration of solid and nonintrusive strategies for the assessment of oxidative worry in human body is a standout amongst the most critical stridestowards perceiving the assortment of oxidative disorders apparently created by reactive oxygen species ros',
  ' lipidperoxidation is a standout amongst the most wellknown components related with oxidative anxiety and the estimationof lipid peroxidation items has been utilized to assess oxidative worry in vivo conditions',
  ' the estimation of conjugateddienes and lipid hydro peroxide while the evaluation of optional final results incorporates thiobarbituric acid reactivesubstances vaporous alkanes and prostaglandin f2like items named f2isoprostanes f2ips',
  ' as of late f2ipshave been viewed as the most significant precise and solid marker of oxidative worry in vivo and their 

In [31]:
#Create title2vec and doc2vec dictionaries

doc2vec=dict()
title2vec=dict()

stop_words = set(stopwords.words('english')) 

for title, sentences in document_sentences.items():
    doc2vec[title]=[]
    for sentence in sentences:
        sentence=re.sub('[^A-Za-z]+', ' ', sentence)
        sentence_centroid=np.zeros((200,))
        for word in word_tokenize(sentence.lower()):
            if word not in stop_words:
                try:
                    sentence_centroid+=model[word]
                except:
                    #print(word)
                    pass
        sentence_centroid/=len(word_tokenize(sentence))
        doc2vec[title].append(sentence_centroid)
    
    title_centroid=np.zeros((200,))
    title=re.sub('[^A-Za-z]+', ' ', title)
    for word in word_tokenize(title.lower()):
        if word not in stop_words:
            try:
                title_centroid+=model[word]
            except:
                pass
    title_centroid/=len(word_tokenize(title))
    title2vec[title]=title_centroid
    



In [43]:
query="the elaboration of solid and nonintrusive strategies for the assessment of oxidative worry in human body is a standout amongst the most critical"

#preprocess the query
query=re.sub('[^A-Za-z]+', ' ', query)
query=query.lower()


#calculate the query vector
query_vec=np.zeros((200,))

query_len=0
for word in word_tokenize(query):
    if word not in stop_words:
        try:
            query_vec+=model[word]
            query_len+=1
        except:
            pass
query_vec/=query_len

In [44]:
#title search
result_list=dict()

for key, vector in title2vec.items():
    result_list[key]=0
    cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))
    result_list[key]+=cos_sim
            
result_list

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 0.7335513396491017,
 'Anti Toxoplasma Gondii Activity Of Oxo Hexahydroquinoline Derivatives Synthesis In Vitro And In Vivo Evaluations And Molecular Docking Analysis': 0.6324489041323537,
 'Assembling Structurally Customizable Synthetic Carriers': 0.5461879892341388,
 'Asymptomatic Carrier State Acute Respiratory Disease': 0.6429205729254008,
 'Covid Pathogenesis Cytokine Storm And Cytokine Growth Factor Reviews': 0.6482158786750304,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 0.5742641966880455,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 0.5079950392605406,
 'Hollow Fiber Based Liquid Phase Microextraction': 0.5526412592855859,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid ': 0.6277463775774065,
 'Molecular Pharmaceuticals': 0.5469044082106406,
 'Multi Dimensional Visualization For Th

In [45]:
#doc search
result_list=dict()

for key, vectors in doc2vec.items():
    result_list[key]=0
    for vector in vectors:
        if np.isnan(vector).any():
            pass
        else:
            cos_sim = np.dot(vector, query_vec)/(np.linalg.norm(vector)*np.linalg.norm(query_vec))
            if result_list[key]<cos_sim:
                result_list[key]=cos_sim
            
result_list

  # Remove the CWD from sys.path while we load stuff.


{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 0.9319909680040831,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 0.7522021791154637,
 'Assembling Structurally Customizable Synthetic Carriers': 0.6248527251745238,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 0.7528743240815716,
 'Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews': 0.8225176405412185,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 0.8136102194857758,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 0.7508159469605591,
 'Hollow Fiber Based Liquid Phase Microextraction': 0.789198846721704,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 0.818589551072698,
 'Molecular Pharmaceuticals': 0.8426978927128493,
 'Multi Dimensional Visualiza