# Model Creation using Common English Corpus

In [1]:
#Import gensim for model training
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
#Import numpy for array calculation
import numpy as np

import re

#import nltk for string operations
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [17]:
#Fetch the text8 corpus through gensim's downloader API
dataset = api.load("text8")
#Fit a Word2Vec model on that corpus; no hyperparameters are passed, so gensim's defaults apply
model = Word2Vec(sentences=dataset)

[('gondii', 0.9499291181564331),
 ('neospora', 0.871677041053772),
 ('tgondii', 0.8004791736602783),
 ('caninum', 0.7926305532455444),
 ('trypanosoma', 0.7545153498649597),
 ('giardia', 0.7445007562637329),
 ('toxoplasmosis', 0.7416958808898926),
 ('encephalitozoon', 0.738080620765686),
 ('cruzi', 0.7333229780197144),
 ('besnoitia', 0.7209460735321045)]

In [3]:
#Lemmatizer sanity check: print the lemma of a few sample words, one per line
lemmatizer = WordNetLemmatizer()
for sample_word in ("breaks", "corpora", "anti-toxoplasma"):
    print(lemmatizer.lemmatize(sample_word))

break
corpus
anti-toxoplasma


In [18]:
#import utility.py
from utility import Document, Corpus, Evaluator

In [19]:
#Build a Corpus (imported from utility.py) over the 'search_list_txt/' directory
#NOTE(review): the file paths printed below presumably come from Corpus listing each file it reads - confirm in utility.py
corp = Corpus('search_list_txt/')

search_list_txt\A Comprehensive Review Article On Isoprostanes As Biological Markers.txt
search_list_txt\Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis.txt
search_list_txt\Assembling Structurally Customizable Synthetic Carriers.txt
search_list_txt\Asymptomatic Carrier State  Acute Respiratory Disease.txt
search_list_txt\Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews.txt
search_list_txt\Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss.txt
search_list_txt\Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder.txt
search_list_txt\Hollow Fiber Based Liquid Phase Microextraction.txt
search_list_txt\Molecular Immune Pathogenesis And Diagnosis Of Covid 19.txt
search_list_txt\Molecular Pharmaceuticals.txt
search_list_txt\Multi Dimensional Visualization For The Morphology.

In [20]:
#Divide each document into sentences
#NOTE(review): "abstract" presumably selects which part of each document is split - confirm in utility.py
#The following cells iterate the result as a {title: [sentence, ...]} mapping
document_sentences=corp.split_sentences("abstract")
document_sentences

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': ['various obsessive procedures include free radical intervened oxidative anxiety',
  ' the elaboration of solid and nonintrusive strategies for the assessment of oxidative worry in human body is a standout amongst the most critical stridestowards perceiving the assortment of oxidative disorders apparently created by reactive oxygen species ros',
  ' lipidperoxidation is a standout amongst the most wellknown components related with oxidative anxiety and the estimationof lipid peroxidation items has been utilized to assess oxidative worry in vivo conditions',
  ' the estimation of conjugateddienes and lipid hydro peroxide while the evaluation of optional final results incorporates thiobarbituric acid reactivesubstances vaporous alkanes and prostaglandin f2like items named f2isoprostanes f2ips',
  ' as of late f2ipshave been viewed as the most significant precise and solid marker of oxidative worry in vivo and their 

In [23]:
#Convert document sentences to vectors.
#Each sentence becomes the centroid (arithmetic mean) of the word vectors of its
#tokens that are not stop words and are present in the model vocabulary.
#Result: doc2vec maps every document title to a list of centroid arrays.
doc2vec=dict()
stop_words = set(stopwords.words('english'))
for title, sentences in document_sentences.items():
    centroids=[]
    for sentence in sentences:
        #Keep letters only and lower-case before tokenizing
        cleaned=re.sub('[^A-Za-z]+', ' ', sentence).lower()
        #Look words up through model.wv: indexing the Word2Vec object itself is
        #deprecated in gensim 3.x and removed in gensim 4, where the old bare
        #"except: pass" would have hidden the failure and produced only NaN vectors.
        #Out-of-vocabulary words are skipped explicitly instead of via an exception.
        word_vectors=[model.wv[word] for word in word_tokenize(cleaned)
                      if word not in stop_words and word in model.wv]
        #A sentence without any known word has no centroid (it used to become a
        #0/0 = NaN vector plus a RuntimeWarning), so it is left out
        if not word_vectors:
            continue
        #float64 mean, matching the dtype of the previous np.zeros accumulator;
        #the dimensionality now follows the model instead of a hard-coded 100
        centroids.append(np.mean(word_vectors, axis=0, dtype=np.float64))
    doc2vec[title]=centroids
        



In [24]:
#Inspect the {title: [sentence vector, ...]} mapping built in the previous cell
#NOTE(review): this displays every vector of every document; a smaller preview would keep the notebook readable
doc2vec

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': [array([-0.31637745,  0.18959251, -0.19238014,  0.18669024,  0.07235393,
         -0.11872614, -0.21103916,  0.17869606,  0.14634423,  0.01279145,
         -0.12892856, -0.10964514,  0.12290907,  0.0838518 , -0.08449508,
         -0.17676044, -0.03356208,  0.13816269,  0.09119145, -0.19074848,
          0.11193106, -0.04318123, -0.00640964, -0.17453691, -0.18604553,
          0.06406721, -0.19994929,  0.2004592 ,  0.21317515, -0.38763837,
          0.00466437,  0.11574122,  0.1485781 , -0.4083275 , -0.15217359,
         -0.27107999, -0.05506101,  0.03774374, -0.07648942, -0.11712409,
          0.02810794,  0.00134096, -0.29783392,  0.13293778, -0.23923175,
          0.04716731,  0.04816068,  0.27062923, -0.04009685, -0.12878531,
         -0.11806111, -0.13774788, -0.18397922, -0.20664992,  0.08201761,
          0.10017064, -0.15371056,  0.14260386, -0.03388276, -0.03201589,
         -0.25812514,  0.04803418,  0.17

In [27]:
query="Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews"
#preprocess the query: keep letters only and lower-case it
query=re.sub('[^A-Za-z]+', ' ', query)
query=query.lower()

#calculate the query vector as the centroid of the vectors of its words that are
#not stop words and are present in the model vocabulary.
#Words are looked up through model.wv because indexing the Word2Vec object itself
#is deprecated in gensim 3.x and removed in gensim 4.
query_word_vectors=[model.wv[word] for word in query.split()
                    if word not in stop_words and word in model.wv]
if query_word_vectors:
    #Average over the number of vectors actually summed; the previous code divided
    #by len(query), which is the character count of the query string
    query_vec=np.mean(query_word_vectors, axis=0, dtype=np.float64)
else:
    #No known word in the query: keep an all-zero vector of the model's dimension.
    #Cosine similarity against it is undefined, so check for this case before scoring.
    query_vec=np.zeros((model.wv.vector_size,))

In [28]:
#Score every document against the query: add up the cosine similarity between the
#query vector and each sentence vector of the document, ignoring vectors that contain NaN.
#NOTE(review): the score is a sum, not a mean, so it grows with the number of sentences in a document
result_list={}
#The query norm does not change between sentences, so compute it once
query_norm=np.linalg.norm(query_vec)

for title, sentence_vectors in doc2vec.items():
    result_list[title]=sum(
        np.dot(sentence_vec, query_vec)/(np.linalg.norm(sentence_vec)*query_norm)
        for sentence_vec in sentence_vectors
        if not np.isnan(sentence_vec).any()
    )

result_list

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 3.771618080462042,
 'Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis': 9.739429545402702,
 'Assembling Structurally Customizable Synthetic Carriers': 0.9500752322112419,
 'Asymptomatic Carrier State  Acute Respiratory Disease': 5.805466339255378,
 'Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews': 3.4448208641216453,
 'Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss': 5.841533118231497,
 'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 147.89401501839865,
 'Hollow Fiber Based Liquid Phase Microextraction': 189.08292907043915,
 'Molecular Immune Pathogenesis And Diagnosis Of Covid 19': 1.797532456426333,
 'Molecular Pharmaceuticals': 11.445546638815426,
 'Multi Dimensional Visualizatio