# Cosine Similarity | TF-IDF Keyword Search






In [1]:
import re

import numpy as np
import pandas as pd

from utility import Corpus, Document, SearchEngine

In [2]:
# Build the corpus from the "search_list_txt/" directory (Corpus comes from the
# local `utility` module); the cell output lists the .txt files it picked up.
corp=Corpus("search_list_txt/")

search_list_txt\A Comprehensive Review Article On Isoprostanes As Biological Markers.txt
search_list_txt\Anti Toxoplasma Gondii Activity Of 5 Oxo Hexahydroquinoline Derivatives   Synthesis, In Vitro And In Vivo Evaluations, And Molecular Docking Analysis.txt
search_list_txt\Assembling Structurally Customizable Synthetic Carriers.txt
search_list_txt\Asymptomatic Carrier State  Acute Respiratory Disease.txt
search_list_txt\Covid-19 Pathogenesis, Cytokine Storm And Cytokine Growth Factor Reviews.txt
search_list_txt\Deamidation In Moxetumomab Pasudotox Leading To Conformational Change And Immunotoxin Activity Loss.txt
search_list_txt\Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder.txt
search_list_txt\Hollow Fiber Based Liquid Phase Microextraction.txt
search_list_txt\Molecular Immune Pathogenesis And Diagnosis Of Covid 19.txt
search_list_txt\Molecular Pharmaceuticals.txt
search_list_txt\Multi Dimensional Visualization For The Morphology.

In [3]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# English stop words from NLTK, held in a set for fast membership tests in preprocess().
stop_words = set(stopwords.words('english')) 
# Single WordNet lemmatizer instance, reused for every token in preprocess().
lemmatizer=WordNetLemmatizer()

In [4]:
# Ask the corpus for the 'abstract' section of every document. As the output
# below shows, the result is a dict mapping document title -> abstract text.
content_dict=corp.split_sections('abstract')
content_dict

{'A Comprehensive Review Article On Isoprostanes As Biological Markers': 'various obsessive procedures include free radical intervened oxidative anxiety. the elaboration of solid and nonintrusive strategies for the assessment of oxidative worry in human body is a standout amongst the most critical stridestowards perceiving the assortment of oxidative disorders apparently created by reactive oxygen species ros. lipidperoxidation is a standout amongst the most wellknown components related with oxidative anxiety and the estimationof lipid peroxidation items has been utilized to assess oxidative worry in vivo conditions. the estimation of conjugateddienes and lipid hydro peroxide while the evaluation of optional final results incorporates thiobarbituric acid reactivesubstances vaporous alkanes and prostaglandin f2like items named f2isoprostanes f2ips. as of late f2ipshave been viewed as the most significant precise and solid marker of oxidative worry in vivo and their evaluation issuggeste

In [5]:
# One {"title", "body"} record per document, in content_dict order.
content=[
    {"title": doc_title, "body": doc_body}
    for doc_title, doc_body in content_dict.items()
]


In [6]:
# Tabulate the records: one row per document with `title` and `body` columns.
content_df=pd.DataFrame(content)
content_df.head()

Unnamed: 0,title,body
0,A Comprehensive Review Article On Isoprostanes...,various obsessive procedures include free radi...
1,Anti Toxoplasma Gondii Activity Of 5 Oxo Hexah...,background and purpose the aim of this study w...
2,Assembling Structurally Customizable Synthetic...,this study demonstrates that our previously re...
3,Asymptomatic Carrier State Acute Respiratory ...,since the emergence of coronavirus disease 201...
4,"Covid-19 Pathogenesis, Cytokine Storm And Cyto...",the outbreak of the novel sarscov2 severe acut...


In [7]:

def preprocess(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps: lowercase, replace every run of non-alphanumeric characters with a
    single space, tokenise, drop English stop words, lemmatise what remains.

    Args:
        text: Raw document or query string.

    Returns:
        The cleaned, lower-case text with tokens joined by single spaces.
    """
    # Lowercase *before* filtering: the stop-word set is lower-case, so a
    # case-sensitive membership test on the raw text would let tokens such as
    # "The" through, and capitalised words would be lemmatised inconsistently.
    text=re.sub('[^A-Za-z0-9]+', ' ', text.lower())
    clean_text=[
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(clean_text)
            

In [8]:
# Clean every abstract; preprocess is passed directly instead of through a lambda wrapper.
content_df["body"]=content_df["body"].apply(preprocess)
content_df.head()

Unnamed: 0,title,body
0,A Comprehensive Review Article On Isoprostanes...,various obsessive procedure include free radic...
1,Anti Toxoplasma Gondii Activity Of 5 Oxo Hexah...,background purpose aim study evaluate vitro vi...
2,Assembling Structurally Customizable Synthetic...,study demonstrates previously reported polywra...
3,Asymptomatic Carrier State Acute Respiratory ...,since emergence coronavirus disease 2019 covid...
4,"Covid-19 Pathogenesis, Cytokine Storm And Cyto...",outbreak novel sarscov2 severe acute respirato...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Distinct tokens across all cleaned abstracts. The list is sorted so that the
# TF-IDF feature (column) order is the same on every run instead of following
# set iteration order, which changes with string-hash randomisation.
vocabulary=sorted(
    {token for doc_body in content_df.body for token in word_tokenize(doc_body)}
)

In [11]:
# Initialise the TF-IDF model over the fixed vocabulary and fit it on the
# cleaned abstracts (fit returns the vectorizer, so the two steps are chained).
tfidf = TfidfVectorizer(vocabulary=vocabulary).fit(content_df.body)

# Transform the abstracts into their TF-IDF matrix (one row per document).
tfidf_tran=tfidf.transform(content_df.body)

In [12]:
# Map each document title to its dense TF-IDF row vector.
doc_vec_array=tfidf_tran.toarray()
doc2vec=dict(zip(content_df.title, doc_vec_array))

In [13]:
def queryVec(query):
    """Turn a raw query string into a dense TF-IDF vector.

    The query is cleaned with ``preprocess`` and then transformed by the
    fitted ``tfidf`` vectorizer; the single resulting row is returned as a
    1-D array.
    """
    cleaned=preprocess(query)
    return tfidf.transform([cleaned]).toarray()[0]

In [14]:
def cosine_sim(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        callers can compare the score without triggering a divide warning.
    """
    norm_product = np.linalg.norm(a)*np.linalg.norm(b)
    # A query with no in-vocabulary terms (or an empty document) yields an
    # all-zero vector; guard the 0/0 division.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b)/norm_product

In [15]:
def search_results(query):
    """Score every indexed document against ``query``.

    Returns a dict mapping title -> cosine similarity that contains only the
    documents whose score is strictly positive, in ``doc2vec`` iteration order.
    """
    query_vec=queryVec(query)
    scored=((title, cosine_sim(vec, query_vec)) for title, vec in doc2vec.items())
    return {title: score for title, score in scored if score>0}

In [17]:
# Manual search: run one free-text query through search_results and display
# the resulting dict of matching title -> cosine score.
query="the samplesolution was pumped into the ttextractor through the two tconnectors while the acceptor phase was injected into the hollowﬁber"
results=search_results(query)
results

{'Development Of Muco Adhesive Orally Disintegrating Tablets Containing Tamarind Gum Coated Tea Powder': 0.0014648342290778157,
 'Hollow Fiber Based Liquid Phase Microextraction': 0.3237433570885144,
 'Molecular Pharmaceuticals': 0.010838760869464985}

In [19]:
class Engine(SearchEngine):
    """Adapter exposing the TF-IDF cosine search through the SearchEngine interface."""

    @staticmethod
    def search(query):
        """Return ``search_results(query)``: a dict of title -> cosine score.

        Declared static because the method takes no ``self``. Without the
        decorator it could only be called on the class (``Engine.search(q)``);
        as a static method it works on the class and on instances alike.
        """
        return search_results(query)
        

In [20]:
from utility import SearchEngine, Evaluator
from pprint import pprint
# The evaluator receives the corpus and the Engine class itself (not an instance).
eva = Evaluator(corp, Engine)

In [25]:
# Evaluation with key='abstract' (presumably queries are sampled from the abstracts — confirm in utility.Evaluator).
# NOTE(review): the run prompts for "Input n" (see output), so the printed score depends on that manual entry.
pprint(eva.evaluate(method='top-N-results', key='abstract', num_queries=50, num_docs=20))

Input n: 3
0.988


In [26]:
# Evaluation with key='all': queries are generated from the whole document rather than only the abstract.
# NOTE(review): the run prompts for "Input n" (see output), so the printed score depends on that manual entry.
pprint(eva.evaluate(method='top-N-results', key='all', num_queries=50, num_docs=20))

Input n: 3


  


0.5742857142857143
