# Search scholarly articles with text similarity
* How to run?
     * First Run all Cells
     * Next Change query and run all cells below from that cell.

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time 
import datetime
import numpy as np
import ipywidgets
import ipyleaflet
from IPython.display import display, HTML

__import work_data and work_metadata__

In [4]:
work_data = pd.read_csv("work_data.csv")
work_metadata= pd.read_csv("work_metadata.csv")

In [5]:
work_data.head()

Unnamed: 0,text,word_count,unique_words_count,lang,label
0,surfactant protein d pulmonary host defense su...,3903,1060,en,2
1,heme oxygenase carbon monoxide pulmonary medic...,3480,1019,en,2
2,functional genomic functional immunomic new ch...,3851,1051,en,2
3,model base design growth attenuated virus live...,4262,1029,en,2
4,object simulation model model hypothetical dis...,3915,1111,en,0


In [6]:
work_metadata.head()

Unnamed: 0,cord_uid,sha,title,doi,pmcid,pubmed_id,abstract,publish_time,authors,journal,pmc_json_files,url,full_literature
0,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972.0,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Surfactant protein-D (SP-D) is a member of the...
1,8qnrcgnk,faaf1022ccfe93b032c5608097a53543ba24aedb,Heme oxygenase-1 and carbon monoxide in pulmon...,10.1186/1465-9921-4-7,PMC193681,12964953.0,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",Respir Res,document_parses/pmc_json/PMC193681.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,The heme oxygenase-1/carbon monoxide (HO-1/CO)...
2,d3cko4j2,40500cd7ae5b4e116e8b13e5408e7dfd96d43ab4,From Functional Genomics to Functional Immunom...,10.1371/journal.pcbi.0020081,PMC1523295,16863395.0,The development of DNA microarray technology a...,2006-07-28,"Braga-Neto, Ulisses M; Marques, Ernesto T. A",PLoS Comput Biol,document_parses/pmc_json/PMC1523295.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Functional genomics was made possible by the s...
3,j3p1u80n,4c84dbfd01f7b2009ebed54376da8afcbcf1ec64,Model-Based Design of Growth-Attenuated Viruses,10.1371/journal.pcbi.0020116,PMC1557587,16948530.0,Live-virus vaccines activate both humoral and ...,2006-09-01,"Lim, Kwang-il; Lang, Tobias; Lam, Vy; Yin, John",PLoS Comput Biol,document_parses/pmc_json/PMC1557587.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Infections caused by viruses persistently thre...
4,pk7pnmlo,61d9a0fe39f4e845c44a06787de6f5f033b998a3,An object simulation model for modeling hypoth...,10.1186/1742-4682-3-32,PMC1570461,16928271.0,"BACKGROUND: EpiFlex is a flexible, easy to use...",2006-08-23,"Hanley, Brian",Theor Biol Med Model,document_parses/pmc_json/PMC1570461.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,This work is related to several threads within...


In [7]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(work_data['text'].values)

In [8]:
X.shape

(10126, 5000)

In [9]:
words = vectorizer.get_feature_names()

In [10]:
len(words)

5000

In [1]:
#extra queries
# nucleolus defence mechanism and immune system

# Input Query and RUN 

In [12]:
q = "pulmonary"

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
import matplotlib.pyplot as plt

__Fit the new text to the TFIDF matrix__

In [14]:
query = TfidfVectorizer().fit(words)

In [15]:
query =  query.transform([q])

In [16]:
query

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

# Calculate cosine similarity for whole datasets and get the result 
# (This appraoch used for research paper)

In [17]:
cosine_similarities_all_data = cosine_similarity(query,X).flatten()

In [18]:
len(cosine_similarities_all_data)

10126

In [19]:
cosine_similarity_all_data = pd.DataFrame()
cosine_similarity_all_data['value'] = cosine_similarities_all_data

__Cosine Similarity for top 5 results__

In [20]:
cosine_similarity_all_data.nlargest(5, 'value')

Unnamed: 0,value
1480,0.511002
1721,0.458348
2093,0.423798
2133,0.390497
1756,0.373095


In [21]:
related_documents_1 = cosine_similarity_all_data.nlargest(100, 'value')

In [22]:
related_documents_1.index

Int64Index([1480, 1721, 2093, 2133, 1756, 1379, 8049, 6433, 1543, 8129, 1740,
            2149, 4168, 2366, 3584, 1412, 1883, 1810,   40, 2201, 1439, 1542,
            1942, 9527, 7870, 1946, 6915, 5315, 6569, 6534, 1811, 8705, 1409,
            1400, 2956, 4985,  828, 6722, 1925, 1958, 4953, 1392,  383, 8775,
            7070, 4738, 8915, 2217, 1937, 5780, 4496,  815, 2216, 8200, 9333,
            2097, 2233, 3615, 1665, 1998, 5295, 2290, 6358, 1373, 8191,  822,
            1627, 1891, 8099, 1463, 1087, 4474, 8315,  747, 9799, 1633, 1893,
            5416, 7770, 6538, 1986, 1547, 1728, 7773, 4264, 9004, 2501,  885,
            1975, 1515, 3543, 7063,  829, 4969, 6345, 4536, 2022, 7179,   14,
            1451],
           dtype='int64')

In [23]:
related_documents_1.shape

(100, 1)

In [24]:
work_data.loc[related_documents_1.index]

Unnamed: 0,text,word_count,unique_words_count,lang,label
1480,pulmonary complication malignancy blood marrow...,3261,1110,en,2
1721,select disorder respiratory system respiration...,3207,1201,en,2
2093,pleuropulmonary change induce drug patient hem...,8808,1879,en,2
2133,pulmonary effect antineoplastic therapy pulmon...,5359,1505,en,2
1756,sudden death pulmonary cause chapter seek surv...,3348,1199,en,2
...,...,...,...,...,...
4536,sar cov possible connection er ace rage focus ...,3513,1030,en,2
2022,pulmonary manifestation predominantly antibody...,5768,1462,en,2
7179,carbon black nanoparticle induce biphasic gene...,4992,1224,en,1
14,animal model acute lung injury acute lung inju...,6293,1403,en,2


In [25]:
result_data_1 = work_metadata[["title","authors","url","journal","publish_time"]]

In [26]:
result_data_1 = result_data_1.loc[related_documents_1.index]

__Final Results__

In [27]:
result_data_1

Unnamed: 0,title,authors,url,journal,publish_time
1480,Pulmonary Complications of Malignancies and Bl...,"Jayasuriya, Geshani; Lin, Beryl; Keogh, Steven...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Pulmonary Complications of Non-Pulmonary Pedia...,2017-10-24
1721,Selected Disorders of the Respiratory System,"Howlett, Bethany M.; Coleman, George C.; Hoffm...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Family Medicine,2016-02-17
2093,Pleuropulmonary Changes Induced by Drugs in Pa...,"Camus, Philippe",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Pulmonary Involvement in Patients with Hematol...,2010-08-19
2133,Pulmonary Effects of Antineoplastic Therapy,"Dhakal, Sughosh; Weiner, Daniel; Schwartz, Cin...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Survivors of Childhood and Adolescent Cancer,2015-04-13
1756,Sudden Death from Pulmonary Causes,"Cunningham, Kris S.; Pollanen, Michael S.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Forensic Pathology Reviews,2011-07-16
...,...,...,...,...,...
4536,"SARS‐CoV‐2 and the possible connection to ERs,...","Stilhano, Roberta Sessa; Costa, Angelica Jardi...",https://doi.org/10.1096/fj.202001394rr; https:...,FASEB J,2020-09-23
2022,Pulmonary Manifestations of Predominantly Anti...,"Saghazadeh, Amene; Rezaei, Nima",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Pulmonary Manifestations of Primary Immunodefi...,2019-01-01
7179,Carbon black nanoparticles induce biphasic gen...,"Husain, Mainul; Kyjovska, Zdenka O.; Bourdon-L...",https://www.ncbi.nlm.nih.gov/pubmed/26551751/;...,Toxicol Appl Pharmacol,2015-12-15
14,Animal models of acute lung injury,"Matute-Bello, Gustavo; Frevert, Charles W.; Ma...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Am J Physiol Lung Cell Mol Physiol,2008-09-08


# Final Search Results, Top 100 Results

In [28]:
def make_clickable(val):
    check = isinstance(val, float)
    if(check):
        return val
    else:
        val = val.split(";")
        return '<a href="{}">{}</a>'.format(val[0],val[0])

In [29]:
def f(code : str):
    if(code == 'All'):
        return result_data_1.style.format(make_clickable, na_rep=None)
    else:
        return result_data_1[result_data_1['journal'] == code].style.format(make_clickable)
    
    
available_codes_1 = ['All']  

for code in pd.unique(result_data_1['journal']):
    available_codes_1.append(code)
widget_codes_1 = ipywidgets.Dropdown(
    options=[code for code in available_codes_1],
    value=available_codes_1[0],
    description='Journal:',
)
ipywidgets.interact(f, code=widget_codes_1)

Unnamed: 0,title,authors,url,journal,publish_time
1480,Pulmonary Complications of Malignancies and Blood and Marrow Transplantation,"Jayasuriya, Geshani",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7120544/,Pulmonary Complications of Non-Pulmonary Pediatric Disorders,2017-10-24
1721,Selected Disorders of the Respiratory System,"Howlett, Bethany M.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7121868/,Family Medicine,2016-02-17
2093,Pleuropulmonary Changes Induced by Drugs in Patients with Hematologic Diseases,"Camus, Philippe",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7123804/,Pulmonary Involvement in Patients with Hematological Malignancies,2010-08-19
2133,Pulmonary Effects of Antineoplastic Therapy,"Dhakal, Sughosh",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7124061/,Survivors of Childhood and Adolescent Cancer,2015-04-13
1756,Sudden Death from Pulmonary Causes,"Cunningham, Kris S.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7122050/,Forensic Pathology Reviews,2011-07-16
1379,Alveolar Hemorrhage,"Wells, Jason",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7119931/,Orphan Lung Diseases,2014-12-11
8049,Novel insights on the pulmonary vascular consequences of COVID-19,"Potus, François",https://www.ncbi.nlm.nih.gov/pubmed/32551862/,Am J Physiol Lung Cell Mol Physiol,2020-08-01
6433,Diagnosis and treatment of pulmonary chronic GVHD: report from the consensus conference on clinical practice in chronic GVHD,"Hildebrandt, G C",https://doi.org/10.1038/bmt.2011.35,Bone Marrow Transplant,2011-03-28
1543,Care of the Postoperative Pulmonary Resection Patient,"Kuckelman, John",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7120963/,Surgical Critical Care Therapy,2018-05-04
8129,COVID-19: The Potential Treatment of Pulmonary Fibrosis Associated with SARS-CoV-2 Infection,"Lechowicz, Kacper",https://www.ncbi.nlm.nih.gov/pubmed/32575380/,J Clin Med,2020-06-19


<function __main__.f(code: str)>

# Not part of Paper (Just for exploration)

# Calculate cosine Similarity
   * __Calculate cosine similarity for the datasets for the respective cluster to which query belongs to__
       * __Cluster for the new query is predicted by SVM model__

__Load SVM classification Model__

In [30]:
from joblib import dump, load
svm_clf = load('svm_clf_classification_model.joblib') 

__Cosine Similarity for specific cluster__
   * __Predict which cluster query belongs to__

In [31]:
cluster_pred = svm_clf.predict(query.toarray())

In [32]:
cluster_pred

array([2], dtype=int32)

In [33]:
work_data[work_data['label']==2]

Unnamed: 0,text,word_count,unique_words_count,lang,label
0,surfactant protein d pulmonary host defense su...,3903,1060,en,2
1,heme oxygenase carbon monoxide pulmonary medic...,3480,1019,en,2
2,functional genomic functional immunomic new ch...,3851,1051,en,2
3,model base design growth attenuated virus live...,4262,1029,en,2
7,hairpin structure utr dna polymerase mrna act ...,3744,1081,en,2
...,...,...,...,...,...
10112,nf b dependent independent transcriptome chrom...,4750,1011,en,2
10114,management infection patient kidney transplant...,3589,1193,en,2
10116,comparative genomic analysis illuminate distin...,3971,1112,en,2
10121,tmt base quantitative proteomics analysis reve...,5750,1180,en,2


In [34]:
index = work_data[work_data['label']==2].index

In [35]:
index

Int64Index([    0,     1,     2,     3,     7,    11,    12,    13,    14,
               15,
            ...
            10101, 10102, 10109, 10110, 10111, 10112, 10114, 10116, 10121,
            10123],
           dtype='int64', length=5239)

In [36]:
X_new = X[index]

In [37]:
X_new

<5239x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 4893590 stored elements in Compressed Sparse Row format>

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
cosine_similarities_new = cosine_similarity(query,X_new).flatten()

In [40]:
len(cosine_similarities_new)

5239

In [41]:
cosine_similarity_new = pd.DataFrame()
cosine_similarity_new['value'] = cosine_similarities_new
cosine_similarity_new['index'] = index
cosine_similarity_new = cosine_similarity_new.set_index('index')

In [43]:
cosine_similarity_new.nlargest(5, 'value')

Unnamed: 0_level_0,value
index,Unnamed: 1_level_1
1480,0.511002
1721,0.458348
2093,0.423798
2133,0.390497
1756,0.373095


In [44]:
related_documents_new = cosine_similarity_new.nlargest(100, 'value')

In [45]:
related_documents_new

Unnamed: 0_level_0,value
index,Unnamed: 1_level_1
1480,0.511002
1721,0.458348
2093,0.423798
2133,0.390497
1756,0.373095
...,...
8748,0.123577
9349,0.123266
7201,0.121611
3001,0.120186


In [46]:
related_documents_new.index

Int64Index([1480, 1721, 2093, 2133, 1756, 1379, 8049, 6433, 1543, 8129, 2149,
            4168, 2366, 1883, 1810,   40, 2201, 1439, 1542, 1942, 9527, 7870,
            1946, 6915, 5315, 6569, 1811, 8705, 1409, 2956, 4985,  828, 1925,
            1958,  383, 8775, 7070, 4738, 8915, 2217, 1937, 5780, 4496,  815,
            2216, 8200, 9333, 2097, 2233, 1665, 1998, 5295, 2290, 6358, 1373,
            8191,  822, 1627, 1891, 8099, 1463, 1087, 4474, 8315,  747, 9799,
            1893, 5416, 7770, 6538, 1986, 1547, 1728, 7773, 4264, 9004, 2501,
             885, 1515, 3543, 7063, 4969, 4536, 2022,   14, 1451, 1859, 2135,
            2434,  901, 1621, 8871, 6989, 4667, 3781, 8748, 9349, 7201, 3001,
            7990],
           dtype='int64', name='index')

In [47]:
work_data.loc[related_documents_new.index]

Unnamed: 0_level_0,text,word_count,unique_words_count,lang,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1480,pulmonary complication malignancy blood marrow...,3261,1110,en,2
1721,select disorder respiratory system respiration...,3207,1201,en,2
2093,pleuropulmonary change induce drug patient hem...,8808,1879,en,2
2133,pulmonary effect antineoplastic therapy pulmon...,5359,1505,en,2
1756,sudden death pulmonary cause chapter seek surv...,3348,1199,en,2
...,...,...,...,...,...
8748,metabolic handbook covid pandemic infectious d...,5568,1239,en,2
9349,advanced pulmonary cardiac support covid patie...,3534,1281,en,2
7201,respiratory medicine upper respiratory infecti...,7173,2310,en,2
3001,formation lamellar body like structure initiat...,3090,1119,en,2


In [48]:
result_data = work_metadata[["title","authors","url","journal","publish_time"]]

In [49]:
result_data = result_data.loc[related_documents_new.index]

In [50]:
result_data

Unnamed: 0_level_0,title,authors,url,journal,publish_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1480,Pulmonary Complications of Malignancies and Bl...,"Jayasuriya, Geshani; Lin, Beryl; Keogh, Steven...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Pulmonary Complications of Non-Pulmonary Pedia...,2017-10-24
1721,Selected Disorders of the Respiratory System,"Howlett, Bethany M.; Coleman, George C.; Hoffm...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Family Medicine,2016-02-17
2093,Pleuropulmonary Changes Induced by Drugs in Pa...,"Camus, Philippe",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Pulmonary Involvement in Patients with Hematol...,2010-08-19
2133,Pulmonary Effects of Antineoplastic Therapy,"Dhakal, Sughosh; Weiner, Daniel; Schwartz, Cin...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Survivors of Childhood and Adolescent Cancer,2015-04-13
1756,Sudden Death from Pulmonary Causes,"Cunningham, Kris S.; Pollanen, Michael S.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Forensic Pathology Reviews,2011-07-16
...,...,...,...,...,...
8748,A metabolic handbook for the COVID-19 pandemic,"Ayres, Janelle S.",https://www.ncbi.nlm.nih.gov/pubmed/32694793/;...,Nat Metab,2020-06-30
9349,Advanced Pulmonary and Cardiac Support of COVI...,"Rajagopal, Keshava; Keller, Steven P.; Akkanti...",https://www.ncbi.nlm.nih.gov/pubmed/32358232/;...,ASAIO J,2020-05-11
7201,Respiratory medicine,"Scully, Crispian",https://www.sciencedirect.com/science/article/...,Scully's Medical Problems in Dentistry,2014-06-25
3001,Formation of lamellar body-like structure may ...,"Park, Eun-Jung; Seong, Eunsol; Kang, Min-Sung;...",https://api.elsevier.com/content/article/pii/S...,Toxicol Appl Pharmacol,2020-08-05


In [51]:
def make_clickable(val):
    check = isinstance(val, float)
    if(check):
        return val
    else:
        val = val.split(";")
        return '<a href="{}">{}</a>'.format(val[0],val[0])

In [52]:
def f(code : str):
    if(code == 'All'):
        return result_data.style.format(make_clickable)
    else:
        return result_data[result_data['journal'] == code].style.format(make_clickable)
    
    
available_codes = ['All']  

for code in pd.unique(result_data['journal']):
    available_codes.append(code)
widget_codes = ipywidgets.Dropdown(
    options=[code for code in available_codes],
    value=available_codes[0],
    description='Journal:',
)
ipywidgets.interact(f, code=widget_codes)

Unnamed: 0_level_0,title,authors,url,journal,publish_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1480,Pulmonary Complications of Malignancies and Blood and Marrow Transplantation,"Jayasuriya, Geshani",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7120544/,Pulmonary Complications of Non-Pulmonary Pediatric Disorders,2017-10-24
1721,Selected Disorders of the Respiratory System,"Howlett, Bethany M.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7121868/,Family Medicine,2016-02-17
2093,Pleuropulmonary Changes Induced by Drugs in Patients with Hematologic Diseases,"Camus, Philippe",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7123804/,Pulmonary Involvement in Patients with Hematological Malignancies,2010-08-19
2133,Pulmonary Effects of Antineoplastic Therapy,"Dhakal, Sughosh",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7124061/,Survivors of Childhood and Adolescent Cancer,2015-04-13
1756,Sudden Death from Pulmonary Causes,"Cunningham, Kris S.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7122050/,Forensic Pathology Reviews,2011-07-16
1379,Alveolar Hemorrhage,"Wells, Jason",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7119931/,Orphan Lung Diseases,2014-12-11
8049,Novel insights on the pulmonary vascular consequences of COVID-19,"Potus, François",https://www.ncbi.nlm.nih.gov/pubmed/32551862/,Am J Physiol Lung Cell Mol Physiol,2020-08-01
6433,Diagnosis and treatment of pulmonary chronic GVHD: report from the consensus conference on clinical practice in chronic GVHD,"Hildebrandt, G C",https://doi.org/10.1038/bmt.2011.35,Bone Marrow Transplant,2011-03-28
1543,Care of the Postoperative Pulmonary Resection Patient,"Kuckelman, John",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7120963/,Surgical Critical Care Therapy,2018-05-04
8129,COVID-19: The Potential Treatment of Pulmonary Fibrosis Associated with SARS-CoV-2 Infection,"Lechowicz, Kacper",https://www.ncbi.nlm.nih.gov/pubmed/32575380/,J Clin Med,2020-06-19


<function __main__.f(code: str)>