In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
warnings.filterwarnings('ignore')

In [2]:
data  = pd.read_csv('../raw_data/neuroscience_data_100000.csv').drop(columns=['Unnamed: 0']).set_index('PMID').dropna(subset=['abstract','articleTitle'])
df_clean_title = data[~data.articleTitle.str.contains("{")]
df = df_clean_title[~df_clean_title.abstract.str.contains("{")]
df = df[df.abstract != '.']
df


Unnamed: 0_level_0,abstract,articleTitle,Journaltitle,volume,pubDate,authors,MeSh,citations,affiliations,keywords,IsReviewArticle
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
35961771,Navigation through complex environments requir...,Coordination between eye movement and whisking...,eNeuro,1,2022-Aug-12,,,0,[],,0
35961623,Recent evidence shows that genetic and environ...,Polygenic risk for schizophrenia as a moderato...,Progress in neuro-psychopharmacology & biologi...,1,2022-Aug-09,,,0,[],"Adversity, Bipolar disorder, Genetics, Psychos...",0
35961621,This study aimed to examine the prospective as...,The association of substance use with attainin...,Preventive medicine,1,2022-Aug-09,,,0,[],"Alcohol use, Cannabis use, Employment, Tobacco...",0
35961602,Psychosis presentation can be affected by gene...,Differences of affective and non-affective psy...,Journal of affective disorders,1,2022-Aug-09,,,0,[],"Bipolar disorder, Depression, First-episode, P...",0
35961582,Mental health and cognitive achievement are pa...,Brain structure and function show distinct rel...,Biological psychiatry. Cognitive neuroscience ...,1,2022-Aug-09,,,0,[],"cognition, genetics, mental health, polygenic ...",0
...,...,...,...,...,...,...,...,...,...,...,...
32848625,The inferior colliculus (IC) is an auditory mi...,Developmentally Regulated Rebound Depolarizati...,Frontiers in cellular neuroscience,14,2020,"Sant'Ana Leandro de Oliveira, Machado Sérgio, ...",,74,"Janeiro, Brazil., Janeiro, Brazil., Janeiro, B...","action potential, auditory system, brain slice...",0
32848624,Hearing loss is the third most common chronic ...,Berbamine Analogs Exhibit Differential Protect...,Frontiers in cellular neuroscience,14,2020,"Sant'Ana Leandro de Oliveira, Machado Sérgio, ...",,58,"Janeiro, Brazil., Janeiro, Brazil., Janeiro, B...","aminoglycoside, berbamine, hair cell, hearing ...",0
32848620,Tyrosine hydroxylase (Th) expression has previ...,Purkinje Cell-Specific Knockout of Tyrosine Hy...,Frontiers in cellular neuroscience,14,2020,"Sant'Ana Leandro de Oliveira, Machado Sérgio, ...",,124,"Janeiro, Brazil., Janeiro, Brazil., Janeiro, B...","catecholamine, cerebellum, cognition, dopamine...",0
32848619,Myelin is a dynamic membrane that is important...,That's a Wrap! Molecular Drivers Governing Neu...,Frontiers in cellular neuroscience,14,2020,"Sant'Ana Leandro de Oliveira, Machado Sérgio, ...",,48,"Janeiro, Brazil., Janeiro, Brazil., Janeiro, B...","Caspr, Nogo-A, PrPc, myelin, nogo receptor, pa...",1


In [3]:
df_abstract = df[['abstract']]
df_abstract

Unnamed: 0_level_0,abstract
PMID,Unnamed: 1_level_1
35961771,Navigation through complex environments requir...
35961623,Recent evidence shows that genetic and environ...
35961621,This study aimed to examine the prospective as...
35961602,Psychosis presentation can be affected by gene...
35961582,Mental health and cognitive achievement are pa...
...,...
32848625,The inferior colliculus (IC) is an auditory mi...
32848624,Hearing loss is the third most common chronic ...
32848620,Tyrosine hydroxylase (Th) expression has previ...
32848619,Myelin is a dynamic membrane that is important...


In [4]:
df_title = df[['articleTitle']]
df_title

Unnamed: 0_level_0,articleTitle
PMID,Unnamed: 1_level_1
35961771,Coordination between eye movement and whisking...
35961623,Polygenic risk for schizophrenia as a moderato...
35961621,The association of substance use with attainin...
35961602,Differences of affective and non-affective psy...
35961582,Brain structure and function show distinct rel...
...,...
32848625,Developmentally Regulated Rebound Depolarizati...
32848624,Berbamine Analogs Exhibit Differential Protect...
32848620,Purkinje Cell-Specific Knockout of Tyrosine Hy...
32848619,That's a Wrap! Molecular Drivers Governing Neu...


In [6]:
def clean_txt(text):
    
    text = text.lower()
    
    for punctuation in string.punctuation:
        
        text = text.replace(punctuation,'')
              
    text = ''.join(char for char in text if not char.isdigit()) 
    
    tokenized_text = word_tokenize(text)
    stop_words = set(stopwords.words('english')) 
    tokenized_sentence_cleaned = [w for w in tokenized_text 
                                  if not w in stop_words]
    
    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")
              for word in tokenized_sentence_cleaned]
    
    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
               for word in verb_lemmatized]
        
    return " ".join(noun_lemmatized)

df_title.articleTitle = df_title.articleTitle.astype(str).apply(clean_txt)
df_abstract.abstract = df_abstract.abstract.astype(str).apply(clean_txt)


In [35]:
df_mesh.MeSh = df_mesh.MeSh.astype(str).apply(clean_txt)

In [37]:
tfidf_vectorizer_mesh =TfidfVectorizer()

tfidf_mesh = tfidf_vectorizer_mesh.fit_transform(df_mesh.MeSh)

tfidf_mesh

<88070x11351 sparse matrix of type '<class 'numpy.float64'>'
	with 899857 stored elements in Compressed Sparse Row format>

In [7]:
tfidf_vectorizer_abstract = TfidfVectorizer()

tfidf_abstract = tfidf_vectorizer_abstract.fit_transform(df_abstract.abstract)

tfidf_abstract

<88070x197778 sparse matrix of type '<class 'numpy.float64'>'
	with 8007116 stored elements in Compressed Sparse Row format>

In [8]:
tfidf_vectorizer_title =TfidfVectorizer()

tfidf_title = tfidf_vectorizer_title.fit_transform(df_title.articleTitle)

tfidf_title

<88070x50248 sparse matrix of type '<class 'numpy.float64'>'
	with 870832 stored elements in Compressed Sparse Row format>

In [40]:
input_topic  = " ".join(['brain'])

cleaned_input = pd.DataFrame({'Predict search':clean_txt(input_topic)},index=[1])

cleaned_input

Unnamed: 0,Predict search
1,brain


In [10]:
# search_terms_list_1 = ["brain imaging animal"]#"neuronal circuit","voltage imaging",
# search_terms_list_2 = ["in vivo imaging","multiphoton","optogenetics","intravital","pre-clinical"]
# search_terms_list_3 = ["patchclamp","neurodegenerative"]

# input_topic = " ".join(search_terms_list_1)

In [42]:
input_tfidf_mesh= tfidf_vectorizer_mesh.transform(cleaned_input)

In [43]:
input_tfidf_title = tfidf_vectorizer_title.transform(cleaned_input)



In [44]:
input_tfidf_abstract = tfidf_vectorizer_abstract.transform(cleaned_input)

input_tfidf_abstract

<1x197778 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [46]:
n_neighbors = 100

KNN_t = NearestNeighbors(n_neighbors,p=2)
KNN_t.fit(tfidf_title)

NNs_t = KNN_t.kneighbors(input_tfidf_title,return_distance=True)

recommendations_by_title = df.iloc[list(NNs_t[1][0]), :]

recommendations_by_title.iloc[0]['articleTitle']

'From "satisfaction of search" to "subsequent search misses": a review of multiple-target search errors across radiology and cognitive science.'

In [47]:
n_neighbors = 100

KNN_a = NearestNeighbors(n_neighbors,p=2)
KNN_a.fit(tfidf_abstract)

NNs_a = KNN_a.kneighbors(input_tfidf_abstract,return_distance=True)

recommendations_by_abstract = df.iloc[list(NNs_a[1][0]), :]
recommendations_by_abstract.iloc[0]['abstract']

'Studying age-related changes in working memory (WM) and visual search can provide insights into mechanisms of visuospatial attention. In visual search, WM is used to remember previously inspected objects/locations and to maintain a mental representation of the target to guide the search. We sought to extend this work, using aging as a case of reduced WM capacity. The present study tested whether various domains of WM would predict visual search performance in both young (n\xa0=\xa047; aged 18-35\xa0yrs) and older (n\xa0=\xa048; aged 55-78) adults. Participants completed executive and domain-specific WM measures, and a naturalistic visual search task with (single) feature and triple-conjunction (three-feature) search conditions. We also varied the WM load requirements of the search task by manipulating whether a reference picture of the target (i.e., target template) was displayed during the search, or whether participants needed to search from memory. In both age groups, participants 

In [48]:
n_neighbors = 100

KNN_m = NearestNeighbors(n_neighbors,p=2)
KNN_m.fit(tfidf_mesh)

NNs_m = KNN_m.kneighbors(input_tfidf_mesh,return_distance=True)

recommendations_by_mesh = df.iloc[list(NNs_m[1][0]), :]

recommendations_by_mesh.iloc[0]['MeSh']

'COVID-19, Humans, Mental Health, Pandemics, SARS-CoV-2, Search Engine'

In [31]:
recommendations_by_abstract

Unnamed: 0_level_0,abstract,articleTitle,Journaltitle,volume,pubDate,authors,MeSh,citations,affiliations,keywords,IsReviewArticle
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33453649,Studying age-related changes in working memory...,Role of aging and working memory in performanc...,Cortex; a journal devoted to the study of the ...,136,2021-03,,"Adolescent, Adult, Aged, Aging, Attention, Hum...",0,[],"Aging, Individual differences, Visual search, ...",0
35044053,Visual search guides goal-directed action in h...,Visual search: Heritability and association wi...,"Genes, brain, and behavior",21,2022-02,"Du Fangxin, Dong Zhiyong, Guan Yiran, Zeid Abd...","Humans, Intelligence, Phenotype, Twins, Dizygo...",38,"Anhui, China., Anhui, China., Anhui, China., 3...","cognition, etiology, evolution, genetics, indi...",0
35623205,"Patients and their caregivers, including clini...","Infodemiology of autoimmune encephalitis, auto...",Epilepsy & behavior : E&B,132,2022-07,"Marques-Pereira Catarina, Pires Manuel, Moreir...","Encephalitis, Epilepsy, Hashimoto Disease, Hum...",0,"Pavia, Italy., Pavia, Italy., Pavia, Italy.","Autoimmune encephalitis, Autoimmune epilepsy, ...",0
34609229,Search-the problem of exploring a space of alt...,Sources of variation in search and foraging: A...,Quarterly journal of experimental psychology (...,75,2022-Feb,"Penfold Rose S, Zazzara Maria Beatrice, Österd...","Attention, Exploratory Behavior, Humans",0,"Foundation Trust., Rome, Italy., Rome, Italy.,...","Foraging, cognitive control, cognitive deficit...",1
34056737,Nonhuman primates (NHPs) are widely studied ac...,Comprehensive search filters for retrieving pu...,American journal of primatology,83,2021-07,"Cassidy Lauren C, Leenaars Cathalijn H C, Rinc...","Animals, Databases, Bibliographic, Primates, R...",15,"Goettingen, Germany., The Netherlands., The Ne...","literature review, nonhuman primates, search f...",0
...,...,...,...,...,...,...,...,...,...,...,...
32914556,PEDro (the Physiotherapy Evidence Database) is...,A description of the primary studies of diagno...,Physiotherapy research international : the jou...,25,2020-Oct,"Vacek Sarah, Whisman Mark A","Databases, Factual, Diagnostic Tests, Routine,...",33,"China. zjm135@zju.edu.cn., China. zjm135@zju.e...","bibliographic databases, diagnosis, evidence-b...",1
33216369,There is a dearth of long-term follow-up studi...,Five-year outcomes of ADHD diagnosed in adulth...,Scandinavian journal of psychology,62,2021-Feb,"Donkervoort Sandra, Kutzner Carl E, Hu Ying, L...","Adolescent, Adult, Attention Deficit Disorder ...",69,"CA 92037., CA 92037., CA 92037., CA 92037., CA...","Adult, attention-deficit/hyperactivity disorde...",0
33667949,Predicting the onset and course of mood and an...,Predicting the 9-year course of mood and anxie...,Psychiatry research,299,2021-05,"Barrera-Conde Marta, Ausin Karina, Lachén-Mont...","Adult, Anxiety, Anxiety Disorders, Bayes Theor...",0,"Barcelona, Spain., Barcelona, Spain., Barcelon...","Anxiety disorder, Depression, Epidemiologic me...",0
33309675,Selective attention can facilitate performance...,Visual Working Memory Guides Spatial Attention...,Neuropsychologia,151,2021-01-22,"Raffaele Stefano, Gelosa Paolo, Bonfanti Elisa...","Attention, Electroencephalography, Humans, Mem...",0,"Gothenburg, Sweden., Gothenburg, Sweden., Goth...","Alpha, Attentional selection, CDA, N2pc, Visua...",0


In [51]:
df_abstract

Unnamed: 0_level_0,abstract
PMID,Unnamed: 1_level_1
35961771,navigation complex environment require motor p...
35961623,recent evidence show genetic environmental ris...
35961621,study aim examine prospective association toba...
35961602,psychosis presentation affect genetic environm...
35961582,mental health cognitive achievement partly her...
...,...
32848625,inferior colliculus ic auditory midbrain struc...
32848624,hear loss third common chronic health conditio...
32848620,tyrosine hydroxylase th expression previously ...
32848619,myelin dynamic membrane important coordinate f...


In [52]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=df_abstract['abstract'])
wv = word2vec.wv

In [58]:
wv

<gensim.models.keyedvectors.KeyedVectors at 0x7fae850eda00>

In [68]:
print(wv.most_similar('a'))

[('o', 0.404689759016037), ('÷', 0.3907855749130249), ('‒', 0.38502994179725647), ('ā', 0.35999223589897156), ('´', 0.3563843071460724), ('≧', 0.3500874638557434), ('\ufeff', 0.345418781042099), ('ᵞ', 0.33598455786705017), ('¯', 0.32226788997650146), ('»', 0.3152369558811188)]
