### Module for searching articles

In [1]:
# Notebook settings.
# Address assigned to Entrez.email before every request (empty by default; fill in your own).
your_entrez_email = ''
# Number of most similar articles kept in the final result.
similar_art_count = 20
# Maximum number of PubMed records requested by a search (used as retmax).
articles_count = 10000

In [2]:
from Bio import Entrez
import json
from datetime import date

from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd

In [3]:
def search(query):
    """Run a PubMed search through Entrez and return the parsed response.

    Parameters
    ----------
    query : str
        Search term, passed to ``Entrez.esearch`` as ``term``.

    Returns
    -------
    The object produced by ``Entrez.read`` for the esearch response. The
    notebook reads the matching record IDs from its ``'IdList'`` entry.
    At most ``articles_count`` records are requested (``retmax``).
    """
    Entrez.email = your_entrez_email
    handle = Entrez.esearch(db='pubmed', sort='name', retmax=articles_count, retmode='xml', term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

In [4]:
def fetch_details(id_list):
    """Download the PubMed records for the given IDs and return them parsed.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs. Each item is converted with ``str`` before joining, so
        both string IDs and integer IDs are accepted.

    Returns
    -------
    The object produced by ``Entrez.read`` for the efetch response. The
    notebook iterates over its ``'PubmedArticle'`` entry.
    """
    # str() lets callers pass integer PMIDs as well as the string IDs from a search.
    ids = ','.join(str(article_id) for article_id in id_list)
    Entrez.email = your_entrez_email
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

### Extract the necessary information and collect the retrieved data into a DataFrame

In [5]:
def get_abstract(article):
    """Return the first abstract section of an article as a plain string.

    Parameters
    ----------
    article : mapping
        One article record; the text is looked up under
        ``['MedlineCitation']['Article']['Abstract']['AbstractText']``.

    Returns
    -------
    str
        ``str()`` of the first ``AbstractText`` item, or the sentinel
        ``'Abstract not found'`` when any key on the path is missing or the
        ``AbstractText`` list is empty.
    """
    try:
        sections = article['MedlineCitation']['Article']['Abstract']['AbstractText']
        # Only the first section is used; any further sections are ignored.
        return str(sections[0])
    except (KeyError, IndexError):
        # IndexError covers an empty AbstractText list, mirroring get_keywords.
        return 'Abstract not found'

In [6]:
def get_keywords(article):
    """Collect an article's keywords as a list of plain strings.

    Only the first entry of ``article['MedlineCitation']['KeywordList']`` is
    read. A ``KeyError`` or ``IndexError`` raised while reading is swallowed,
    and whatever has been collected up to that point (usually nothing) is
    returned.
    """
    collected = []
    try:
        first_group = article['MedlineCitation']['KeywordList'][0]
        collected.extend(str(word) for word in first_group)
    except (KeyError, IndexError):
        pass
    return collected

In [7]:
def get_article_date(article):
    """Return the article's completion date, or ``''`` when it is unavailable.

    Parameters
    ----------
    article : mapping
        One article record; the date is read from
        ``article['MedlineCitation']['DateCompleted']``, which must provide
        ``'Year'``, ``'Month'`` and ``'Day'`` values convertible with ``int``.

    Returns
    -------
    datetime.date or str
        The completion date, or the empty string when a key is missing or the
        stored values do not form a valid calendar date.
    """
    try:
        completed = article['MedlineCitation']['DateCompleted']
        # Read the fields directly. The previous eval(str(...)) round trip
        # executed text derived from downloaded data and only worked when the
        # object's string form happened to be a valid Python literal.
        return date(int(completed['Year']), int(completed['Month']), int(completed['Day']))
    except (KeyError, ValueError):
        # Missing field or impossible date: use the same "no date" sentinel.
        return ''

In [8]:
def get_title(article):
    """Return the value stored under MedlineCitation -> Article -> ArticleTitle."""
    article_section = article['MedlineCitation']['Article']
    return article_section['ArticleTitle']

In [9]:
def get_id(article):
    """Return the article's PMID (MedlineCitation -> PMID) converted to int."""
    pmid = article['MedlineCitation']['PMID']
    return int(pmid)

In [5]:
### To skip the article download, you can load the saved dataset (breast_cancer_data.csv) instead
# data = pd.read_csv('breast_cancer_data.csv')

In [15]:
# Search term sent to PubMed.
keyword = 'breast cancer'

# search() returns the parsed search response; its 'IdList' entry holds the
# matching record IDs, which fetch_details() then downloads.
# NOTE: the name `result` is rebound to a DataFrame in a later cell.
result = search(keyword)
id_list = result['IdList']
details = fetch_details(id_list)

In [11]:
# One list per output column; entry k of every list describes the same article.
ids, titles, dates, key_words, abstracts = [], [], [], [], []

# Walk the downloaded records in order and pull out the fields of interest.
for article in details['PubmedArticle']:
    ids.append(get_id(article))
    titles.append(get_title(article))
    dates.append(get_article_date(article))
    key_words.append(get_keywords(article))
    abstracts.append(get_abstract(article))

NameError: name 'details' is not defined

In [17]:
# Column name -> list of per-article values collected above.
data_dict = dict(
    id=ids,
    title=titles,
    date=dates,
    keywords=key_words,
    abstract=abstracts,
)
# One row per article, columns in the order given above.
data = pd.DataFrame(data_dict)

In [19]:
# data.to_csv('breast_cancer_data.csv')

In [3]:
### To skip the article download, load the saved dataset (breast_cancer_data.csv) instead.
### This rebinds `data`, replacing any DataFrame built from a fresh download.
data = pd.read_csv('breast_cancer_data.csv')

In [4]:
# Sample abstracts used as test queries for the similarity search.
# `annotation` is the text appended to the corpus in a later cell;
# `annotation_2` is a second sample that the visible cells do not reference.
annotation = 'A method of care of patients with locally-advanced breast cancer consisting of using regional selective intraar- terial chemotherapy in the schedule of the complex (palliative) treatment is presented. Results of treatment showed an advantage in comparison with the application of traditional methods of the breast cancer treatment. The methodology on intra-arterial introduction of chemotherapeutic agents developed at Donetsk Regional Antitumoral Center and Uni- versity clinic of Odessa showed its undisputable contribution into development of modern oncology through decrease of primary tumoral locus, transition from inoperable state into the state at which it is possible to perform the radical volume of surgical interference to patient. The special attention is paid to development of new methods of treating BC patients (regional forms of disease) with unfavorable factors for forecast of tumor growth.'
annotation_2 = 'This multicenter study assessed breast cancer screening uptake in 461 unaffected women at increased risk of developing breast cancer on the basis of family history who approached familial cancer clinics for advice about surveillance options. At the time of attending the clinic, 89% and 90% of participants were vigilant with respect to age- and risk-specific recommendations for mammography and clinical breast examination, respectively, and 51% reported practicing breast self-examination monthly or more frequently. The degree to which health outcomes are perceived to be under one personal control (χ2=-2.09, p=0.0037) and breast cancer anxiety (χ2=8.11,p=0.044) were both associated with monthly or more frequent breast self-examination, while there were no associations with sociodemographic characteristics. A significantly lower percentage (56%) of women aged <30 were vigilant with respect to mammography recommendations, compared to 77%, 96% and 98% of women aged 30-39, 40-49 and >50, respectively (χ2=37.2,p<0.0001). These relatively low rates of mammographic screening in young women may reflect concerns about increased cancer risk associated with early and repeated radiation exposure or lack of sensitivity in young women with radiographically dense breasts. If mammographic screening is ultimately shown to lower mortality in women at high risk, there will be a strong case to promote screening in young women. The need for regular mammographic screening would then need to be highlighted and reinforced amongst young women and their referring physicians. Awareness amongst general practitioners, who are largely responsible for referral to screening services, would also need to be increased.'

In [7]:
# Corpus to vectorise: every available abstract, followed by the query annotation.
x = data.abstract
# Drop the 'Abstract not found' placeholders (and any missing values).
x = x.where(x != 'Abstract not found').dropna()
# Append the annotation as the last row under a label above the download limit.
# Series.append was removed in pandas 2.0; pd.concat builds the same Series.
x = pd.concat([x, pd.Series(annotation, index=[articles_count + 1])])

In [9]:
# Unigrams and bigrams, English stop words removed; terms are kept only when
# their document frequency lies between the min_df and max_df bounds.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.2,
    max_df=0.9,
)
# One TF-IDF row per entry of x, in the same order (the annotation is last).
tfidf = tfidf_vectorizer.fit_transform(x)

In [None]:
# Display the most important words (highest TF-IDF weights) of a single document
# pd.DataFrame(tfidf[65].T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=['tfidf']). \
#     sort_values(by=['tfidf'], ascending=False)

In [10]:
# Dense matrix of pairwise dot products between all TF-IDF rows.
# NOTE: this is n x n for n documents, so memory grows quadratically.
similarity_matrix = (tfidf @ tfidf.T).toarray()
# The last row belongs to the appended annotation; its final entry is the
# annotation compared with itself, so it is left out.
similarity = similarity_matrix[-1, :-1]
# Ascending order, so the largest scores sit at the end.
similarity_sorted = np.sort(similarity)

In [11]:
# The last `similar_art_count` values of the ascending array are the highest scores.
top_start = len(similarity_sorted) - similar_art_count
top = similarity_sorted[top_start:]
# Positions (within `similarity`) whose score equals one of the top values.
# Tied scores can therefore yield more than `similar_art_count` positions.
is_top = np.isin(similarity, top)
index_list = np.where(is_top)[0]

In [12]:
# index_list holds row positions within x (the filtered abstracts), not index
# labels of data. Rows without an abstract were dropped before vectorising, so
# positions and labels diverge as soon as one abstract is missing. Translate
# the positions into x's index labels, which are data's labels, then select.
result = data.loc[x.index[index_list]]

In [13]:
# Write the selected articles to result.csv in the working directory.
result.to_csv('result.csv')