In [4]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install requests
!pip install bs4
!pip install lxml
!pip install lxml beautifulsoup4
!pip install scikit-learn
!pip install torch torchvision torchaudio
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-win_amd64.whl (12.2 MB)
     ---------------------------------------- 0.0/12.2 MB ? eta -:--:--
      --------------------------------------- 0.2/12.2 MB 3.9 MB/s eta 0:00:04
     - -------------------------------------- 0.4/12.2 MB 4.2 MB/s eta 0:00:03
     -- ------------------------------------- 0.6/12.2 MB 4.5 MB/s eta 0:00:03
     -- ------------------------------------- 0.9/12.2 MB 4.7 MB/s eta 0:00:03
     --- ------------------------------------ 1.1/12.2 MB 4.8 MB/s eta 0:00:03
     ---- ----------------------------------- 1.4/12.2 MB 4.8 MB/s eta 0:00:03
     ----- ---------------------------------- 1.6/12.2 MB 4.8 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.2 MB 4.8 MB/s eta 0:00:03
     ------ --------------------------------- 2.1/12.2 MB 4.9 MB/s eta 0:00:03
     ------- -------------------------------- 2.3/12.2 MB 5.0 MB/s eta 0:00:02
     -------- ------------------------------- 2.5/12.2 MB 4

In [4]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
import lxml
import sklearn
import spacy
from datetime import datetime

In [5]:
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
api_key = 'cf94d59a96d4b17b7389dde3a72724a2eb08'

In [6]:
url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=asthma&retmax=1000&usehistory=y&retmode=json'
# url = f'{base_url}esearch.fcgi?db={db}&term={terms_string}&retmax={retmax}'
url = f'{base_url}esearch.fcgi?db=pubmed&term=asthma&retmax=1000&usehistory=y'
url = f'{base_url}esearch.fcgi?db=pubmed&retmax=1000&usehistory=y'
response = requests.get(url)
print(response.text)

ids = re.findall(r"<Id>(\d+)</Id>", response.text)

web_match = re.search(r"<WebEnv>(\S+)</WebEnv>", response.text)
web = web_match.group(1) if web_match else None

key_match = re.search(r"<QueryKey>(\d+)</QueryKey>", response.text)
key = key_match.group(1) if key_match else None

print(web)
print(key)

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
<eSearchResult>
	<ERROR>Empty term and query_key - nothing todo</ERROR>
</eSearchResult>

None
None


In [6]:
def search_by_terms(terms, db='pubmed', retmax=1000, use_history=True):

    if not terms:
        print('ERROR: No terms to search for')
        if use_history:
            return None, None, None
        return None
    
    terms_string = '+AND+'.join([s.strip().replace(' ', '+') for s in terms])

    url = f'{base_url}esearch.fcgi?db={db}&term={terms_string}&retmax={retmax}&api_key={api_key}'
    if use_history:
        url += '&usehistory=y'

    response = requests.get(url)
    if response.status_code != 200:
        print('ERROR: Bad response code')
        if use_history:
            return None, None, None
        return None
    
    ids = re.findall(r"<Id>(\d+)</Id>", response.text)

    if use_history:
        web_match = re.search(r"<WebEnv>(\S+)</WebEnv>", response.text)
        web = web_match.group(1) if web_match else None

        key_match = re.search(r"<QueryKey>(\d+)</QueryKey>", response.text)
        key = key_match.group(1) if key_match else None

        return ids, web, key

    return ids

In [7]:
def get_articles_data(ids=[], web_env='', query_key='', db='pubmed', retmax=1000):

    use_web_env = False

    if not ids and not (query_key and web_env):
        print('ERROR: No ids or query_key/web_env provided')
        return None
    elif not ids:
        use_web_env = True 

    url = f'{base_url}efetch.fcgi?db={db}'
    if use_web_env:
        url += f'&query_key={query_key}&WebEnv={web_env}'
    else:
        ids_string = [str(id) for id in ids]
        url += '&id=' + ','.join(ids_string)

    url += f'&rettype=abstract&retmode=xml&api_key={api_key}&retmax={retmax}'

    response = requests.get(url)

    if response.status_code != 200:
        print('ERROR: Bad response code')
        return None
    
    soup = BeautifulSoup(response.text, features="xml")
    articles = soup.find_all('PubmedArticle')
    if not articles:
        print('ERROR: No articles found')
        return None
    
    data = pd.DataFrame(columns=['id', 'title', 'abstract', 'keywords', 'pub_date'])
    for article in articles:
        id = article.find('PMID').get_text()
        date = article.find('PubMedPubDate', {'PubStatus': 'received'})
        pub_date = None
        if date:
            pub_date = datetime.strptime(f'{date.find("Day").get_text()} {date.find("Month").get_text()} {date.find("Year").get_text()}', "%d %m %Y")
        title = article.find('ArticleTitle').get_text()
        abstract = ''.join([a.get_text() for a in article.find_all('AbstractText')])
        keywords = [k.get_text() for k in article.find_all('Keyword')]
        data = pd.concat([data, pd.DataFrame({'id': id, 'title': title, 'abstract': abstract, 'keywords': [keywords], 'pub_date': pub_date})]).reset_index(drop=True)

    return data

**id, title, abstract, keywords, pub_date** \
\<PubDate> \
                            \<Year>**2023**\</Year> \
                            \<Month>**May**\</Month> \
                            \<Day>**28**\</Day> \
                        \</PubDate>


\<ArticleTitle>**Metformin protects against retinal ischemia/reperfusion injury through AMPK-mediated mitochondrial fusion.**\</ArticleTitle>

\<Abstract> \
                    \<AbstractText>**Retinal ischemia/reperfusion (I/R) ...injury.**\</AbstractText> \
                    \<CopyrightInformation>**Copyright &#xa9; 2023. Published by Elsevier Inc.**\</CopyrightInformation> \
                \</Abstract>


\<KeywordList Owner="NOTNLM"> \
                \<Keyword MajorTopicYN="N">**AMPK**\</Keyword>            
                \<Keyword MajorTopicYN="N">**Retinal ischemia/reperfusion injury**\</Keyword> \
            \</KeywordList>    

In [12]:
def clean_data(data, drop_id_duplicates=True, drop_empty_abstracts=True, drop_nan_abstracts=True, drop_abstracts_with_matches=True, drop_abstracts_matches=['[This corrects the article DOI: ', '[This retracts the article DOI: '], drop_date_nan=False, drop_date_before=None, drop_date_after=None, search_terms=[]):
    if data is None or data.empty:
        print('ERROR: No data provided')
        return None

    if drop_id_duplicates:
        data = data.drop_duplicates(subset=['id']).reset_index(drop=True)

    if drop_empty_abstracts:
        data = data.loc[data['abstract'] != ''].reset_index(drop=True)

    if drop_nan_abstracts:
        data = data.dropna(subset=['abstract']).reset_index(drop=True)
    
    if drop_abstracts_with_matches and drop_abstracts_matches:
        data = data.loc[~data['abstract'].str.startswith(tuple(drop_abstracts_matches))].reset_index(drop=True)  

    if drop_date_nan:
        data = data.dropna(subset=['pub_date']).reset_index(drop=True)

    if drop_date_before:
        data = data.loc[data['pub_date'] > drop_date_before].reset_index(drop=True)
    if drop_date_after:
        data = data.loc[data['pub_date'] < drop_date_after].reset_index(drop=True)
    
    if search_terms:
        data['search_terms'] = [search_terms]*len(data)
        
    return data

In [19]:
def data_extraction_pipeline(terms, db='pubmed', n_articles=1000, use_history=True, drop_id_duplicates=True, drop_empty_abstracts=True, drop_nan_abstracts=True, drop_abstracts_with_matches=True, drop_abstracts_matches=['[This corrects the article DOI: ', '[This retracts the article DOI: '], drop_date_nan=False, drop_date_before=None, drop_date_after=None, add_search_terms=True, file_name=None):
    data = None
    if use_history:
        ids, web, key = search_by_terms(terms, db=db, retmax=n_articles, use_history=use_history)
        data = get_articles_data(web_env=web, query_key=key, retmax=n_articles)
    else:
        ids = search_by_terms(terms, db=db, retmax=n_articles, use_history=use_history)
        data = get_articles_data(ids=ids, db=db, retmax=n_articles)

    if data is None:
        return None
    
    if add_search_terms:
        data = clean_data(data, drop_id_duplicates=drop_id_duplicates, drop_empty_abstracts=drop_empty_abstracts, drop_nan_abstracts=drop_nan_abstracts, drop_abstracts_with_matches=drop_abstracts_with_matches, drop_abstracts_matches=drop_abstracts_matches, drop_date_nan=drop_date_nan, drop_date_before=drop_date_before, drop_date_after=drop_date_after, search_terms=terms)
    else:
        data = clean_data(data, drop_id_duplicates=drop_id_duplicates, drop_empty_abstracts=drop_empty_abstracts, drop_nan_abstracts=drop_nan_abstracts, drop_abstracts_with_matches=drop_abstracts_with_matches, drop_abstracts_matches=drop_abstracts_matches, drop_date_nan=drop_date_nan, drop_date_before=drop_date_before, drop_date_after=drop_date_after)
    
    if file_name:
        data.to_csv(f'../data/{file_name}.csv', index=False)
        
    return data

In [20]:
terms = []
print('Enter search terms (enter "q" when finished):')
while True:
    term = input()
    if term == 'q':
        break
    terms.append(term)

print('Enter max number of articles to search for:')
n_articles = int(input())

print(f'Searching for max {n_articles} articles with terms: {terms}')

print('Do you want to save the data? (y/n)')
while True:
    save = input()
    if save == 'y':
        print('Enter file name:')
        file_name = input()
        data = data_extraction_pipeline(terms=terms, n_articles=n_articles, file_name=file_name)
        break
    elif save == 'n':
        data = data_extraction_pipeline(terms=terms, n_articles=n_articles)
        break

data

Enter search terms (enter "q" when finished):
Enter max number of articles to search for:
Searching for max 15 articles with terms: ['liver', 'spleen']
Do you want to save the data? (y/n)
Enter file name:


Unnamed: 0,id,title,abstract,keywords,pub_date,search_terms
0,37366124,[Effects of interleukin-17A on liver and kidne...,To explore the effect of interleukin-17A (IL-1...,[],NaT,"[liver, spleen]"
1,37364301,PBTK-TD model of the phagocytosis activity in ...,Due to the high production volume and persiste...,"[BPA, Fish, Immunomarker, PBTK-TD model, Phago...",2023-01-26,"[liver, spleen]"
2,37356957,[Clinical Analysis of Patients with Blastic Pl...,"To explore the clinical characteristics, treat...",[blastic plasmacytoid dendritic cell neoplasm\...,NaT,"[liver, spleen]"
3,37355947,Validation of Spleen Shear Wave Elastography f...,Total number of avoided endoscopies using Bave...,[],NaT,"[liver, spleen]"
4,37355081,Tiaogan Jiejiu Tongluo Formula attenuated alco...,"Tiaogan Jiejiu Tongluo Formula (TJTF), a tradi...","[AMPKα, Chronic alcoholic liver injury, LKB1, ...",2023-05-10,"[liver, spleen]"
5,37354684,"STUDY OF THE PROCESSES OF LIPID PEROXIDATION, ...",The effectiveness of management of patients wi...,[],NaT,"[liver, spleen]"
6,37354616,Pathogenicity and innate immune responses indu...,"Fowl adenovirus serotype 8b (FAdV-8b), as caus...","[fowl adenovirus serotype 8b, immune response,...",2023-03-17,"[liver, spleen]"
7,37354568,N6-methyladenosine methylation mediates non-co...,Owing to their potential adverse health effect...,"[Cardiotoxicity, Circ-Arfgef2, LncG3bp2, Micro...",2023-03-10,"[liver, spleen]"
8,37354099,Preclinical pharmacokinetics of 4-hydroxy isol...,Aim: To study the preclinical pharmacokinetics...,"[4-hydroxyisoleucine, LC–MS/MS, bioavailabilit...",NaT,"[liver, spleen]"
9,37353061,Hepcidin and piscidin modulation and antibacte...,Vibriosis is an infectious disease that genera...,"[Antimicrobial peptides, Hepcidin, Humoral imm...",2023-02-20,"[liver, spleen]"


## TESTS

In [43]:
ids, web, key = search_by_terms(['ischemia', 'reperfusion'], retmax=1)
print(ids)
print(web, key)

print('--------')

ids, web, key = search_by_terms([])
print(ids)
print(web, key)

['37249015']
MCID_6475dc0fbdf29f532548b292 1
--------
ERROR: No terms to search for
None
None None


In [198]:
asd = get_articles_data(ids=[123456789,987654321])

url = https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=123456789,987654321&rettype=abstract&retmode=xml&api_key=cf94d59a96d4b17b7389dde3a72724a2eb08&retmax=1000
request status code = 200
ERROR: No articles found


In [195]:
df.loc[df['abstract'].str.len() <= 100]

Unnamed: 0,id,title,abstract,keywords,pub_date,search_terms


In [196]:
assert len(data.loc[(data['abstract'].isna()) | (data['abstract'] == '')]) == 0

In [200]:
n_articles = 1000
terms = ['ischemia', 'reperfusion']

ids, web, key = search_by_terms(terms, retmax=n_articles)
data = get_articles_data(web_env=web, query_key=key, retmax=n_articles)
data = clean_data(data, search_terms=terms)

data.to_csv('../data/pubmed_data.csv', index=False)

data

Unnamed: 0,id,title,abstract,keywords,pub_date,search_terms
0,37253410,Metformin protects against retinal ischemia/re...,Retinal ischemia/reperfusion (I/R) injury is a...,"[AMPK, Metformin, Mitochondrial dynamics, Reac...",2023-03-26,"[ischemia, reperfusion]"
1,37252276,Ginsenoside compound K protects against cerebr...,"Ginsenoside compound K (CK), the main active m...","[Bioenergy, Cerebral ischemia/reperfusion inju...",2022-06-06,"[ischemia, reperfusion]"
2,37251826,Non-coding RNA mediates endoplasmic reticulum ...,Apoptosis is a complex and highly self-regulat...,"[Apoptosis, Endoplasmic reticulum stress, Hear...",2022-09-30,"[ischemia, reperfusion]"
3,37251271,E-Selectin/AAV Gene Therapy Promotes Myogenesi...,The response to ischemia in peripheral artery ...,[],2023-02-15,"[ischemia, reperfusion]"
4,37249913,Insufficient oxygen inhalation during cardiopu...,Cardiac arrest (CA) and concomitant post-CA sy...,"[cardiopulmonary resuscitation, heart arrest, ...",2022-12-15,"[ischemia, reperfusion]"
...,...,...,...,...,...,...
960,36826023,Myrrh Essential Oil Mitigates Renal Ischemia/R...,Ischemia/reperfusion (I/R)-induced renal injur...,"[anti-apoptotic, anti-inflammatory, antioxidan...",2022-12-22,"[ischemia, reperfusion]"
961,36825607,Testicular torsion in vivo models: Mechanisms ...,Testicular torsion is a condition in which a t...,"[animal models, medical therapy, reperfusion, ...",2022-07-25,"[ischemia, reperfusion]"
962,36825458,Safety and efficacy of intracoronary thromboly...,Large thrombus burden in patients with ST elev...,"[STEMI, intracoronary, primary PCI, thrombolys...",NaT,"[ischemia, reperfusion]"
963,36824459,The role of (pro)renin receptor and its solubl...,The renin-angiotensin system (RAS) is a major ...,"[(pro)renin receptor, cardiovascular disease, ...",2022-11-01,"[ischemia, reperfusion]"


In [201]:
df = pd.read_csv('../data/pubmed_data.csv')
df

Unnamed: 0,id,title,abstract,keywords,pub_date,search_terms
0,37253410,Metformin protects against retinal ischemia/re...,Retinal ischemia/reperfusion (I/R) injury is a...,"['AMPK', 'Metformin', 'Mitochondrial dynamics'...",2023-03-26,"['ischemia', 'reperfusion']"
1,37252276,Ginsenoside compound K protects against cerebr...,"Ginsenoside compound K (CK), the main active m...","['Bioenergy', 'Cerebral ischemia/reperfusion i...",2022-06-06,"['ischemia', 'reperfusion']"
2,37251826,Non-coding RNA mediates endoplasmic reticulum ...,Apoptosis is a complex and highly self-regulat...,"['Apoptosis', 'Endoplasmic reticulum stress', ...",2022-09-30,"['ischemia', 'reperfusion']"
3,37251271,E-Selectin/AAV Gene Therapy Promotes Myogenesi...,The response to ischemia in peripheral artery ...,[],2023-02-15,"['ischemia', 'reperfusion']"
4,37249913,Insufficient oxygen inhalation during cardiopu...,Cardiac arrest (CA) and concomitant post-CA sy...,"['cardiopulmonary resuscitation', 'heart arres...",2022-12-15,"['ischemia', 'reperfusion']"
...,...,...,...,...,...,...
960,36826023,Myrrh Essential Oil Mitigates Renal Ischemia/R...,Ischemia/reperfusion (I/R)-induced renal injur...,"['anti-apoptotic', 'anti-inflammatory', 'antio...",2022-12-22,"['ischemia', 'reperfusion']"
961,36825607,Testicular torsion in vivo models: Mechanisms ...,Testicular torsion is a condition in which a t...,"['animal models', 'medical therapy', 'reperfus...",2022-07-25,"['ischemia', 'reperfusion']"
962,36825458,Safety and efficacy of intracoronary thromboly...,Large thrombus burden in patients with ST elev...,"['STEMI', 'intracoronary', 'primary PCI', 'thr...",,"['ischemia', 'reperfusion']"
963,36824459,The role of (pro)renin receptor and its solubl...,The renin-angiotensin system (RAS) is a major ...,"['(pro)renin receptor', 'cardiovascular diseas...",2022-11-01,"['ischemia', 'reperfusion']"


# SPACY

In [39]:
nlp = spacy.load('en_core_web_sm')

if abstracts:
    for a in abstracts:
        print(a)
        doc = nlp(a)
        for ent in doc.ents:
            print(ent.text, ent.label_)
        print('---')

Retinal ischemia/reperfusion (I/R) injury is a common pathological process responsible for cellular damage in glaucoma, diabetic retinopathy and hypertensive retinopathy. Metformin is a biguanide drug that exerts strong effects on multiple diseases. This study aims to evaluate the protective effect of metformin against retinal I/R injury and its underlying mechanism. I/R induced reduction in retina thickness and cell number in ganglion cell layer, and metformin alleviated I/R-induced retinal injury. Both retinal I/R and simulated ischemia/reperfusion (SIR) in R28 cells down-regulated expression of mitochondrial fusion protein Mfn2 and OPA1, which led to mitochondrial fission. Metformin also alleviated damage in R28 cells, and reversed the alteration in Mfn2 and OPA1, mitochondrial fission and mitochondrial membrane potential (MMP) disruption-induced by I/R or SIR as well. Intriguingly, inhibition of AMPK by compound C or siRNA prevented metformin-mediated up-regulation of Mfn2 and OPA1

KeyboardInterrupt: 

# MedSpacy

In [31]:
!pip install medspacy

Collecting medspacy
  Downloading medspacy-1.1.2.tar.gz (111 kB)
     ---------------------------------------- 0.0/111.4 kB ? eta -:--:--
     -------------------------------------- 111.4/111.4 kB 3.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting PyRuSH>=1.0.8
  Downloading PyRuSH-1.0.8-cp310-cp310-win_amd64.whl (63 kB)
     ---------------------------------------- 0.0/63.3 kB ? eta -:--:--
     ---------------------------------------- 63.3/63.3 kB 3.3 MB/s eta 0:00:00
Collecting medspacy-quickumls==3.0
  Downloading medspacy_quickumls-3.0-py3-none-any.whl (92 kB)
     ---------------------------------------- 0.0/92.1 kB ? eta -:--:--
     ----------------------------------------

  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [34 lines of output]
      c:\Users\grego\OneDrive\Desktop\progetto diploma\.conda\lib\site-packages\setuptools\__init__.py:84: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
      !!
      
              ********************************************************************************
              Requirements should be satisfied by a PEP 517 installer.
              If you are using pip, you can try `pip install --use-pep517`.
              ********************************************************************************
      
      !!
        dist.fetch_build_eggs(dist.setup_requires)
      running bdist_wheel
      running build
      running build_ext
      building 'unqlite' extension
      creating build
      creating build\temp.win-amd64-cpython-310
      creating build\temp.win-amd64-cpython-310\Release
      creating build

In [32]:
import sys
import spacy
import medspacy

from medspacy.ner import TargetMatcher, TargetRule
from medspacy.visualization import visualize_ent, visualize_dep

In [64]:
nlp = medspacy.load()

# nlp = medspacy.load('en_core_web_sm', disable={'ner'})

# nlp = spacy.load('en_core_web_sm', disable={'ner'})
# nlp = medspacy.load(nlp)

In [71]:
# nlp = medspacy.load(medspacy_enable=["medspacy_pyrush"])
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    # TargetRule(literal="abdominal pain", category="PROBLEM"),
    # TargetRule("stroke", "PROBLEM"),
    # TargetRule("hemicolectomy", "TREATMENT"),
    # TargetRule("Hydrochlorothiazide", "TREATMENT"),
    # TargetRule("colon cancer", "PROBLEM"),
    # TargetRule("metastasis", "PROBLEM"),
    TargetRule("asthma", "TREATMENT"),
]
target_matcher.add(target_rules)
text = ''.join(abstracts2)

# doc = nlp(text)
doc = nlp(abstracts2[0])

# Update sentence boundaries
# sentences = [sent.text for sent in doc.sents]
# new_doc = spacy.tokens.Doc(nlp.vocab, words=doc, spaces=[True] * len(doc))
# new_doc.sentences = sentences

# # Iterate over the entities in the new document
# for ent in new_doc.ents:
#     print(ent.text, ent.label_)



print(doc.ents)

(asthma,)


In [68]:
abstracts2[0]

'Tobacco use commonly starts during adolescence and is the leading cause of preventable disease, disability, and death in the United States. Secondhand smoke (SHS) exposure increases asthma and respiratory infection hospitalizations and contributes to sudden unexpected infant death. Few pediatric hospitalist-led smoking cessation studies are formal quality improvement (QI), with most at academic institutions and studying caregivers.'

In [67]:
''.join(abstracts2)

"Tobacco use commonly starts during adolescence and is the leading cause of preventable disease, disability, and death in the United States. Secondhand smoke (SHS) exposure increases asthma and respiratory infection hospitalizations and contributes to sudden unexpected infant death. Few pediatric hospitalist-led smoking cessation studies are formal quality improvement (QI), with most at academic institutions and studying caregivers.To increase SHS exposure/tobacco use screening, smoking cessation discharge instructions, and Smokers' Helpline referrals for community hospital pediatric patients/caregivers through QI.All pediatric, newborn, and NICU admissions were eligible. The baseline period was December 2019 through November 2020 and intervention period December 2020 through June 2021. Interventions included hospitalist education, standardizing documentation, visual reminders, and Helpline wallet cards. The primary measure was monthly percentage of patients screened for SHS exposure/t