In [1]:
# Архив Baseline: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
# Aрхив Daily Updates: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/

# Загрузим экземпляр для примера
!curl -O https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n1219.xml.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.2M  100 29.2M    0     0  5146k      0  0:00:05  0:00:05 --:--:-- 6187k


In [2]:
import pubmed_parser as pp
import pandas as pd
import scispacy
import spacy

In [3]:
dicts_out = pp.parse_medline_xml('pubmed24n1219.xml.gz')

In [4]:
df = pd.DataFrame(dicts_out)
df = df[df['abstract'] != ''][['title', 'abstract', 'pmid', 'pubdate']]
# Выведем некоторые метаданные для первых 5 записей попавших в файл с непустым abstract
df[:5]

Unnamed: 0,title,abstract,pmid,pubdate
0,GRP94 in cerebrospinal fluid may contribute to...,The present study was designed to investigate ...,38081093,2023
1,The application of ferritin in transporting an...,The ferritin cage can not only load iron ions ...,38081094,2023
2,Design and characterization of tannic acid/ε-p...,The shelf life of beef is shortened by microbi...,38081095,2023
3,"A comprehensive ""quality-quantity-activity"" ap...",Spices have long been popular worldwide. Besid...,38081096,2023
4,A novel colorimetric and fluorometric dual-sig...,Nanozymes were nanomaterials with enzymatic pr...,38081097,2023


In [5]:
# Выведем abstract для первой записи из набора
df['abstract'][0]

'The present study was designed to investigate potential biomarkers of depression and targets of antidepressants from the perspective of hippocampal endoplasmic reticulum stress (ERS) based on cerebrospinal fluid (CSF) proteomics. Firstly, a six-week depression model was established and treated with fluoxetine (FLX). We found antidepressant-FLX could ameliorate depression-like behaviors and cognition in depressed rats caused by chronic unpredictable mild stress (CUMS). FLX significantly increased neuronal numbers in dentate gyrus (DG) and CA3 regions of hippocampus. CSF proteome data revealed thirty-seven differentially expressed proteins (DEPs) co-regulated by CUMS and FLX, including GRP94 and EIF2α. Results of Gene Oncology (GO) annotation and KEGG pathway enrichment for DEPs mainly included PERK-mediated unfolded protein response, endoplasmic reticulum, and translational initiation. The expression levels of GRP94, p-PERK, p-EIF2α, CHOP and Caspase-12 were increased in hippocampus of

In [7]:
# Полезное по scispacy: https://towardsdatascience.com/using-scispacy-for-named-entity-recognition-785389e7918d

nlp = spacy.load("en_ner_jnlpba_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [8]:
%%time
# Функция для извлечения именованных сущностей - в нашем случае - протеины
def extract_proteins(text):
    doc = nlp(text)
    proteins = [ent.text for ent in doc.ents if ent.label_ == "PROTEIN"]
    return list(set(proteins))

df = df[:100]
# Запустим процесс извлечения для 100 первых публикаций из архива
df['proteins'] = df['abstract'].apply(extract_proteins)
# Отобразим те публикации для которых что-то было найдено
df[df['proteins'].apply(len) > 0]

CPU times: user 2.49 s, sys: 6.98 ms, total: 2.5 s
Wall time: 2.5 s


Unnamed: 0,title,abstract,pmid,pubdate,proteins
0,GRP94 in cerebrospinal fluid may contribute to...,The present study was designed to investigate ...,38081093,2023,"[PERK, antidepressant-FLX, GRP94, PERK-mediate..."
1,The application of ferritin in transporting an...,The ferritin cage can not only load iron ions ...,38081094,2023,"[Ferritin, ferritin]"
8,Early-stage diagnosis of bladder cancer using ...,Early diagnosis and accurate assessment of tum...,38081101,2023,"[SERS, nano-biomarkers]"
9,Discovery of a novel pyroptosis inhibitor acti...,Pyroptosis is a proinflammatory type of regula...,38081102,2023,"[N77, NLRP3]"
10,Interleukin-10 gene intervention ameliorates l...,BACKGROUND AND AIMS\nInterleukin 10 (IL-10) an...,38081103,2023,"[serum transaminase, chemokine, CD107a, NKG2D,..."
12,Improvement of transdermal absorption rate by ...,Nonthermal biocompatible plasma (NBP) is a pro...,38081105,2023,[NBP]
15,Integrin αV mediated activation of myofibrobla...,BACKGROUND\nMyocardial infarction (MI) dramati...,38081108,2023,"[Integrins, extracellular matrix (ECM) related..."
16,Comparative survey of coordinated regulation o...,Hypothalamic-Pituitary-Somatotropic (HPS) axis...,38081109,2023,"[igf1, ssts, igf1ra, igf1rb, Functional domain]"
17,Olfactory dysfunction and training in children...,OBJECTIVE\nPostviral olfactory dysfunction (OD...,38081110,2023,"[COVID-19 OD, UPSIT, OT]"
18,Dissolution behavior of ionic liquids for diff...,Lignin is regarded as a potential solution for...,38081111,2023,"[nanocellulose/lignin, cellulose-based products]"
