In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import data_platform.datasource as ds
from data_platform.config import ConfigManager
from data_platform.datasource.science_direct import ScienceDirectDS, ScienceDirectFactory

In [3]:
from pathlib import Path
import os

In [4]:
# Locate the sample XML articles relative to the current working directory:
#   <cwd>/data/unprocessed_articles_xml
current_path = Path(os.getcwd())
data_path = current_path.joinpath('data')
xml_path = data_path.joinpath('unprocessed_articles_xml')

## 构造配置文件

In [5]:
# Minimal configuration: the "init" section carries a single "location" entry
# pointing at the XML directory (or file) that the data source should read.
config = ConfigManager({"init": {"location": xml_path}})

## 解析 XML 文件夹或文件

In [6]:
# Build the ScienceDirect data source from the configuration; the
# "Parsing file ... error" lines in this cell's output are printed during
# construction.
# NOTE(review): this rebinds `ds`, which the import cell already uses as the
# alias for the `data_platform.datasource` module. Re-running the import cell
# after this one points `ds` back at the module, so the later `ds.read_doc()` /
# `ds.read_docset()` calls would no longer reach this instance. Consider a
# distinct name for the instance.
ds = ScienceDirectDS(config)

Parsing file 10.xml error:  There is no article.
Parsing file 109.xml error:  There is no article.
Parsing file 111.xml error:  There is no article.
Parsing file 116.xml error:  There is no article.
Parsing file 117.xml error:  There is no body.
Parsing file 119.xml error:  There is no body.
Parsing file 12.xml error:  There is no article.
Parsing file 122.xml error:  There is no article.
Parsing file 125.xml error:  There is no article.
Parsing file 126.xml error:  There is no article.
Parsing file 129.xml error:  There is no article.
Parsing file 136.xml error:  There is no article.
Parsing file 140.xml error:  There is no article.
Parsing file 142.xml error:  There is no body.
Parsing file 145.xml error:  There is no article.
Parsing file 146.xml error:  There is no article.
Parsing file 150.xml error:  There is no article.
Parsing file 158.xml error:  There is no body.
Parsing file 160.xml error:  There is no tail.
Parsing file 161.xml error:  There is no article.
Parsing file 162.

In [7]:
# Read the documents from the data source. Judging by the outputs below, the
# result is a mapping from DocKeyPair(docset_name, doc_name) to a nested dict
# describing the document.
doc_dict = ds.read_doc()

In [8]:
# Number of documents returned by read_doc().
len(doc_dict)

141

In [9]:
# Peek at the first (key, value) entry only, instead of displaying the whole
# mapping.
next(iter(doc_dict.items()))

(DocKeyPair(docset_name='_default', doc_name='97'),
 {'root': {'section_list': [{'subsections': [],
     'paragraphs': [{'inline_list': [{'type': 'text',
         'content': 'Introduction',
         'init_args': (),
         'init_kwargs': {}}],
       'init_args': (),
       'init_kwargs': {'id': 'st020'}},
      {'inline_list': [{'type': 'text',
         'content': 'Attention allocation, updating working memory, and language processing are interdependent cognitive tasks related to the focused direction of limited resources, refreshing and substituting information in the current focus of attention, and receiving/sending verbal communication, respectively. These systems are probably impossible to study in isolation, which makes interpretation of findings related to any one of them challenging. For example, we know that children with an unexpected and disproportionate impairment in language development known as specific language impairment (SLI) show deficits relative to age-matched pee

### 使用 `read_docset` 将输出转换为 DocumentSet

In [10]:
# Read the same source as a document set rather than plain dicts
# (presumably a DocumentSet, per the section heading — confirm against the
# data_platform API).
doc_set = ds.read_docset()

In [11]:
# Number of documents in the set.
len(doc_set)

141

## 文档集可迭代

In [12]:
# Take the first (key, document) pair from the document set.
# NOTE(review): the key is probably a (docset_name, doc_name) pair rather than
# a bare name — the set is later indexed with ('_default', '68') — so
# `doc_name` may be a misleading variable name; confirm.
doc_name, doc = next(iter(doc_set.items()))

In [13]:
# Print the document's text as returned by get_text().
print(doc.get_text())

Introduction
Attention allocation, updating working memory, and language processing are interdependent cognitive tasks related to the focused direction of limited resources, refreshing and substituting information in the current focus of attention, and receiving/sending verbal communication, respectively. These systems are probably impossible to study in isolation, which makes interpretation of findings related to any one of them challenging. For example, we know that children with an unexpected and disproportionate impairment in language development known as specific language impairment (SLI) show deficits relative to age-matched peers on measures of attention ([Finneran, Francis, & Leonard, 2009) and working memory ([Archibald & Gathercole, 2007). Despite efforts to employ nonlinguistic tasks in these studies, the interdependency of the cognitive systems supporting working memory, attention, and language processing makes it difficult to rule out entirely explanations related to the l

## 分节

In [14]:
# Look up one specific document by its (docset_name, doc_name) key.
# NOTE: this rebinds `doc` (previously the first document of the set); every
# cell below works on this document.
doc = doc_set[('_default', '68')]

In [15]:
# List the document's Section objects.
doc.get_sections()

[<data_platform.document.Section at 0x1d23648d828>,
 <data_platform.document.Section at 0x1d2364951d0>,
 <data_platform.document.Section at 0x1d2364959e8>,
 <data_platform.document.Section at 0x1d23649f400>]

In [16]:
# Integer-index the document to get its first section, then show the keyword
# arguments it was built with (title, id, view in the output below).
# NOTE(review): `_init_kwargs` is a private attribute; used here only for
# inspection.
sec = doc[0]
sec._init_kwargs

{'section_title': 'Materials and Methods', 'id': 'sec1', 'view': 'all'}

In [17]:
# Print the section's text; the section title appears as the first line.
print(sec.get_text())

Materials and Methods
After Institutional Review Board approval, data were extracted from the Richard L. Roudebush Veterans Administration Medical Center in Indianapolis, Indiana. The data for this study were obtained from an ongoing study of first-time colonoscopy. All data extracted were in electronic format. Inclusion and exclusion criteria were previously selected, and the test characteristics were not explicitly outlined for this study and were applied by using custom fully electronic software as part of the parent study design.
Inclusion criteria for the parent study cohort included all veterans aged 40 years and older who had an index outpatient colonoscopy between 2002 and 2009 for any indication.
Exclusion criteria for the cohort included (1) previous Veterans Health Administration–based colonoscopy, (2) indication for colonoscopy of surveillance for neoplasia, (3) surgical resection of any part of the large intestine, (4) history of polyps or cancer of the colon or rectum, (5

## 段落

In [18]:
# Index the section to get its first element. In this document that is the
# title line ("Materials and Methods"), which carries no init kwargs.
para1 = sec[0]
para1._init_kwargs

{}

In [19]:
# Print the text of the first element (the section title).
print(para1.get_text())

Materials and Methods


In [20]:
# The second element is the first body paragraph; its init kwargs carry the
# paragraph id and view shown in the output below.
para2 = sec[1]
para2._init_kwargs

{'id': 'p0035', 'view': 'all'}

In [21]:
# Print the text of the first body paragraph.
print(para2.get_text())

After Institutional Review Board approval, data were extracted from the Richard L. Roudebush Veterans Administration Medical Center in Indianapolis, Indiana. The data for this study were obtained from an ongoing study of first-time colonoscopy. All data extracted were in electronic format. Inclusion and exclusion criteria were previously selected, and the test characteristics were not explicitly outlined for this study and were applied by using custom fully electronic software as part of the parent study design.


## 行内元素

In [22]:
# Indexing a paragraph yields its inline elements; element 0 of paragraph 4 is
# a plain Text node.
type(sec[4][0])

data_platform.document.Text

In [23]:
# Print the content of that Text element.
print(sec[4][0].get_text())

The extracted reports were linked as part of the parent study by using study-specific software to their corresponding pathology reports and de-identified for NLP analysis. There were 10,798 reports, with 6379 linked to pathology. Five hundred of the reports with linked pathology were randomly selected by using MySQL random record selection for triplicate manual annotation.


In [24]:
# Element 1 of the same paragraph is a Tag (an inline markup element).
type(sec[4][1])

data_platform.document.Tag

In [25]:
# A Tag also supports get_text(); here it prints the tag's display text.
print(sec[4][1].get_text())

[Figure 1


In [26]:
# Show the three fields a Tag exposes: the tag name, its display text and its
# attribute dict.
# NOTE(review): tag_text prints "[Figure 1" with no closing bracket — confirm
# whether that is intended by the parser.
cross_ref = sec[4][1]
print(cross_ref.tag_name)
print(cross_ref.tag_text)
print(cross_ref.tag_attr)

cross-ref
[Figure 1
{'refid': ['fig1']}


## 引文数据

In [27]:
# Metadata objects attached to the document, keyed by name
# ('coredata', 'bib2para', 'references' for this document).
doc.metadatas

{'coredata': <data_platform.document.MetaData at 0x1d23649f550>,
 'bib2para': <data_platform.document.MetaData at 0x1d23649f588>,
 'references': <data_platform.document.MetaData at 0x1d23649f5c0>}

In [28]:
# The 'references' metadata holds the bibliography; `meta_dict` exposes it as
# a nested dict with one entry per reference id (bib1, bib2, ...).
ref = doc.metadatas['references']
ref.meta_dict

{'id': 'bibl0005',
 'view': 'all',
 'section-title': {'text': 'References'},
 'bibbliography-section': {'id': 'bibs0005',
  'references': {'bib1': {'id': 'bib1',
    'label': '1',
    'authors': [{'given-name': 'L.C.', 'surname': 'Seeff'},
     {'given-name': 'T.B.', 'surname': 'Richards'},
     {'given-name': 'J.A.', 'surname': 'Shapiro'}],
    'title': {'maintitle': 'How many endoscopies are performed for colorectal cancer screening?',
     'subtitle': "Results from CDC's survey of endoscopic capacity"},
    'series': '',
    'date': '2004',
    'pages': {'first-page': '1670', 'last-page': '1677'}},
   'bib2': {'id': 'bib2',
    'label': '2',
    'authors': [{'given-name': 'C.J.', 'surname': 'Kahi'},
     {'given-name': 'T.F.', 'surname': 'Imperiale'}],
    'title': {'maintitle': 'Flexible sigmoidoscopy screening reduced colorectal cancer incidence and mortality in older adults'},
    'series': '',
    'date': '2012'},
   'bib3': {'id': 'bib3',
    'label': '3',
    'authors': [{'giv

In [29]:
# 'bib2para' maps each cross-referenced id (bibliography, figure or table ids
# in the output below) to the list of paragraph paths associated with it —
# presumably the paragraphs that cite it; confirm against the parser.
bib2para = doc.metadatas['bib2para']
bib2para.meta_dict

{'fig1': ['/root/sec_0/para_4'],
 'bib20': ['/root/sec_0/para_6'],
 'bib21': ['/root/sec_0/para_6'],
 'bib22': ['/root/sec_0/para_6'],
 'bib23': ['/root/sec_0/para_6'],
 'bib24': ['/root/sec_0/para_6'],
 'bib25': ['/root/sec_0/para_12'],
 'tbl1': ['/root/sec_1/para_2'],
 'tbl2': ['/root/sec_1/para_3',
  '/root/sec_1/para_4',
  '/root/sec_1/para_5',
  '/root/sec_1/para_6'],
 'fig2': ['/root/sec_1/para_8'],
 'bib26': ['/root/sec_2/para_1'],
 'bib27': ['/root/sec_2/para_1'],
 'bib28': ['/root/sec_2/para_1',
  '/root/sec_2/para_1',
  '/root/sec_2/para_2',
  '/root/sec_2/para_8'],
 'bib29': ['/root/sec_2/para_1'],
 'bib30': ['/root/sec_2/para_1', '/root/sec_2/para_3', '/root/sec_2/para_8'],
 'bib31': ['/root/sec_2/para_4'],
 'bib32': ['/root/sec_2/para_4'],
 'bib33': ['/root/sec_2/para_4'],
 'bib16': ['/root/sec_2/para_7']}

## 从 paragraph id 反查段落

In [31]:
# Mapping from paragraph path (e.g. '/root/sec_0/para_4') to its Paragraph
# object, so the paths listed in bib2para can be resolved to paragraphs.
paraid_map = doc.get_paragraphs()
paraid_map

{'/root/sec_0/para_0': <data_platform.document.Paragraph at 0x1d23648d860>,
 '/root/sec_0/para_1': <data_platform.document.Paragraph at 0x1d23648d8d0>,
 '/root/sec_0/para_2': <data_platform.document.Paragraph at 0x1d23648d940>,
 '/root/sec_0/para_3': <data_platform.document.Paragraph at 0x1d23648d9b0>,
 '/root/sec_0/para_4': <data_platform.document.Paragraph at 0x1d23648da20>,
 '/root/sec_0/para_5': <data_platform.document.Paragraph at 0x1d23648db70>,
 '/root/sec_0/para_6': <data_platform.document.Paragraph at 0x1d23648dbe0>,
 '/root/sec_0/para_7': <data_platform.document.Paragraph at 0x1d23648de10>,
 '/root/sec_0/para_8': <data_platform.document.Paragraph at 0x1d23648de80>,
 '/root/sec_0/para_9': <data_platform.document.Paragraph at 0x1d23648def0>,
 '/root/sec_0/para_10': <data_platform.document.Paragraph at 0x1d23648df60>,
 '/root/sec_0/para_11': <data_platform.document.Paragraph at 0x1d23648dfd0>,
 '/root/sec_0/para_12': <data_platform.document.Paragraph at 0x1d236495080>,
 '/root/s

## 核心元数据

In [32]:
# 'coredata' holds the article-level bibliographic record (URL, DOI, title,
# journal, volume/pages, dates, ...), exposed as a flat dict via `meta_dict`.
coredata = doc.metadatas['coredata']
coredata.meta_dict

{'url': 'https://api.elsevier.com/content/article/pii/S1542356513000104',
 'identifier': 'doi:10.1016/j.cgh.2012.11.035',
 'eid': '1-s2.0-S1542356513000104',
 'doi': '10.1016/j.cgh.2012.11.035',
 'pii': 'S1542-3565(13)00010-4',
 'title': 'Natural Language Processing Accurately Categorizes Findings From Colonoscopy and Pathology Reports',
 'publicationName': 'Clinical Gastroenterology and Hepatology',
 'aggregationType': 'Journal',
 'pubType': 'Original article\n               \n                  Alimentary tract',
 'issn': '15423565',
 'volume': '11',
 'issueIdentifier': '6',
 'startingPage': '689',
 'endingPage': '694',
 'pageRange': '689-694',
 'number': '6',
 'format': 'text/xml',
 'coverDate': '2013-06-30',
 'coverDisplayDate': 'June 2013',
 'copyright': 'Copyright © 2013 AGA Institute. Published by Elsevier Ltd. All rights reserved.',
 'publisher': 'AGA Institute. Published by Elsevier Ltd.',
 'creator': 'Imperiale, Thomas F.',
 'description': 'Background & Aims\n                 