In [40]:
import json

In [3]:
# Load the article into a dictionary in a single, re-runnable cell.
# The `with` block closes the file handle as soon as parsing has finished
# (previously the handle from open() was never closed).
# NOTE: the parsed dictionary is still bound to the name `json` because every
# later cell indexes `json[...]`. That shadows the `json` module, so the module
# is imported here under an alias; this keeps the cell working when it is run
# more than once instead of failing with "'dict' object has no attribute 'load'".
import json as json_lib
with open("json/data/0a00a6df208e068e7aa369fb94641434ea0e6070.json", "rb") as fname:
    json = json_lib.load(fname)

In [6]:
# List the top-level keys of the parsed article.
json.keys()
# this tells us everything we have access to in the object
# the json article document at its top level has 7 different accessible keys.

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [15]:
# the paper_id key gives us the unique identifier of the particular article
# the file on disk is named after this identifier, with '.json' appended
json['paper_id']

'0a00a6df208e068e7aa369fb94641434ea0e6070'

In [17]:
# 'metadata' holds the article's title and its list of authors.
# Show the available keys first, then the full metadata dictionary, one per line.
print(json['metadata'].keys(), json['metadata'], sep="\n")

dict_keys(['title', 'authors'])
{'title': 'BMC Genomics Novel genome polymorphisms in BCG vaccine strains and impact on efficacy', 'authors': [{'first': 'Andrea', 'middle': ['S'], 'last': 'Leung', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Toronto', 'location': {}}, 'email': ''}, {'first': 'Vanessa', 'middle': [], 'last': 'Tran', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Toronto', 'location': {}}, 'email': 'vanessa.tran@utoronto.ca'}, {'first': 'Zuowei', 'middle': [], 'last': 'Wu', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Chinese Academy of Sciences', 'location': {'settlement': 'Beijing', 'region': 'PR China'}}, 'email': ''}, {'first': 'Xuping', 'middle': [], 'last': 'Yu', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Zhejiang University', 'location': {'settlement': 'Hangzhou', 'country': 'PR China'}}, 'email': 'xpyu@zju.edu.cn'}, {'first': 'David', 'middle': ['C'], 'last': '

In [20]:
# the abstract key gives you access to the abstract text and any citations
# like the body_text, this is a list of dictionaries, even if the length of the list is often only 1.
# NOTE(review): the saved output below shows the whole list, so it appears to predate
# the [0] index used here -- re-run the cell to refresh it.
json['abstract'][0]   # [0] takes the first paragraph only; however, we should not assume
                      # that an abstract will only ever be a single paragraph -- code that
                      # processes abstracts should account for the possibility of many

[{'text': 'Bacille Calmette-Guérin (BCG) is an attenuated strain of Mycobacterium bovis currently used as a vaccine against tuberculosis. Global distribution and propagation of BCG has contributed to the in vitro evolution of the vaccine strain and is thought to partially account for the different outcomes of BCG vaccine trials. Previous efforts by several molecular techniques effectively identified large sequence polymorphisms among BCG daughter strains, but lacked the resolution to identify smaller changes. In this study, we have used a NimbleGen tiling array for whole genome comparison of 13 BCG strains. Using this approach, in tandem with DNA resequencing, we have identified six novel large sequence polymorphisms including four deletions and two duplications in specific BCG strains. Moreover, we have uncovered various polymorphisms in the phoP-phoR locus. Importantly, these polymorphisms affect genes encoding established virulence factors including cell wall complex lipids, ESX sec

In [23]:
# the body_text key gives the same text/citation/reference/section information as
# json['abstract']: a list with one dictionary per paragraph
json['body_text'][0] # grab individual paragraphs from within the "body text"

{'text': 'Bacille Calmette-Guérin (BCG) is an attenuated strain of Mycobacterium bovis and is the only available vaccine against tuberculosis (TB). Since 1974, BCG vaccination has been included in the World Health Organization (WHO) Expanded Program on Immunization. It is estimated that more than 3 billion individuals have been immunized with BCG and over 100 million doses of BCG are administered annually. Multiple studies have confirmed that BCG is generally safe and can protect children against disseminated disease, including tuberculosis meningitis [1, 2] . BCG also provides cross-protection against leprosy [3] . However, the success of BCG against pulmonary TB in adults is still debated, since randomized clini-cal trials have reported protection efficacy ranging from 0-80% [4, 5] . Several hypotheses for the variation in observed efficacy have been proposed [6] [7] [8] [9] .',
 'cite_spans': [{'start': 548, 'end': 551, 'text': '[1,', 'ref_id': 'BIBREF0'},
  {'start': 552, 'end': 55

In [27]:
# the bib_entries key gives information about the bibliography. Each item within is another dictionary
# json['bib_entries']   # access the entire dict object (more dictionaries)
json['bib_entries']['BIBREF0']     # each bibliographical entry is its own dictionary
# for each entry you can access ['ref_id', 'title', 'authors', 'year', 'venue', 'volume', 'issn', 'pages', 'other_ids']

{'ref_id': 'b0',
 'title': 'The efficacy of bacillus Calmette-Guerin vaccination of newborns and infants in the prevention of tuberculosis: meta-analyses of the published literature',
 'authors': [{'first': 'G', 'middle': ['A'], 'last': 'Colditz', 'suffix': ''},
  {'first': 'C', 'middle': ['S'], 'last': 'Berkey', 'suffix': ''},
  {'first': 'F', 'middle': [], 'last': 'Mosteller', 'suffix': ''},
  {'first': 'T', 'middle': ['F'], 'last': 'Brewer', 'suffix': ''},
  {'first': 'M', 'middle': ['E'], 'last': 'Wilson', 'suffix': ''},
  {'first': 'E', 'middle': [], 'last': 'Burdick', 'suffix': ''},
  {'first': 'H', 'middle': ['V'], 'last': 'Fineberg', 'suffix': ''}],
 'year': 1995,
 'venue': 'Pediatrics',
 'volume': '96',
 'issn': '',
 'pages': '29--35',
 'other_ids': {}}

In [29]:
# the authors key within each bib entry is a list of dictionaries of names,
# one dictionary per author (here with 'first', 'middle', 'last' and 'suffix').

json['bib_entries']['BIBREF0']['authors']

[{'first': 'G', 'middle': ['A'], 'last': 'Colditz', 'suffix': ''},
 {'first': 'C', 'middle': ['S'], 'last': 'Berkey', 'suffix': ''},
 {'first': 'F', 'middle': [], 'last': 'Mosteller', 'suffix': ''},
 {'first': 'T', 'middle': ['F'], 'last': 'Brewer', 'suffix': ''},
 {'first': 'M', 'middle': ['E'], 'last': 'Wilson', 'suffix': ''},
 {'first': 'E', 'middle': [], 'last': 'Burdick', 'suffix': ''},
 {'first': 'H', 'middle': ['V'], 'last': 'Fineberg', 'suffix': ''}]

In [32]:
json['bib_entries']['BIBREF0']['authors'][0]['middle'][0]
# Note the nesting: The letter 'A' here is the first member of a LIST (of middle initials), 
# inside a DICTIONARY (of name parts), inside a LIST (of authors), inside a DICTIONARY 
# (constituting a single bibliographical reference), inside a larger DICTIONARY (of all the 
# bibliographical references), inside a final DICTIONARY (json) at the very top level.

'A'

In [38]:
# It's hard to say conclusively without the paper in front of me, but 'ref_entries'
# appears to be the section describing the article's figures and tables. It looks
# structured like bib_entries, with each figure/table stored as its own dictionary.
# The ordering seems to be figures first, then tables... alphabetical?
# Print the entry ids, then one figure entry and one table entry, one per line.
print(
    json['ref_entries'].keys(),
    json['ref_entries']['FIGREF0'],
    json['ref_entries']['TABREF0'],
    sep="\n",
)
# within each entry there appears to be ['text', 'latex', 'type'], plus 'html' for tables

dict_keys(['FIGREF0', 'TABREF0', 'TABREF2', 'TABREF3', 'TABREF4'])
{'text': 'IS6110 insertion in the phoP promoter in BCG-Russia, -Moreau, and -Japan. (A) Schematic representation of the phoP-phoR locus with IS6110 inserted in an inverse orientation 18 bp upstream from phoP start codon. (B) Nucleotide sequence surrounding IS6110. The IS6110 sequence is boxed. The GAA direct repeats flanking the IS6110 insertion site is underlined and in boldface. The ATG start codons of phoP and phoR are indicated by arrows and in boldface.', 'latex': None, 'type': 'figure'}
{'text': 'Novel deletions and duplications determined in current study.', 'latex': None, 'type': 'table', 'html': '<html><body><table><tr><td>BCG strains </td><td>Polymorphisms </td><td>Start </td><td>End </td><td>Size (bp) </td><td>Genes affected\n</td></tr><tr><td>Moreau </td><td>Deletion </td><td>3244503 </td><td>3245478 </td><td>975 </td><td>fadD26, ppsA\n</td></tr><tr><td>\xa0</td><td>\xa0</td><td>4370517 </td><td>4371645 </td

In [39]:
# Again, in the same format as both the abstract and body_text, we have a list of dictionaries
# each containing text, cite and ref info, and section title. For this article the list has a
# single entry (the acknowledgements).
json['back_matter']

[{'text': 'This work was supported by an award from the National Natural Science Foundation of China (NSFC) (to JL), and research grants from Canadian Institutes of Health Research (CIHR) (MOP-15107 and MOP-82772 to JL), and a grant (Z0005190043521) from Beijing Municipal Science And Technology Commission to (BZ).',
  'cite_spans': [],
  'ref_spans': [],
  'section': 'Acknowledgements'}]

In [None]:
# I hope this helps demonstrate the ways of accessing the different data within the CORD-19 dataset.
# The next step is a function that iterates over the text of each article and searches it for keywords (as a start).