# Retrieve data from the Crossref API and extract items from the data in JSON format

In [1]:
import urllib.request
import json

In [10]:
# Common URL prefix of the Crossref API endpoint for looking up works
base_url = "https://api.crossref.org/works/"
# The DOI of the publication we want to retrieve metadata for
doi = "10.1371/journal.pcbi.1012170"

# Construct the full URL from the general base URL and the specific DOI
full_url = f"{base_url}{doi}"

In [14]:
# Retrieve the data - at this stage it is one very long byte string.
# The "with" statement makes sure the HTTP connection is closed as soon
# as the response has been read.
with urllib.request.urlopen(full_url) as response:
    doi_json_data = response.read()

In [11]:
# Commands starting with "%" are Jupyter/IPython related
# https://ipython.readthedocs.io/en/stable/interactive/magics.html

%who

base_url	 complete_url	 doi	 doi_json_data	 full_url	 json	 urllib	 


In [15]:
print(doi_json_data)

b'{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,6,21]],"date-time":"2024-06-21T00:23:38Z","timestamp":1718929418656},"reference-count":31,"publisher":"Public Library of Science (PLoS)","issue":"6","license":[{"start":{"date-parts":[[2024,6,20]],"date-time":"2024-06-20T00:00:00Z","timestamp":1718841600000},"content-version":"vor","delay-in-days":0,"URL":"http:\\/\\/creativecommons.org\\/licenses\\/by\\/4.0\\/"}],"funder":[{"DOI":"10.13039\\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["460129525"],"id":[{"id":"10.13039\\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["442326535"],"id":[{"id":"10.13039\\/501100001659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["www.ploscompbiol.org"],"crossmark-restriction":false},"short-contain

In [17]:
# Use the function loads from json to read this JSON formatted string and translate
# it into a Python object (a deeply nested dictionary)
doi_data = json.loads(doi_json_data)

In [18]:
doi_data

{'status': 'ok',
 'message-type': 'work',
 'message-version': '1.0.0',
 'message': {'indexed': {'date-parts': [[2024, 6, 21]],
   'date-time': '2024-06-21T00:23:38Z',
   'timestamp': 1718929418656},
  'reference-count': 31,
  'publisher': 'Public Library of Science (PLoS)',
  'issue': '6',
  'license': [{'start': {'date-parts': [[2024, 6, 20]],
     'date-time': '2024-06-20T00:00:00Z',
     'timestamp': 1718841600000},
    'content-version': 'vor',
    'delay-in-days': 0,
    'URL': 'http://creativecommons.org/licenses/by/4.0/'}],
  'funder': [{'DOI': '10.13039/501100001659',
    'name': 'Deutsche Forschungsgemeinschaft',
    'doi-asserted-by': 'publisher',
    'award': ['460129525'],
    'id': [{'id': '10.13039/501100001659',
      'id-type': 'DOI',
      'asserted-by': 'publisher'}]},
   {'DOI': '10.13039/501100001659',
    'name': 'Deutsche Forschungsgemeinschaft',
    'doi-asserted-by': 'publisher',
    'award': ['442326535'],
    'id': [{'id': '10.13039/501100001659',
      'id-ty

In [19]:
type(doi_data)

dict

In [21]:
doi_data.keys()

dict_keys(['status', 'message-type', 'message-version', 'message'])

In [22]:
# The dictionary has the key "message" which points to a value
# that itself is a dictionary
doi_data["message"]

{'indexed': {'date-parts': [[2024, 6, 21]],
  'date-time': '2024-06-21T00:23:38Z',
  'timestamp': 1718929418656},
 'reference-count': 31,
 'publisher': 'Public Library of Science (PLoS)',
 'issue': '6',
 'license': [{'start': {'date-parts': [[2024, 6, 20]],
    'date-time': '2024-06-20T00:00:00Z',
    'timestamp': 1718841600000},
   'content-version': 'vor',
   'delay-in-days': 0,
   'URL': 'http://creativecommons.org/licenses/by/4.0/'}],
 'funder': [{'DOI': '10.13039/501100001659',
   'name': 'Deutsche Forschungsgemeinschaft',
   'doi-asserted-by': 'publisher',
   'award': ['460129525'],
   'id': [{'id': '10.13039/501100001659',
     'id-type': 'DOI',
     'asserted-by': 'publisher'}]},
  {'DOI': '10.13039/501100001659',
   'name': 'Deutsche Forschungsgemeinschaft',
   'doi-asserted-by': 'publisher',
   'award': ['442326535'],
   'id': [{'id': '10.13039/501100001659',
     'id-type': 'DOI',
     'asserted-by': 'publisher'}]}],
 'content-domain': {'domain': ['www.ploscompbiol.org'],
  

In [27]:
print(json.dumps(doi_data, indent=2))

{
  "status": "ok",
  "message-type": "work",
  "message-version": "1.0.0",
  "message": {
    "indexed": {
      "date-parts": [
        [
          2024,
          6,
          21
        ]
      ],
      "date-time": "2024-06-21T00:23:38Z",
      "timestamp": 1718929418656
    },
    "reference-count": 31,
    "publisher": "Public Library of Science (PLoS)",
    "issue": "6",
    "license": [
      {
        "start": {
          "date-parts": [
            [
              2024,
              6,
              20
            ]
          ],
          "date-time": "2024-06-20T00:00:00Z",
          "timestamp": 1718841600000
        },
        "content-version": "vor",
        "delay-in-days": 0,
        "URL": "http://creativecommons.org/licenses/by/4.0/"
      }
    ],
    "funder": [
      {
        "DOI": "10.13039/501100001659",
        "name": "Deutsche Forschungsgemeinschaft",
        "doi-asserted-by": "publisher",
        "award": [
          "460129525"
        ],
        "id":

In [23]:
# We can chain the keys to unpack the nested dictionary. 
doi_data["message"]["short-container-title"]

['PLoS Comput Biol']

In [28]:
# As we get a list as the value of the key "short-container-title" we use the
# index to get the first element ("[0]")
doi_data["message"]["short-container-title"][0]

'PLoS Comput Biol'

In [29]:
doi_data["message"]["title"]

['Ten simple rules for implementing electronic lab notebooks (ELNs)']

In [30]:
doi_data["message"]["title"][0]

'Ten simple rules for implementing electronic lab notebooks (ELNs)'

In [32]:
doi_data["message"]["author"]

[{'ORCID': 'http://orcid.org/0000-0002-9421-8582',
  'authenticated-orcid': True,
  'given': 'Justine',
  'family': 'Vandendorpe',
  'sequence': 'first',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0002-8431-6613',
  'authenticated-orcid': True,
  'given': 'Beatrix',
  'family': 'Adam',
  'sequence': 'additional',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0002-0363-3837',
  'authenticated-orcid': True,
  'given': 'Jeanne',
  'family': 'Wilbrandt',
  'sequence': 'additional',
  'affiliation': []},
 {'given': 'Birte',
  'family': 'Lindstädt',
  'sequence': 'additional',
  'affiliation': []},
 {'given': 'Konrad U.',
  'family': 'Förstner',
  'sequence': 'additional',
  'affiliation': []}]

In [34]:
doi_data["message"]["publisher"]

'Public Library of Science (PLoS)'

In [36]:
doi_data["message"]["publisher"][0]

'P'

In [37]:
# Side note - a string can also be addressed via an index, which returns
# the character at that position - here the first character "A"
# with index 0.
my_name = "Alice"

In [38]:
my_name[0]

'A'

In [33]:
# A list of several DOIs whose metadata we want to retrieve
dois = ["10.1371/journal.pcbi.1004668",
        "10.21105/joss.01035",
        "10.1038/35057062",
        "10.21105/joss.01006"]

In [44]:
# To apply this to several DOIs we can use a
# for loop that iterates through the list
for doi in dois:
    print("- " + doi)
    full_url = base_url + doi
    # The "with" statement closes the HTTP connection once the response
    # has been read, so the loop does not pile up open connections
    with urllib.request.urlopen(full_url) as response:
        doi_json_data = response.read()
    doi_data = json.loads(doi_json_data)
    message = doi_data["message"]
    # Both entries are lists. If a key is missing or its list is empty we
    # fall back to a placeholder instead of failing with a KeyError or
    # IndexError in the middle of the loop
    short_journal_title = (message.get("short-container-title") or ["(no short journal title)"])[0]
    title = (message.get("title") or ["(no title)"])[0]
    print("  - " + short_journal_title)
    print("  - " + title)
print("Done with data extraction")

- 10.1371/journal.pcbi.1004668
  - PLoS Comput Biol
  - A Quick Introduction to Version Control with Git and GitHub
- 10.21105/joss.01035
  - JOSS
  - nasapower: A NASA POWER Global Meteorology, Surface Solar Energy and Climatology Data Client for R
- 10.1038/35057062
  - Nature
  - Initial sequencing and analysis of the human genome
- 10.21105/joss.01006
  - JOSS
  - SeqTools: A python package for easy transformation, combination and evaluation of large datasets.
Done with data extraction
