In [17]:
# Script to read Darwin Core terms from html (Basic Standards Recommendations)
# references TDWG (http://www.tdwg.org/activities/darwincore/)
# The TDWG community's priority is the development of standards 
# for the exchange of biological/biodiversity data

# input: URL for the TDWG index ('http://rs.tdwg.org/dwc/terms/index.htm')
# output: Darwin Core terms as a dictionary, written to a simple JSON file

# BCB, USGS, Tristan Wellman, 4-23-2018

In [22]:
%%html
<style>
/* Let output areas size to their content, capped at max-height. */
.output_wrapper, .output {
    height:auto !important;
    max-height:1000px;  /* input the max-height here */
}
/* Remove the box shadow on elements with the output_scroll class.
   The vendor-prefixed property needs its leading hyphen to be recognised. */
.output_scroll {
    -webkit-box-shadow:none !important;
    box-shadow:none !important;
}
</style>

In [23]:
import collections
import json
import os
from datetime import datetime
from urllib import request
from urllib.parse import urljoin

try:
    from BeautifulSoup import BeautifulSoup, Comment
except ImportError:
    from bs4 import BeautifulSoup, Comment
    

# Scrape the Darwin Core term index, collect every term's fields into an
# ordered dictionary keyed by term name, then write that dictionary to a
# timestamped JSON file and echo it to the screen.

# parse darwin core terms from html
# -----------------------------------
# TERMS_URL is also the base against which relative "Details" links are resolved.
TERMS_URL = 'http://rs.tdwg.org/dwc/terms/index.htm'
with request.urlopen(TERMS_URL) as response:
    # BeautifulSoup reads the whole response in its constructor, so the
    # connection can be closed as soon as parsing is done.
    bs = BeautifulSoup(response, "lxml")


# grab D. Core terminology table in adhoc way
# TDWG html tables currently have no id/name
# -------------------------------------------
TERMS_TABLE_INDEX = 4  # position of the terms table among the page's <table> elements
tables = bs.findAll('table')
if len(tables) <= TERMS_TABLE_INDEX:
    raise IndexError(
        'expected at least %d tables at %s, found %d; the page layout may have changed'
        % (TERMS_TABLE_INDEX + 1, TERMS_URL, len(tables)))
bs_table = tables[TERMS_TABLE_INDEX]


# Process dictionary of Darwin Core terms:
# -----------------------------------------
DC_terms = collections.OrderedDict()
term_name = None  # set once the first "Term Name" row has been seen
for tbody in bs_table.findAll('tbody'):
    for tr in tbody.findAll('tr'):

        # process term name
        ta = tr.find('a')
        # get_text() always returns a str, whereas .string is None when the
        # anchor wraps nested markup
        heading = ta.get_text() if ta else ''
        if 'Term Name' in heading:
            # keep only what follows the last colon
            term_name = heading.split(':')[-1].strip()
            DC_terms[term_name] = collections.OrderedDict()

        # process term information + links
        td = tr.findAll('td')
        # a usable row has a label cell and a value cell, and belongs to a
        # term that has already been opened; anything else is skipped
        if len(td) < 2 or term_name is None:
            continue

        label = td[0].text.replace(':', '')
        DC_terms[term_name][label] = td[1].text + ' '

        # replace details link with url
        href = td[1].find('a')
        if href and td[0].text == 'Details:':
            link = href.get('href')  # None for anchors without an href attribute
            if link:
                # resolve the relative link against the page URL; this is
                # platform independent, unlike os.path.join
                DC_terms[term_name]['Details'] = urljoin(TERMS_URL, link)


# output dictionary of Darwin Core terms to json:
# ------------------------------------------------
datenow = datetime.utcnow().strftime('%Y-%m-%dT%H-%M-%SZ')
dcfname = 'DarwinCore_vocab_' + datenow + '.json'

print('\n**** Writing Darwin Core terms to file and screen: **** \n\n ', dcfname)
# json.dump escapes non-ASCII characters by default, so the bytes written are
# plain ASCII whichever encoding is named here
with open(dcfname, 'w', encoding='utf-8') as outfile:
    json.dump(DC_terms, outfile, indent=4)

print(json.dumps(DC_terms, sort_keys=False, indent=4))


**** Writing Darwin Core terms to file and screen: **** 

  DarwinCore_vocab_2018-04-12T15-08-13Z.json
{
    "Occurrence": {
        "Identifier": "http://rs.tdwg.org/dwc/terms/Occurrence ",
        "Class": " ",
        "Definition": "An existence of an Organism (sensu http://rs.tdwg.org/dwc/terms/Organism) at a particular place at a particular time. ",
        "Comment": "Examples: A wolf pack on the shore of Kluane Lake in 1988. A virus in a plant leaf in a the New York Botanical Garden at 15:29 on 2014-10-23. A fungus in Central Park in the summer of 1929. For discussion see http://terms.tdwg.org/wiki/dwc:Occurrence ",
        "Details": "http://rs.tdwg.org/dwc/terms/history/index.htm#Occurrence-2014-10-23"
    },
    "Organism": {
        "Identifier": "http://rs.tdwg.org/dwc/terms/Organism ",
        "Class": " ",
        "Definition": "A particular organism or defined group of organisms considered to be taxonomically homogeneous. ",
        "Comment": "Instances of the Organism