In [2]:
# Imports
from bs4 import BeautifulSoup

In [312]:
# Parse a single file
def parse_file(filename):
    """Parse a TEI XML article file into a nested dictionary.

    The document is parsed with BeautifulSoup's "lxml" parser, so every tag
    name is looked up in lower case (``teiheader``, ``biblscope``, ...).

    Parameters
    ----------
    filename : str
        Path of the TEI XML file to read.

    Returns
    -------
    dict
        ``"title"`` (only when a title is present), ``"properties"``
        (``"type"``, ``"authors"`` and, when present, ``"publisher"``,
        ``"journal"`` and ``"doi"``) and ``"sections"`` (the abstract, when
        present, followed by one entry per ``<div>`` of the text body).

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    AttributeError
        If the document has no ``teiHeader``/``fileDesc`` element, or no
        ``TEI`` root element.
    """
    # BeautifulSoup consumes the whole handle in its constructor, so the file
    # can (and should) be closed as soon as the soup has been built.
    with open(filename, 'rb') as handle:
        soup = BeautifulSoup(handle, "lxml")

    # Create the article object to hold the parsed information
    article = {}

    #
    # Header Information
    #
    header = soup.teiheader
    filedesc = header.filedesc

    # Parse the title
    titlestmt = filedesc.titlestmt
    if titlestmt is not None and titlestmt.title is not None:
        article["title"] = titlestmt.title.getText()

    properties = article["properties"] = {}
    properties["type"] = "article"

    # Parse the publisher information: one entry per direct child, keyed by
    # the child's tag name.
    publicationstmt = filedesc.publicationstmt
    if publicationstmt is not None:
        properties["publisher"] = dict(
            (item.name, item.getText())
            for item in publicationstmt.findChildren(recursive=False)
        )

    #
    # Authors, journal and DOI all live under sourceDesc
    #
    properties["authors"] = []
    sourcedesc = filedesc.sourcedesc
    if sourcedesc is not None:
        properties["authors"] = [
            _parse_author(item) for item in sourcedesc.findAll('author')
        ]

        if sourcedesc.monogr is not None:
            properties["journal"] = _parse_journal(sourcedesc.monogr)

        # Grab the DOI. Search for the idno typed "DOI" explicitly rather
        # than only inspecting the first idno, which may be another
        # identifier or carry no type attribute at all.
        doi = sourcedesc.find('idno', attrs={'type': 'DOI'})
        if doi is not None:
            properties["doi"] = doi.getText()

    #
    # Parse through the sections
    #
    sections = article["sections"] = []

    # Parse the abstract should it exist
    profiledesc = header.profiledesc
    abstract = profiledesc.abstract if profiledesc is not None else None
    if abstract is not None:
        sections.append({
            "title": "Abstract",
            "type": "abstract",
            "number": 0,
            "paragraphs": [abstract.getText()],
            "references": [],
        })

    # Parse the body. ``find('text')`` is required because ``.text`` on a
    # Tag is the text-content property, not a child lookup.
    body = soup.tei.find('text')
    divs = body.findAll('div') if body is not None else []
    for div in divs:
        sections.append({
            "title": _section_title(div),
            "paragraphs": [p.getText() for p in div.findAll('p')],
        })

    return article


def _parse_author(author_tag):
    """Build the name/affiliation dictionary for one ``<author>`` element."""
    author = {'name': {}}

    # Names. An author element may carry no persName at all, in which case
    # the name dictionary simply stays empty.
    persname = author_tag.persname
    if persname is not None:
        # Forenames are keyed by their "type" attribute; fall back to the
        # tag name when the attribute is absent instead of raising KeyError.
        for forename in persname.findAll('forename'):
            author["name"][forename.get('type', forename.name)] = forename.getText()

        if persname.surname is not None:
            author["name"]["surname"] = persname.surname.getText()

        if persname.rolename is not None:
            author["name"]["rolename"] = persname.rolename.getText()

    # Affiliations
    affiliation = author_tag.affiliation
    if affiliation is not None:
        details = {}

        # Affiliation components, keyed like the forenames above.
        for org in affiliation.findAll('orgname'):
            details[org.get('type', org.name)] = org.getText()

        # Address of org: one entry per descendant element, keyed by tag name.
        if affiliation.address is not None:
            details["address"] = dict(
                (part.name, part.getText())
                for part in affiliation.address.findAll()
            )

        author['affiliation'] = details

    return author


def _parse_journal(monogr):
    """Build the journal dictionary (title, imprint details) from ``<monogr>``."""
    journal = {}
    if monogr.title is not None:
        journal["title"] = monogr.title.getText()

    # Grab journal publisher information
    imprint = monogr.imprint
    if imprint is None:
        return journal

    # This bit is a bit brittle: the imprint mixes several kinds of children
    # that are told apart only by tag name and attributes.
    for item in imprint.findChildren():

        # Publisher
        if item.name == "publisher":
            journal["publisher"] = imprint.publisher.getText()

        # Parse Date, keyed by its "type" attribute (tag name when untyped)
        elif item.name == "date":
            journal[item.get('type', item.name)] = item.getText()

        # Journal bibliographic information
        elif item.name == "biblscope":
            unit = item.get('unit')
            # Page range: record whichever of the "from"/"to" attributes is
            # present. ``get`` returns None for a missing attribute, whereas
            # subscripting would raise KeyError.
            if unit == "page":
                page = {}
                for bound in ("from", "to"):
                    value = item.get(bound)
                    if value is not None:
                        page[bound] = value
                journal["page"] = page
            # Volume
            elif unit == "volume":
                journal["volume"] = item.getText()

        # Print out other possibilities to prompt inclusion when encountered
        else:
            print(item, "1")

    return journal


def _section_title(div):
    """Guess a section title from the raw text of a body ``<div>``."""
    # This heuristic is super brittle: soup can't find the head tags defined
    # in the TEI xml (presumably because the HTML-oriented "lxml" parser
    # treats <head> specially -- TODO confirm), so the title is taken to be
    # whatever follows the last tab on the second line of the div's text.
    lines = div.get_text().split('\n')
    if len(lines) < 2:
        # No second line to read a title from.
        return ""
    return lines[1].split('\t')[-1]

In [313]:
# TEI XML file to parse; a bare file name, so it is resolved against the
# current working directory when parse_file opens it.
filename = "The Knowledge Engineering Review 2006 RISSLAND.tei.xml"
# Parse the file into the nested article dictionary returned by parse_file.
article = parse_file(filename)

# Dump the complete parsed structure for inspection.
print(article)

{'sections': [{'references': [], 'paragraphs': [u'\nA primary research stream that contributed to the birth of case-based reasoning (CBR) was Artificial Intelligence and Law. Since law is largely about cases, it is a particularly interesting domain for CBR researchers. This article surveys some of the historically significant systems and developments in this field.\n'], 'type': 'abstract', 'number': 0, 'title': 'Abstract'}, {'paragraphs': [u"Significantly for CBR, reasoning with hypotheticals in law is related to reasoning with examples in other domains like mathematics and to adapting exemplars to solve problems. In her doctoral research, Rissland created a model of mathematical knowledge and understanding that gave a prominent role to concrete and prototype examples and their interconnections\u2014so-called Examples-space\u2014in the network of mathematical knowledge (Rissland, 1977, 1978). In the early 1980s, she studied how one generates examples that satisfy certain desiderata for