In [4]:
################################ XML ######################################

# XML is a generalized way of describing hierarchical structured data.

# An XML document is made up of elements, each delimited by a start tag and an end tag.
# First element in every XML document is called the root element. An XML document can only have one root element.

# If an element has no text and no children, you can put a / character at the end of the start tag and skip the end tag altogether.
# <foo>            =>       <foo/>
# </foo>

In [None]:
# XML elements can be declared in different namespaces. Namespaces usually look like URLs.
# An xmlns declaration on an element defines the default namespace for that element and its children.

<feed xmlns='http://www.w3.org/2005/Atom'>  # feed element is in the http://www.w3.org/2005/Atom namespace
  <title>dive into mark</title>             # title element is also in the http://www.w3.org/2005/Atom namespace     
</feed>

# The same document can also be written with an explicit namespace prefix (here 'atom') instead of a default namespace:
<atom:feed xmlns:atom='http://www.w3.org/2005/Atom'> 
  <atom:title>dive into mark</atom:title>             
</atom:feed>

# Namespace + element name = XML identity, so both forms above describe exactly the same elements.

<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'>
# Any element can contain an xml:lang attribute, which declares the language of the element and its children.

In [None]:
import xml.etree.ElementTree as etree

# parse() reads the file and returns an object which represents the entire document.
tree = etree.parse('feed.xml')
# getroot() returns the root element of the document (for this file, the Atom feed element).
root = tree.getroot()
# Bare last expression: the notebook displays the root element.
root

<Element '{http://www.w3.org/2005/Atom}feed' at 0x00000146AB2DDFD0>

In [5]:
# In the ElementTree API an element behaves like a list whose items are the
# element's children, so len() reports how many children it has.
#
# Show the root's tag and its child count, one value per line.
print(root.tag, len(root), sep='\n')

{http://www.w3.org/2005/Atom}feed
9


In [6]:
# Iterating over an element yields its child elements one at a time;
# nested descendants (children of children) are not visited by this loop.
for child in root:
    print(child)

<Element '{http://www.w3.org/2005/Atom}title' at 0x00000146AB2DE2A0>
<Element '{http://www.w3.org/2005/Atom}subtitle' at 0x00000146AB2DFE70>
<Element '{http://www.w3.org/2005/Atom}id' at 0x00000146AB658E00>
<Element '{http://www.w3.org/2005/Atom}updated' at 0x00000146AB658EA0>
<Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB658F40>
<Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB659300>
<Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB6592B0>
<Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB659EE0>
<Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB65A340>


In [8]:
#
# An element's XML attributes are exposed as a dictionary through its .attrib property
#
print(root.attrib)       # attributes of the root element; the xml:lang key shows up in '{namespace}name' form
print(root[4].attrib)    # attributes of the root's fifth child (index 4), which is a link element in this feed

{'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
{'rel': 'alternate', 'type': 'text/html', 'href': 'http://diveintomark.org/'}


In [10]:
#
# Finding specific elements inside XML
#
# The import and parse from the earlier cell are repeated here (presumably so this
# cell can be run on its own). Re-parsing rebinds `tree` and `root` to freshly
# parsed objects.
import xml.etree.ElementTree as etree
tree = etree.parse('feed.xml')

root = tree.getroot()
# findall() on an element returns a list of its direct children that match the query.
# Namespaced tags are written as '{namespace}localname'.
root.findall('{http://www.w3.org/2005/Atom}entry')

[<Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB66F5B0>,
 <Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB66DF30>,
 <Element '{http://www.w3.org/2005/Atom}entry' at 0x00000146AB66D990>]

In [11]:
# The root element's tag, in '{namespace}localname' form.
root.tag

'{http://www.w3.org/2005/Atom}feed'

In [13]:
# findall() on an element only inspects that element's direct children, so both
# queries below come back empty; the two results are printed one per line.
print(
    # The feed element has no child that is itself called feed.
    root.findall('{http://www.w3.org/2005/Atom}feed'),
    # author elements are present in the document, but none is a direct child of feed.
    root.findall('{http://www.w3.org/2005/Atom}author'),
    sep='\n',
)

[]
[]


In [14]:
# The tree object offers findall() as well; for this feed the query yields the
# same number of entry elements as root.findall() did.
entries = tree.findall('{http://www.w3.org/2005/Atom}entry')
# Bare last expression: the notebook displays the number of matches.
len(entries)

3

In [16]:
# find() method returns the first matching element
# NOTE(review): per the ElementTree documentation find() returns None when nothing
# matches, in which case the .text access below would raise AttributeError.

title_element = entries[0].find('{http://www.w3.org/2005/Atom}title')
# .text holds the text content of the element.
title_element.text

'Dive into history, 2009 edition'

In [None]:
#
# To search at all nesting levels rather than only among direct children,
# put './/' in front of the element being searched for
#

all_links = tree.findall('.//{http://www.w3.org/2005/Atom}link')

In [19]:
# Display the matches. The list is longer than the number of link elements that are
# direct children of the feed, because nested link elements are included too.
all_links

[<Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB66D120>,
 <Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB66D8F0>,
 <Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB66EBB0>,
 <Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB66EF70>,
 <Element '{http://www.w3.org/2005/Atom}link' at 0x00000146AB66E200>]

In [20]:
# lxml is an open source third-party library that builds on the popular libxml2 parser.
# It provides an API compatible with ElementTree.
# NOTE(review): these notes originally said "100% compatible" - confirm against
# lxml's own compatibility notes before relying on that.
#
# Heads-up: this import rebinds the name `etree`, which earlier cells bound to
# xml.etree.ElementTree. From this cell on, `etree` refers to lxml.

from lxml import etree

# Same calls as with the standard library; `tree` and `root` are rebound to lxml objects.
tree = etree.parse('feed.xml')
root = tree.getroot()
root.findall('{http://www.w3.org/2005/Atom}entry')

[<Element {http://www.w3.org/2005/Atom}entry at 0x146ab662200>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x146ab662180>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x146ab661fc0>]

In [22]:
# For large XML documents, lxml is significantly faster than the built-in ElementTree library.
#
# The query below searches for all elements in the Atom namespace that have an href attribute:
#   .//                              search at all nesting levels
#   {http://www.w3.org/2005/Atom}*   any element in the Atom namespace
#   [@href]                          keep only elements that have an href attribute

tree.findall('.//{http://www.w3.org/2005/Atom}*[@href]')

[<Element {http://www.w3.org/2005/Atom}link at 0x146a9e21fc0>,
 <Element {http://www.w3.org/2005/Atom}link at 0x146ab5aee40>,
 <Element {http://www.w3.org/2005/Atom}link at 0x146ab5adc80>,
 <Element {http://www.w3.org/2005/Atom}link at 0x146ab5af140>,
 <Element {http://www.w3.org/2005/Atom}link at 0x146ab5ad6c0>]

In [23]:
# Finding Atom elements with a specific href value:
# [@href='...'] keeps only the elements whose href attribute equals the quoted string.

tree.findall(".//{http://www.w3.org/2005/Atom}*[@href='http://diveintomark.org/']")

[<Element {http://www.w3.org/2005/Atom}link at 0x146a9e21fc0>]

In [27]:
x = tree.findall(".//{http://www.w3.org/2005/Atom}*[@href='http://diveintomark.org/']")