## Most of the modules are already packaged with the Anaconda and Python installation.

However, the xmltodict module needs to be installed from Anaconda's terminal (e.g. `pip install xmltodict`).

In [2]:
import json
import sys
# NOTE: xml.etree.cElementTree is a deprecated alias that was removed in Python 3.9.
import xml.etree.cElementTree as etree
from collections import namedtuple
from xml.dom import minidom
from xml.etree.cElementTree import Element
from xml.etree.cElementTree import ElementTree
from xml.etree.cElementTree import ParseError

import lxml.etree as letree
import pandas as pd
import xmltodict

### Creating and Saving XML with ElementTree API
The ElementTree API is useful for creating and parsing XML files. It has two primary classes:

- ElementTree: supports XML and document manipulation:
    - ElementTree.find() and ElementTree.findall() methods provide XPath searching capabilities
        - XPath stands for XML Path Language
        - XPath uses "path like" syntax to identify and navigate nodes in an XML document
- Element: wraps an XML element
    
To create XML:

- create a root node using Element and then
- create a tree using ElementTree with this root node

In [3]:
# Root <contacts> element; each <contact> node built further down is appended to it.
root = Element('contacts')

In [4]:
# Wrap the root element in an ElementTree so the document can later be written to a file.
tree = ElementTree(root)

In [5]:
tree

<xml.etree.ElementTree.ElementTree at 0x7faee00e05f8>

## Populate an XML tree with additional tree nodes containing the following information:

In [18]:
# Lightweight record type describing one address-book entry.
Contact = namedtuple("ContactRecord", ['first', 'last', 'age', 'email'])

# Sample contacts in insertion order (not yet sorted by age).
records = [
    Contact(*row)
    for row in (
        ('Tom', 'Smith', 53, 'tsmith@boo.com'),
        ('Phil', 'Hammer', 42, 'phammer@boo.com'),
        ('Mary', 'Fast', 22, 'mfast@boo.com'),
        ('Jessica', 'Rest', 33, 'jrest@goo.com'),
    )
]

In [19]:
# Order the contacts in place from oldest to youngest.
records.sort(reverse=True, key=lambda person: person.age)

In [20]:
records

[ContactRecord(first='Tom', last='Smith', age=53, email='tsmith@boo.com'),
 ContactRecord(first='Phil', last='Hammer', age=42, email='phammer@boo.com'),
 ContactRecord(first='Jessica', last='Rest', age=33, email='jrest@goo.com'),
 ContactRecord(first='Mary', last='Fast', age=22, email='mfast@boo.com')]

### Now build and append nodes to the XML tree:

In [11]:
for person in records:
    # <name age="..."> carries the age as an attribute and wraps <first>/<last>.
    name_node = Element('name', {'age': str(person.age)})
    for tag, value in (('first', person.first), ('last', person.last)):
        part = Element(tag)
        part.text = value
        name_node.append(part)

    email_node = Element('email')
    email_node.text = person.email

    # Child order inside <contact> is <email> first, then <name>.
    contact_node = Element('contact')
    contact_node.append(email_node)
    contact_node.append(name_node)

    root.append(contact_node)

### Writing XML Tree to an XML File with tree.write()
Finally, save the built XML tree as an XML file:

In [12]:
# Serialize the tree to results.xml in the current working directory.
tree.write("results.xml", encoding = 'utf8')

In [14]:
ls

01.NumPy.ipynb                      dtest.csv
02.Pandas.ipynb                     dtrain.csv
03.NumPy_IO_Helper_Functions.ipynb  index.json
04.DataIO_CSV_XLSX.ipynb            LordOfTheRings.txt*
05.JSON.ipynb                       Missing_Data.ipynb
06.XML_IO.ipynb                     records.json
a.npy                               RegularExpressions.ipynb
a.txt                               results.xml
b.txt                               split.json
columns.json                        stdworkers.csv
Concatenate_Merge_Join.ipynb        Subset_Filter_Split.ipynb
csv_hr_db.csv                       values.json
data_raw/                           weather_write.xlsx


### Setting and Getting XML Attributes with .set() and .get()
There are various methods to set and get attributes with the Element object:

- .set(): to set attribute (name, value) pair
- .attrib: the element's attribute dictionary; set a value by name with `element.attrib[name] = value`
- .get(): to get an attribute value by its name

In [25]:
for person in records:
    age_text = str(person.age)
    name_node = Element('name')

    # Three interchangeable ways to store the same attribute value.
    name_node.attrib = {'age': age_text}
    name_node.set('age', age_text)
    name_node.attrib['age'] = age_text

    # Two ways to read it back: dictionary lookup and .get().
    for value in (name_node.attrib['age'], name_node.get('age')):
        print('The attribute is', value)
    # .get() returns the supplied fallback when the attribute is absent.
    print('The Default attribute is', name_node.get('age1', 'default value'))
    

The attribute is 53
The attribute is 53
The Default attribute is default value
The attribute is 42
The attribute is 42
The Default attribute is default value
The attribute is 33
The attribute is 33
The Default attribute is default value
The attribute is 22
The attribute is 22
The Default attribute is default value


# Pretty Printing of the XML Tree with lxml
- The ElementTree doesn't have a pretty print feature.
The lxml version of the ElementTree has this option:
https://lxml.de/tutorial.html

## Working with LXML Parser
LXML is an often-preferred, feature-rich parser

- is a wrapper built around the libxml2 and libxslt C libraries
- is a validating parser, which means it can support schema and DTDs,
- supports full XPath syntax, and XSLT capabilities

To install: `pip install lxml`

In [26]:
# Root element of the lxml tree; note it is tagged "root", not "contacts".
lroot = letree.Element("root")

In [27]:
for person in records:
    # <name age="..."> wraps the <first> and <last> children.
    name_node = letree.Element("name", age=str(person.age))
    for tag, value in (("first", person.first), ("last", person.last)):
        part = letree.Element(tag)
        part.text = value
        name_node.append(part)

    email_node = letree.Element('email')
    email_node.text = person.email

    # Child order inside <contact> is <email> first, then <name>.
    contact_node = letree.Element("contact")
    contact_node.append(email_node)
    contact_node.append(name_node)

    lroot.append(contact_node)

In [28]:
# lxml serializes to bytes; decode before printing so the indentation renders.
pretty_bytes = letree.tostring(lroot, pretty_print=True)
print(pretty_bytes.decode('utf8'))

<root>
  <contact>
    <email>tsmith@boo.com</email>
    <name age="53">
      <first>Tom</first>
      <last>Smith</last>
    </name>
  </contact>
  <contact>
    <email>phammer@boo.com</email>
    <name age="42">
      <first>Phil</first>
      <last>Hammer</last>
    </name>
  </contact>
  <contact>
    <email>jrest@goo.com</email>
    <name age="33">
      <first>Jessica</first>
      <last>Rest</last>
    </name>
  </contact>
  <contact>
    <email>mfast@boo.com</email>
    <name age="22">
      <first>Mary</first>
      <last>Fast</last>
    </name>
  </contact>
</root>



### Optionally, use the Minidom API as a workaround

In [29]:
# Serialize the in-memory element tree so minidom can re-parse it.
xml_str = etree.tostring(root)

In [30]:
# Parse the serialized XML into a minidom document (the name is reused for the pretty-printed output in the next step).
pretty_xml = minidom.parseString(xml_str)

In [31]:
# Recompute from xml_str instead of calling .toprettyxml() on `pretty_xml`:
# after this cell runs once `pretty_xml` holds bytes, so chaining off the
# old value would raise AttributeError when the cell is re-run.
pretty_xml = minidom.parseString(xml_str).toprettyxml(encoding='utf8')

In [33]:
print(pretty_xml.decode())

<?xml version="1.0" encoding="utf8"?>
<contacts>
	<contact>
		<email>tsmith@boo.com</email>
		<name age="53">
			<first>Tom</first>
			<last>Smith</last>
		</name>
	</contact>
	<contact>
		<email>phammer@boo.com</email>
		<name age="42">
			<first>Phil</first>
			<last>Hammer</last>
		</name>
	</contact>
	<contact>
		<email>jrest@goo.com</email>
		<name age="33">
			<first>Jessica</first>
			<last>Rest</last>
		</name>
	</contact>
	<contact>
		<email>mfast@boo.com</email>
		<name age="22">
			<first>Mary</first>
			<last>Fast</last>
		</name>
	</contact>
</contacts>



In [36]:
# pretty_xml is already UTF-8 encoded bytes whose XML declaration states that
# encoding, so write it unchanged in binary mode. Decoding and re-encoding in
# text mode would use the platform default encoding, which may contradict the
# declaration.
with open("pretty.xml", 'wb') as w:
    w.write(pretty_xml)

In [37]:
ls

01.NumPy.ipynb                      dtrain.csv
02.Pandas.ipynb                     index.json
03.NumPy_IO_Helper_Functions.ipynb  LordOfTheRings.txt*
04.DataIO_CSV_XLSX.ipynb            Missing_Data.ipynb
05.JSON.ipynb                       pretty.xml
06.XML_IO.ipynb                     records.json
a.npy                               RegularExpressions.ipynb
a.txt                               results.xml
b.txt                               split.json
columns.json                        stdworkers.csv
Concatenate_Merge_Join.ipynb        Subset_Filter_Split.ipynb
csv_hr_db.csv                       values.json
data_raw/                           weather_write.xlsx
dtest.csv


# Parsing XML with ElementTree

In [38]:
# Record type used to hold each contact read back from the XML file.
Contact = namedtuple('ContactRecord', ['first', 'last', 'age', 'email'])

In [39]:
# Empty ElementTree wrapper; it has no root element until a document is parsed into it.
tree = ElementTree()

In [40]:
# ElementTree.parse() returns the document's root Element, so `tree` ends up
# bound to that root. Parsing through a fresh ElementTree keeps this cell
# re-runnable: calling .parse() on `tree` itself fails on a second run because
# by then `tree` is an Element, which has no .parse() method.
tree = ElementTree().parse('results.xml')

In [41]:
# Collect one Contact record per <contact> element found in the parsed document.
contacts = []

# Element.iter() is the supported replacement for getiterator(), which was
# deprecated and then removed in Python 3.9.
for contact in tree.iter('contact'):
    # './/tag' searches all descendants of the current <contact> element.
    first = contact.find('.//first').text
    last = contact.find('.//last').text
    email = contact.find('.//email').text
    # The age is stored as an attribute on <name>, so it comes back as a string.
    age = contact.find('.//name').get('age')
    contacts.append(Contact(first, last, age, email))
    

In [42]:
print(contacts)

[ContactRecord(first='Tom', last='Smith', age='53', email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age='42', email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age='33', email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age='22', email='mfast@boo.com')]


### Example: XPath Navigation

In [43]:
# Sample document used for the XPath demo: two <Item> entries, each with a
# list price (ItemAttributes/ListPrice/Amount) and an offer price.
xml = '''<?xml version="1.0"?>
<ItemSearchResponse>
  <Items>
    <Item>
      <ItemAttributes>
        <ListPrice>
          <Amount>2260</Amount>
        </ListPrice>
      </ItemAttributes>
      <Offers>
        <Offer>
          <OfferListing>
            <Price>
              <Amount>1853</Amount>
            </Price>
          </OfferListing>
        </Offer>
      </Offers>
    </Item>
        <Item>
      <ItemAttributes>
        <ListPrice>
          <Amount>3312</Amount>
        </ListPrice>
      </ItemAttributes>
      <Offers>
        <Offer>
          <OfferListing>
            <Price>
              <Amount>1853</Amount>
            </Price>
          </OfferListing>
        </Offer>
      </Offers>
    </Item>
  </Items>
</ItemSearchResponse>'''

# Persist the sample so it can be parsed back from disk.
with open("xpath.xml",'w') as f:
    f.write(xml)

In [44]:
# Open the XPath sample document for reading; the handle is handed to the parser.
fp = open("xpath.xml",'r')

In [45]:
# Parse from the already-open handle and close it as soon as parsing finishes;
# without the `with`, the file object would stay open for the life of the kernel.
# parse() returns the root element (<ItemSearchResponse>).
with fp:
    root = ElementTree().parse(fp)

In [46]:
# Path is relative to the root element, so it starts at <Items>; only the
# list-price amounts match, not the offer prices.
list_price_path = 'Items/Item/ItemAttributes/ListPrice/Amount'
for amount_node in root.findall(list_price_path):
    print(amount_node.text)

2260
3312


### Handling XML Exceptions
XML operations can throw exceptions. These should be handled with try-except blocks

In [47]:
# Malformed XML surfaces as ParseError while the document is being read.
try:
    tree = ElementTree().parse('results.xml')
except ParseError as e:
    print('Parse error: {err}'.format(err=e))
    sys.exit()

contacts = []

# Element.iter() is the supported replacement for getiterator(), which was
# deprecated and then removed in Python 3.9.
for contact in tree.iter('contact'):
    try:
        # find() returns None when a child is missing, so the .text / .get
        # access below raises AttributeError for an incomplete <contact>.
        first = contact.find('.//first').text
        last = contact.find('.//last').text
        age = contact.find('./name').get('age')
        email = contact.find('.//email').text
        contacts.append(Contact(first, last, age, email))
    except AttributeError as e:
        # Report the incomplete record and carry on with the remaining ones.
        print('Element error: {err}'.format(err=e))
print(contacts)

[ContactRecord(first='Tom', last='Smith', age='53', email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age='42', email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age='33', email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age='22', email='mfast@boo.com')]


In [48]:
### Read XML into Pandas DataFrame

In [49]:
cols = ['first', 'last', 'age', 'email']
# Intended per-column types; kept for reference but not applied, so every
# column (including age) holds strings.
dt = [str, str, int, str]

# Malformed XML surfaces as ParseError while the document is being read.
try:
    tree = ElementTree().parse('results.xml')
except ParseError as e:
    print('Parse error: {err}'.format(err=e))
    sys.exit()

# Collect plain rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
rows = []

# Element.iter() is the supported replacement for getiterator(), which was
# deprecated and then removed in Python 3.9.
for contact in tree.iter('contact'):
    try:
        # find() returns None when a child is missing, so the .text / .get
        # access below raises AttributeError for an incomplete <contact>.
        first = contact.find('.//first').text
        last = contact.find('.//last').text
        age = contact.find('./name').get('age')
        email = contact.find('.//email').text
        rows.append([first, last, age, email])
    except AttributeError as e:
        # Report the incomplete record and carry on with the remaining ones.
        print('Element error: {err}'.format(err=e))

xml_df = pd.DataFrame(rows, columns=cols, dtype=str)

In [50]:
xml_df

Unnamed: 0,first,last,age,email
0,Tom,Smith,53,tsmith@boo.com
1,Phil,Hammer,42,phammer@boo.com
2,Jessica,Rest,33,jrest@goo.com
3,Mary,Fast,22,mfast@boo.com


In [51]:
### Convert XML to JSON with xmltodict

In [52]:
# Read the whole document as text; xml_input is reused by the next cell.
with open('results.xml') as xml_file:
    xml_input = xml_file.read()

# Parse once, then show both the dictionary form and its JSON serialization.
parsed = xmltodict.parse(xml_input)
print(parsed)
print()
print(json.dumps(parsed))

OrderedDict([('contacts', OrderedDict([('contact', [OrderedDict([('email', 'tsmith@boo.com'), ('name', OrderedDict([('@age', '53'), ('first', 'Tom'), ('last', 'Smith')]))]), OrderedDict([('email', 'phammer@boo.com'), ('name', OrderedDict([('@age', '42'), ('first', 'Phil'), ('last', 'Hammer')]))]), OrderedDict([('email', 'jrest@goo.com'), ('name', OrderedDict([('@age', '33'), ('first', 'Jessica'), ('last', 'Rest')]))]), OrderedDict([('email', 'mfast@boo.com'), ('name', OrderedDict([('@age', '22'), ('first', 'Mary'), ('last', 'Fast')]))])])]))])

{"contacts": {"contact": [{"email": "tsmith@boo.com", "name": {"@age": "53", "first": "Tom", "last": "Smith"}}, {"email": "phammer@boo.com", "name": {"@age": "42", "first": "Phil", "last": "Hammer"}}, {"email": "jrest@goo.com", "name": {"@age": "33", "first": "Jessica", "last": "Rest"}}, {"email": "mfast@boo.com", "name": {"@age": "22", "first": "Mary", "last": "Fast"}}]}}


In [53]:
json.loads(json.dumps(xmltodict.parse(xml_input)))

{'contacts': {'contact': [{'email': 'tsmith@boo.com',
    'name': {'@age': '53', 'first': 'Tom', 'last': 'Smith'}},
   {'email': 'phammer@boo.com',
    'name': {'@age': '42', 'first': 'Phil', 'last': 'Hammer'}},
   {'email': 'jrest@goo.com',
    'name': {'@age': '33', 'first': 'Jessica', 'last': 'Rest'}},
   {'email': 'mfast@boo.com',
    'name': {'@age': '22', 'first': 'Mary', 'last': 'Fast'}}]}}

In [55]:
json.loads?