# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [4]:

from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np
document = ET.parse( './data/mondial_database.xml' )


In [5]:
#1- 10 countries with the lowest infant mortality rates

data = {'country':[], 'infant_mortality':[]}
for element in document.iterfind('country'):
    data['country'].append(element.find('name').text)
    mortality = element.find('infant_mortality')
    if mortality is not None:
        data['infant_mortality'].append(mortality.text)
    else:
        data['infant_mortality'].append('')
        
df = pd.DataFrame(data)
df = df.replace('',np.NaN)
# find the 10 lowest infant mortality rates
df.dropna().sort('infant_mortality').head(10)



Unnamed: 0,country,infant_mortality
38,Monaco,1.81
30,Romania,10.16
153,Fiji,10.2
69,Brunei,10.48
132,Grenada,10.5
237,Mauritius,10.59
124,Panama,10.7
243,Seychelles,10.77
102,United Arab Emirates,10.92
113,Barbados,10.93


Exercise 2: 10 Cities with Largest Population

In [46]:
data = {'city':[], 'country':[], 'year':[], 'population':[]}

# extract city name, population, and year from xml
for country in document.iterfind('country'):
    for city in country.iterfind('city'):     
       for population in city.iterfind('population'):
          data['country'].append(country.find('name').text)
          data['city'].append(city.find('name').text)
          data['year'].append(population.attrib['year'])
          data['population'].append(int(population.text))    
df = pd.DataFrame(data)
df['rank'] = df.sort_values(['year'], ascending=False).groupby('city').cumcount()+1
df[df['rank'] == 1].sort_values('population' , ascending = False).head(10)

Unnamed: 0,city,country,population,year,rank
433,Seoul,South Korea,9708483,2010,1
412,Al Qahirah,Egypt,8471859,2006,1
204,Bangkok,Thailand,7506700,1999,1
322,Hong Kong,Hong Kong,7055071,2009,1
229,Ho Chi Minh,Vietnam,5968384,2009,1
554,Singapore,Singapore,5076700,2010,1
409,Al Iskandariyah,Egypt,4123869,2006,1
566,New Taipei,Taiwan,3939305,2012,1
437,Busan,South Korea,3403135,2010,1
270,Pyongyang,North Korea,3255288,2008,1


Exercise 3 - 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [47]:
data = {'group':[], 'country':[], 'population':[]}
for country in document.findall('country'):
    name  = country.find('name').text
    group = ''
    population = 0
    maxYear = -999999
    for population  in country.iterfind('population'):
        year = int(population.attrib['year'])
        if year > maxYear:
            maxYear = year
            population = int(population.text)
    for ethnicGroup  in country.iterfind('ethnicgroup'):
         group = ethnicGroup.text
         percentage = float(ethnicGroup.attrib['percentage'])
         if group and percentage and population:
                data['country'].append(name)
                data['group'].append(group)
                data['population'].append(int(population * percentage / 100.))
df = pd.DataFrame(data)
df = df.groupby('group')[['population']].sum()
df.sort(columns='population', ascending=False).head(10)




Unnamed: 0_level_0,population
group,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


Exercise 4 - name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [57]:
rivers = {'country':[], 'name':[], 'length':[]}
for river in document.findall('river'):    
    code = river.attrib['country'] 
    name = ''
    length = 0
    for node in list(river):
        if node.tag == 'name':           
            name = node.text
        elif node.tag == 'length':
            length = float(node.text)
    if name and code and length:
        codes = code.split(' ') # some rivers cross multiple countries
        for c in codes:
            rivers['country'].append(c)
            rivers['name'].append(name) # river name
            rivers['length'].append(length)
rivers = pd.DataFrame(rivers)
rivers.head(10)


# find the longest river
grouped = rivers.groupby(['name'])[['length']].max()
longest = grouped.sort('length', ascending=False).ix[0]

# display info on longest river
print ('Longest River:')
rivers[rivers.name == longest.name][['name','country','length']]

Longest River:




Unnamed: 0,name,country,length
298,Amazonas,CO,6448.0
299,Amazonas,BR,6448.0
300,Amazonas,PE,6448.0


In [59]:
lakes = {'country':[], 'name':[], 'area':[]}

for lake in document.findall('lake'):
   
    code = lake.attrib['country'] # country code, not country name
    name = ''
    area = 0
    for node in list(lake):
        if node.tag == 'name':           
            name = node.text
        elif node.tag == 'area':
            area = float(node.text)
    if name and code and area:
        codes = code.split(' ')
        for c in codes:
            lakes['country'].append(c)
            lakes['name'].append(name) # lake name
            lakes['area'].append(area)

# create a dataframe for the lakes
df = pd.DataFrame(lakes)

grouped = df.groupby(['name'])[['area']].max()
largest = grouped.sort('area', ascending=False).ix[0].name

# display info on longest river
print ('Largest Lake:\n')
df[df.name == largest][['name','country','area']]


Largest Lake:





Unnamed: 0,name,country,area
68,Caspian Sea,R,386400.0
69,Caspian Sea,AZ,386400.0
70,Caspian Sea,KAZ,386400.0
71,Caspian Sea,IR,386400.0
72,Caspian Sea,TM,386400.0


In [61]:
airports_data = {'country':[], 'name':[], 'elevation':[]}

for airports in document.findall('airport'):   
    code = airports.attrib['country'] 
    name = ''
    elev = 0
    for airport in list(airports):
        if airport.tag == 'name':           
            name = airport.text
        elif airport.tag == 'elevation':
            if airport.text is not None:
                elev = float(airport.text)
            
    if name and code and elev:
        airports_data['country'].append(code)
        airports_data['name'].append(name) 
        airports_data['elevation'].append(elev)


df = pd.DataFrame(airports_data)

grouped = df.groupby(['name'])[['elevation']].max()
highest = grouped.sort('elevation', ascending=False).iloc[0].name
airport = df[df.name == highest].iloc[0]
airport






country               BOL
elevation            4063
name         El Alto Intl
Name: 80, dtype: object