# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

('* Albania:',)
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
('* Greece:',)
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
('* Macedonia:',)
Skopje, Kumanovo
('* Serbia:',)
Beograd, Novi Sad, Niš
('* Montenegro:',)
Podgorica
('* Kosovo:',)
Prishtine
('* Andorra:',)
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

# 1. 10 countries with the lowest infant mortality rates

Some countries didn't report an infant mortality rate, so I had to account for that.  For each country, I saved the name of the country in a list and the infant mortality rate(im) in another list.  If there was no im, I placed np.nan in it's place.  Then I populated the dataframe with the country and im, changed the type of im so it could be sorted, dropped the np.nan values, sorted and displayed the top 10 best countries.

In [6]:
for child in document.getroot():
    print(child.find('infant_mortality').text)

13.19
4.78
7.9
6.16


AttributeError: 'NoneType' object has no attribute 'text'

In [7]:
im = []
country = []

df = pd.DataFrame()

for child in document.getroot():
    #print(child.find('infant_mortality').text)
    
    try: country.append(child.find('name').text)
    except: pass
    
    try: im.append(child.find('infant_mortality').text)
    except: im.append(np.nan)
    
    
#print(im)
df['country']=country
df['im']=im

df['im'] = df['im'].astype(float)

df.dropna()
df.sort_values(by=['im', 'country'], ascending=[True, False]).head(10)

Unnamed: 0,country,im
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


# 2. 10 cities with the largest population

In [8]:
city_pop = {}

for child in document.iterfind('country'):
    for gchild in child.iterfind('city'):
        try: city_pop[gchild.find('name').text] = int(gchild.find('population').text)
        except: pass

df3 = pd.DataFrame.from_dict(city_pop, orient='index')
df3.reset_index(drop=False, inplace=True)
df3.columns = ['city', 'population']

df3.sort_values(by='population', ascending=False).head(10)

Unnamed: 0,city,population
318,Seoul,10229262
45,Hong Kong,7055071
376,Al Qahirah,6053000
94,Bangkok,5876000
8,Ho Chi Minh,3924435
335,Busan,3813814
75,New Taipei,3722082
230,Hanoi,3056146
287,Al Iskandariyah,2917000
102,Taipei,2626138


In [15]:
year = 0
year_temp = 0
population = 0
population_temp = 0
city = []
pop = []
year_current = []

for child in document.findall('country'):
    year = 0
    temp = 0
    
    #try: city.append(child.find('name').text)
    #except: pass
    
    for gchild in child.iterfind('city'):
        try:
            city.append(gchild.find('name').text)
            #year_temp = int(gchild.attrib.get('year'))
            #population_temp = int(gchild.text)
        except: pass
        
        try: year_temp = int(gchild.attrib.get('year'))
        except: pass
        
    #for gchild in child.iterfind('population'):
        #year_temp = int(gchild.attrib.get('year'))
        #population_temp = int(gchild.text)
        if year_temp > year:
            year = year_temp
            population = population_temp
            
        
    year_current.append(year)
    pop.append(population)
        #print(gchild.attrib.get('year'), gchild.text)
        #print(gchild.text)
df2 = pd.DataFrame()

df2['city'] = city
df2['year'] = year_current
df2['population'] = pop

df2.sort_values(by='population', ascending=False).head(10)

ValueError: Length of values does not match length of index

In [18]:
pop

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]