# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [10]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [11]:
# Parse the '_less' sample file; the example cells below read it via `document_tree`.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [12]:
# print names of all countries
# (parenthesised single-argument print behaves the same on Python 2 and 3)
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [13]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # Element.iter() walks every descendant <city>; it replaces the
    # deprecated getiterator(), which recent Python versions no longer have.
    city_names = [city.find('name').text for city in element.iter('city')]
    # Emit the whole line with one print call so the output is identical
    # under the Python 2 print statement and the Python 3 print function.
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [29]:
# Parse the full Mondial file named in the exercise; the exercise cells read it via `tree`.
tree = ET.parse( './data/mondial_database.xml' )
import operator
import pandas as pd

In [15]:
def num(s):
    """Convert *s* (typically the text of an XML element) to a float."""
    value = float(s)
    return value

def getkey(item):
    """Sort key: the first component of *item* (e.g. the rate in a (rate, name) pair)."""
    leading = item[0]
    return leading

In [16]:
# Collect (infant mortality rate, country name) pairs, skipping countries
# that carry no <infant_mortality> element.
imlist = []
for country in tree.iterfind('country'):
    rate_elem = country.find('infant_mortality')
    if rate_elem is not None:
        rate = num(rate_elem.text)
        imlist.append((rate, country.find('name').text))

# Ten lowest rates; sorting on the rate only keeps document order for ties.
sorted(imlist, key=getkey)[0:10]





[(1.81, 'Monaco'),
 (2.13, 'Japan'),
 (2.48, 'Norway'),
 (2.48, 'Bermuda'),
 (2.53, 'Singapore'),
 (2.6, 'Sweden'),
 (2.63, 'Czech Republic'),
 (2.73, 'Hong Kong'),
 (3.13, 'Macao'),
 (3.15, 'Iceland')]

In [75]:
# def getsorted(item):
#     return item[1]

In [42]:
# poplist = []
# for element in tree.iterfind('country'):
#     for subelement in element.getiterator('city'):
#         if subelement is None:
#             for subelement in element.getiterator('province'):
#                 cname = subelement.find('name').text
#                 pop = subelement.find('population').text
#                 if pop is None:
#                     continue      
#                 poplist.append((cname,int(pop.text)))
#         else:
#             cname = subelement.find('name').text
#             pop = subelement.find('population')
#             if pop is None:
#                 continue      
#             poplist.append((cname,int(pop.text)))    
# sorted(poplist, key=getsorted, reverse=True)[:10]


# Per-city population table; populated by the loop over './/city' further down.
pop_df = pd.DataFrame()


def find_max_pop_year(c):
    """
    Return the largest population recorded for element *c* and its year.

    Scans the direct <population> children of *c*; each one is read as an
    integer count with an integer ``year`` attribute.

    Returns a ``(population, year)`` tuple, or ``(None, None)`` when *c* has
    no <population> child. On equal counts the first one encountered wins.

    Raises KeyError if a <population> element lacks the ``year`` attribute,
    and ValueError/TypeError if the year or the count is not an integer.
    """
    max_pop = None
    max_year = None
    for p in c.findall('population'):
        year = int(p.attrib['year'])
        pop = int(p.text)
        # Test for None explicitly: ordering an int against None only works
        # on Python 2 and raises TypeError on Python 3.
        if max_pop is None or pop > max_pop:
            max_pop = pop
            max_year = year
    return max_pop, max_year

# Gather one record per <city> found anywhere in the document and build the
# frame in a single call. Growing a DataFrame row by row with
# DataFrame.append copies the whole frame on every iteration, and that method
# is not available in current pandas releases.
city_rows = []
for c in tree.iterfind('.//city'):
    con_name = c.attrib['country']
    city_name = c.find('name').text
    pop, year = find_max_pop_year(c)
    city_rows.append({"city": city_name,
                      "country": con_name,
                      "year": year,
                      "population": pop})
pop_df = pd.DataFrame(city_rows,
                      columns=["city", "country", "population", "year"])

# Cities without any <population> element carry missing values; drop them.
pop_df = pop_df.dropna()
# Ascending sort, so the last ten rows are the ten most populous cities.
pop_df.sort_values(by='population').tail(10)


Unnamed: 0,city,country,population,year
1067,Shenzhen,CN,10358381.0,2010.0
1064,Guangzhou,CN,11071424.0,2010.0
1342,Tianjin,CN,11090314.0,2010.0
2810,São Paulo,BR,11152344.0,2010.0
1340,Beijing,CN,11716620.0,2010.0
479,Moskva,R,11979529.0,2013.0
1527,Mumbai,IND,12442373.0,2011.0
1582,Delhi,IND,12877470.0,2001.0
771,Istanbul,TR,13710512.0,2012.0
1341,Shanghai,CN,22315474.0,2010.0


In [64]:
# One record per (country, ethnic group). Rows are collected in a list and the
# frame is built once: DataFrame.append in a loop copies the frame on every
# call, and that method is not available in current pandas releases.
ethnic_rows = []
for country in tree.iterfind('country'):
    country_name = country.find('name').text
    pop, year = find_max_pop_year(country)
    for ethnic_group in country.iterfind('ethnicgroup'):
        # The attribute is a percentage; store it as a fraction of 1.
        percentage = float(ethnic_group.attrib['percentage']) / 100
        group_name = ethnic_group.text
        ethnic_rows.append({"country": country_name,
                            "population": pop,
                            "year": year,
                            "percentage": percentage,
                            "ethnicity": group_name})
df = pd.DataFrame(ethnic_rows,
                  columns=["country", "ethnicity", "percentage",
                           "population", "year"])
# Drop rows with missing values (e.g. countries without a population figure).
df = df.dropna()
# Head count per group, scaled by 10**-8, i.e. expressed in units of 100 million.
df['ethnicity_population'] = (df.population * df.percentage)*10**-8
# Sum each group over all countries; ascending sort, so tail(10) is the top ten.
df = df.groupby('ethnicity').ethnicity_population.sum().sort_values().reset_index().tail(10)

df

Unnamed: 0,ethnicity,ethnicity_population
270,Malay,1.219936
271,Japanese,1.27289
272,Russian,1.368666
273,Bengali,1.467769
274,Mestizo,1.578553
275,Dravidian,3.027137
276,African,3.183597
277,European,4.949395
278,Indo-Aryan,8.718156
279,Han Chinese,12.450588


In [70]:
def get_resource_data_points(resource_type, target_feat, source=None):
    """
    Yield one record for every element of type $resource_type.

    resource_type -- tag name to search for at any depth (e.g. "river").
    target_feat   -- tag name of the child element holding the value of
                     interest (e.g. "length").
    source        -- object providing ``iterfind`` (an ElementTree or an
                     Element) to search; when omitted, the notebook-level
                     ``tree`` is used.

    Each record is a dict with the keys "country" (the element's ``country``
    attribute), "name" (text of its <name> child) and $target_feat (the
    child's text converted to float, or None when the child is missing or
    has no text).

    Raises KeyError if an element has no ``country`` attribute and
    AttributeError if it has no <name> child.
    """
    document = tree if source is None else source
    for resource in document.iterfind(".//" + resource_type):
        country = resource.attrib['country']
        name = resource.find('name').text
        feat_elm = resource.find(target_feat)
        # Identity checks against None: a missing child and an empty child
        # both yield None rather than failing in float().
        if feat_elm is not None and feat_elm.text is not None:
            feat = float(feat_elm.text)
        else:
            feat = None
        yield {"country": country,
               "name": name,
               target_feat: feat}

# One frame per resource type, each with the columns "country", "name" and
# the requested feature.
river_df = pd.DataFrame(list(get_resource_data_points("river", "length")))
lake_df = pd.DataFrame(list(get_resource_data_points("lake", "area")))
airport_df = pd.DataFrame(list(get_resource_data_points("airport", "elevation")))


In [67]:
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("longest river")
# Keep every row whose length equals the column maximum (ties are all shown).
river_df[river_df.length == river_df.length.max()]

longest river


Unnamed: 0,country,length,name
174,CO BR PE,6448.0,Amazonas


In [68]:
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("highest airport")
# Keep every row whose elevation equals the column maximum (ties are all shown).
airport_df[airport_df.elevation == airport_df.elevation.max()]

highest airport


Unnamed: 0,country,elevation,name
80,BOL,4063.0,El Alto Intl


In [69]:
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("Largest Lake")
# Keep every row whose area equals the column maximum (ties are all shown).
lake_df[lake_df.area == lake_df.area.max()]

Largest Lake


Unnamed: 0,area,country,name
54,386400.0,R AZ KAZ IR TM,Caspian Sea
