# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
import pandas as pd
import numpy as np

document = ET.parse( './data/mondial_database.xml' )

In [67]:
#### Question 1 ####

countries = []
infant_mortality = []
for element in document.iterfind('country'):
    if element.find('infant_mortality') != None:
        countries.append(element.find('name').text)
        infant_mortality.append(float(element.find('infant_mortality').text))
    else:
        # We exclude the countries where the infant_mortality data is not provided
        pass

infant_mortality_df = pd.DataFrame({"Country": countries, "Infant Mortality": infant_mortality})
infant_mortality_df.sort_values('Infant Mortality', ascending = 0).head(10)


Unnamed: 0,Country,Infant Mortality
178,Western Sahara,145.82
48,Afghanistan,117.23
173,Mali,104.34
210,Somalia,100.14
197,Central African Republic,92.86
214,Guinea-Bissau,90.92
198,Chad,90.3
176,Niger,86.27
179,Angola,79.99
185,Burkina Faso,76.8


In [69]:
#### Question 2 ####

# By looking at the data, we assume that we want to have the most recent population number
# for each city.

cities = []
countries = []
years = []
populations = []

for country in document.iterfind('country'):
    for city in country.getiterator('city'):
        year = None
        population = None
        for population in city.getiterator('population'):
            if year == None or int(population.get('year')) > year :
                year = int(population.get('year'))
                population = int(population.text)
        if population != None:
            cities.append(city.find('name').text)
            countries.append(country.find('name').text)
            years.append(year)
            populations.append(population)

population_df = pd.DataFrame({"City": cities, "Country": countries, "Year": years, "Population": populations})
population_df.sort_values('Population', ascending = 0).head(10)

Unnamed: 0,City,Country,Population,Year
1251,Shanghai,China,22315474,2010
707,Istanbul,Turkey,13710512,2012
1421,Mumbai,India,12442373,2011
443,Moskva,Russia,11979529,2013
1250,Beijing,China,11716620,2010
2594,São Paulo,Brazil,11152344,2010
1252,Tianjin,China,11090314,2010
974,Guangzhou,China,11071424,2010
1467,Delhi,India,11034555,2011
977,Shenzhen,China,10358381,2010


In [70]:
#### Question 3 ####

ethnicgroups = []
overall_populations = []
years = []

for country in document.iterfind('country'):
    year = None
    population = None
    for population in country.iterfind('population'):
        if year == None or int(population.get('year')) > year :
            year = int(population.get('year'))
            population = int(population.text)
    if population != None:
        ethnicgroup = country.find('ethnicgroup')
        if ethnicgroup != None:
            ethnicgroups.append(ethnicgroup.text)
            percentage = float(ethnicgroup.get('percentage')) / 100
            overall_population = percentage * population
            overall_populations.append(overall_population)
            years.append(year)
            
overall_populations_df = pd.DataFrame({"Ethnic Group": ethnicgroups, "Overall Population": overall_populations, "Year": years})
overall_populations_df.sort_values("Overall Population", ascending = 0).head(10)
    

Unnamed: 0,Ethnic Group,Overall Population,Year
44,Han Chinese,1245059000.0,2013
55,Dravidian,302713700.0,2011
97,European,254958100.0,2014
165,African,162651600.0,2011
53,Bengali,146776900.0,2011
81,Japanese,126534200.0,2013
21,Russian,114646200.0,2014
73,Javanese,113456000.0,2014
143,European,108886700.0,2014
62,Viet/Kinh,76078380.0,2012


In [53]:
#### Question 4 ####

# Create a dictionary which maps the country names and the country codes
country_dict = {}
for country in document.iterfind('country'):
    country_dict[country.get('car_code')] = country.find('name').text 

In [71]:
# a) Longest River

# Find the longest river and its country or countries
max_length = 0
longest_river_name = ""
for river in document.iterfind('river'):
    river_name = river.find('name').text
    if river.find('length') != None:
        river_length = float(river.find('length').text)    
        country_code = river.get("country")
        if river_length > max_length:
            longest_river_country_code = []
            max_length = river_length
            for code in country_code.split(' '):
                longest_river_country_code.append(code)
            longest_river_name = river_name

# Use the country_dict to get the full name of the country where is located the longest river            
longest_river_country = ""    
for country_code in longest_river_country_code:
    longest_river_country += country_dict[country_code] + ", "   
     
# Print the answer
print "Longest river name: %s" % longest_river_name 
print "Longest river country / countries: %s" % longest_river_country[:-2]

Longest river name: Amazonas
Longest river country / countries: Colombia, Brazil, Peru


In [72]:
# b) Largest Lake

# Find the largest lake and its country or countries
max_area = 0
largest_lake_name = ""
for lake in document.iterfind('lake'):
    lake_name = lake.find('name').text
    if lake.find('area') != None:
        lake_area = float(lake.find('area').text)    
        country_code = lake.get("country")
        if lake_area > max_area:
            largest_lake_country_code = []
            max_area = lake_area
            for code in country_code.split(' '):
                largest_lake_country_code.append(code)
            largest_lake_name = lake_name

# Use the country_dict to get the full name of the country where is located the largest lake            
largest_lake_country = ""    
for country_code in largest_lake_country_code:
    largest_lake_country += country_dict[country_code] + ", "   
     
# Print the answer
print "Largest lake name: %s" % largest_lake_name 
print "Largest lake country / countries: %s" % largest_lake_country[:-2]

Largest lake name: Caspian Sea
Largest lake country / countries: Russia, Azerbaijan, Kazakhstan, Iran, Turkmenistan


In [73]:
# c) Airport at the highest elevation

# Find the airport at the highest elevation and its country or countries
max_elevation = 0
highest_elevation_airport_name = ""
for airport in document.iterfind('airport'):
    airport_name = airport.find('name').text
    if airport.find('elevation') is not None and airport.find('elevation').text is not None:
        airport_elevation = float(airport.find('elevation').text)    
        country_code = airport.get("country")
        if airport_elevation > max_elevation:
            highest_airport_country_code = []
            max_elevation = airport_elevation
            for code in country_code.split(' '):
                highest_airport_country_code.append(code)
            highest_elevation_airport_name = airport_name

# Use the country_dict to get the full name of the country where is located the largest lake            
highest_airport_country = ""    
for country_code in highest_airport_country_code:
    highest_airport_country += country_dict[country_code] + ", "   
     
# Print the answer
print "Highest elevation airport name: %s" % highest_elevation_airport_name 
print "Highest elevation airport country / countries: %s" % highest_airport_country[:-2]


Highest elevation airport name: El Alto Intl
Highest elevation airport country / countries: Bolivia
