# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [4]:
from xml.etree import ElementTree as ET
import pandas as pd

In [214]:
document_tree = ET.parse( './data/mondial_database.xml' )

In [215]:
def calculatePopulation(row):
    if row.top_ethnicity:
        return (float(row.ethnic_perc) / 100) * int(row.recent_population)
    else:
        return None

In [216]:
country_list = []
country_codes = []
mort_rates = []
pops = []
ethnicities = []
ethnic_percent = []

columns = ['name', 'country_code', 'infant_mortality', 'recent_population', 'top_ethnicity', 'ethnic_perc', 'ethnic_pop', 'longest_river', 'river_length', 'biggest_lake', 'lake_area', 'highest_airport', 'airport_elevation']

for element in document_tree.iterfind('country'):
    country_list.append(element.find('name').text)
    country_codes.append(element.attrib['car_code'])
    try:
        mort_rates.append(element.find('infant_mortality').text)
    except:
        mort_rates.append(None)
        
    # Create list comprehension of dictionaries for top level population elements under 'country'
    pop_dict_list = [{'year': subel.attrib['year'], 'value': subel.text} for subel in element.iterfind('population')]
        
    newlist = sorted(pop_dict_list, key=lambda k: k['year'])[-1:]
    pops.append(newlist[0]['value'])        
    
    # Create list comprehension of dictionaries for ethnic group elements under 'country'
    ethnic_dict_list = [{'percent': subel.attrib['percentage'], 'value': subel.text} for subel in element.iterfind('ethnicgroup')]
        
    newlist = sorted(ethnic_dict_list, key=lambda k: k['percent'])[-1:]
    
    if len(newlist) > 0:
        ethnicities.append(newlist[0]['value'])  
        ethnic_percent.append(newlist[0]['percent'])
    else:
        ethnicities.append(None)
        ethnic_percent.append(None)
        
country_df = pd.DataFrame(columns=columns)
country_df['name'] = country_list
country_df['country_code'] = country_codes
country_df['infant_mortality'] = mort_rates
country_df['recent_population'] = pops
country_df['top_ethnicity'] = ethnicities
country_df['ethnic_perc'] = ethnic_percent
country_df['ethnic_pop'] = country_df.apply(lambda x: calculatePopulation(x), axis=1)

In [123]:
def findGreatestValueOfElement(target_el, comparison_var, col_name, col_name_two):
    # target_el = 'airport', 'lake', 'river'
    # comparison_var = 'elevation', 'area'
        
    element_dict = {}
    biggest_element = []
    element_max_value = []
    element_countries = []
    
    for element in document_tree.iterfind(target_el):        
        element_country = element.attrib['country'].split(' ')
        for country in element_country:
            if country not in element_countries:
                element_countries.append(country)
                                    
            if country not in element_dict:
                if element.find(comparison_var) is None or element.find(comparison_var).text == None:
                    element_dict[country] = [{'comp_var': 0, 'name': element.find('name').text}]
                else:
                    element_dict[country] = [{'comp_var': element.find(comparison_var).text, 'name': element.find('name').text}]
            else:
                if element.find(comparison_var) is None or element.find(comparison_var).text == None:
                    element_dict[country].append({'comp_var': 0, 'name': element.find('name').text})
                else:
                    element_dict[country].append({'comp_var': element.find(comparison_var).text, 'name': element.find('name').text})
                
    for code in country_df['country_code'].tolist():
        if code not in element_dict:
            element_dict[code] = [{'comp_var': 0, 'name': 'NA'}]
    
    for item in element_dict:
        try:
            element_dict[item] = sorted(element_dict[item], key=lambda k: int(k['comp_var']))[-1:]
        except:
            element_dict[item] = sorted(element_dict[item], key=lambda k: float(k['comp_var']))[-1:]
        element_dict[item] = element_dict[item][0]
        biggest_element.append(element_dict[item]['name'])
        element_max_value.append(element_dict[item]['comp_var'])

    country_df[col_name] = biggest_element
    country_df[col_name_two] = element_max_value

In [212]:
findGreatestValueOfElement('airport', 'elevation', 'highest_airport', 'airport_elevation')
findGreatestValueOfElement('river', 'length', 'longest_river', 'river_length')
findGreatestValueOfElement('lake', 'area', 'biggest_lake', 'lake_area')

In [207]:
def findTopValue(df, column_measurement):
    try:
        df[column_measurement] = df[column_measurement].astype('int')
    except:
        df[column_measurement] = df[column_measurement].astype('float')
        
    max_value = df.sort_values(by=column_measurement, ascending=False).head(1)
    return max_value

In [209]:
findTopValue(country_df, 'airport_elevation')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
23,Russia,R,7.08,143666931,Russian,79.8,114646200.0,Elbe,1091,Lake Genezareth,165.0,El Alto Intl,4063


In [210]:
findTopValue(country_df, 'river_length')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
72,Thailand,THA,9.86,65981659,Thai,75,49486244.25,Amazonas,6448,,0.0,Bellegarde,396


In [211]:
findTopValue(country_df, 'lake_area')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
17,Switzerland,CH,3.73,8139631,German,65,5290760.15,Donau,2845,Caspian Sea,386400,Minsk 2,204
