# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


# ================================================
****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

# ================================================

# Import Pandas and XML ElementTree

In [3]:
from xml.etree import ElementTree as ET
import pandas as pd

# Parse XML file

In [4]:
document_tree = ET.parse( './data/mondial_database.xml' )

In [5]:
def calculatePopulation(row):
    if row.top_ethnicity:
        return (float(row.ethnic_perc) / 100) * int(row.recent_population)
    else:
        return None

In [114]:
country_list = []
country_codes = []
mort_rates = []
pops = []
ethnicities = []
ethnic_percent = []
largest_city_pops = []
largest_city_names = []

columns = ['name', 'country_code', 'infant_mortality', 'recent_population', 'top_ethnicity', 'ethnic_perc', 'ethnic_pop', 'longest_river', 'river_length', 'biggest_lake', 'lake_area', 'highest_airport', 'airport_elevation']

for element in document_tree.iterfind('country'):
    # For each country element get the name, country code, and infant mortality rate
    country_list.append(element.find('name').text)
    country_codes.append(element.attrib['car_code'])
    try:
        mort_rates.append(element.find('infant_mortality').text)
    except:
        mort_rates.append(None)
        
    # Create sorted list comprehension of dictionaries for top level population elements under 'country'
    pop_dict_list = [{'year': subel.attrib['year'], 'value': subel.text} for subel in element.iterfind('population')]
        
    newlist = sorted(pop_dict_list, key=lambda k: k['year'])[-1:]
    
    # This will append the most recent population recorded to the list
    pops.append(newlist[0]['value'])   
        
    # Create sorted list comprehension of dictionaries for ethnic group elements under 'country'
    ethnic_dict_list = [{'percent': subel.attrib['percentage'], 'value': subel.text} for subel in element.iterfind('ethnicgroup')]
        
    newlist = sorted(ethnic_dict_list, key=lambda k: k['percent'])[-1:]
    
    if len(newlist) > 0:
        ethnicities.append(newlist[0]['value'])  
        ethnic_percent.append(newlist[0]['percent'])
    else:
        ethnicities.append(None)
        ethnic_percent.append(None)
        
    # Check to see if country has provinces
    has_province = element.find('province')
    if has_province:
        cities = []
        for prov in element.iterfind('province'):
            for city in prov.iterfind('city'):
                city_dict = {}

                name = city.find('name').text
                city_dict['name'] = name
                                
                city_pop_list = [{'year': subel.attrib['year'], 'value': subel.text} for subel in city.iterfind('population')]
                new_city_pop_list = sorted(city_pop_list, key=lambda k: k['year'], reverse=True)

                if len(new_city_pop_list) > 0:
                    city_dict['pop'] = new_city_pop_list[0]['value']
                else:
                    city_dict['pop'] = 0
                                        
                cities.append(city_dict)
    else:
        # For each city in the country
        cities = []
        for city in element.iterfind('city'):
            city_dict = {}

            name = city.find('name').text
            city_dict['name'] = name
            
            # Get the most recent recorded population and sort it
            city_pop_list = [{'year': subel.attrib['year'], 'value': subel.text} for subel in city.iterfind('population')]
            new_city_pop_list = sorted(city_pop_list, key=lambda k: k['year'], reverse=True)

            if len(new_city_pop_list) > 0:
                city_dict['pop'] = new_city_pop_list[0]['value']
            else:
                city_dict['pop'] = 0
                
            cities.append(city_dict)
        
    print(cities)
#     top_city = sorted(cities, key=lambda k: k['pop'])
#     largest_city_pops.append(top_city['pop'])
#     largest_city_names.append(top_city['name'])
        
# Create Pandas dataframe with given information extracted from XML file.
country_df = pd.DataFrame(columns=columns)
country_df['name'] = country_list
country_df['country_code'] = country_codes
country_df['infant_mortality'] = mort_rates
country_df['recent_population'] = pops
country_df['top_ethnicity'] = ethnicities
country_df['ethnic_perc'] = ethnic_percent
country_df['ethnic_pop'] = country_df.apply(lambda x: calculatePopulation(x), axis=1)

[{'name': 'Tirana', 'pop': '418495'}, {'name': 'Shkodër', 'pop': '77075'}, {'name': 'Durrës', 'pop': '113249'}, {'name': 'Vlorë', 'pop': '79513'}, {'name': 'Elbasan', 'pop': '78703'}, {'name': 'Korçë', 'pop': '51152'}]
[{'name': 'Komotini', 'pop': 0}, {'name': 'Kavala', 'pop': '58790'}, {'name': 'Athina', 'pop': '664046'}, {'name': 'Peiraias', 'pop': '163688'}, {'name': 'Peristeri', 'pop': '139981'}, {'name': 'Acharnes', 'pop': '106943'}, {'name': 'Patra', 'pop': '213984'}, {'name': 'Kozani', 'pop': 0}, {'name': 'Kerkyra', 'pop': '102071'}, {'name': 'Ioannina', 'pop': '112486'}, {'name': 'Thessaloniki', 'pop': '325182'}, {'name': 'Iraklio', 'pop': '173993'}, {'name': 'Chania', 'pop': '108642'}, {'name': 'Ermoupoli', 'pop': 0}, {'name': 'Rhodes', 'pop': '115490'}, {'name': 'Tripoli', 'pop': 0}, {'name': 'Lamia', 'pop': '75315'}, {'name': 'Chalkida', 'pop': '102223'}, {'name': 'Larissa', 'pop': '162591'}, {'name': 'Volos', 'pop': '144449'}, {'name': 'Mytilini', 'pop': 0}, {'name': 'Karye

In [13]:
def findGreatestValueOfElement(target_el, comparison_var, col_name, col_name_two):
    # target_el = 'airport', 'lake', 'river'
    # comparison_var = 'elevation', 'area'
        
    element_dict = {}
    biggest_element = []
    element_max_value = []
    element_countries = []
    
    for element in document_tree.iterfind(target_el):        
        element_country = element.attrib['country'].split(' ')
        for country in element_country:
            
            # Add country values to element_countries ['Albania', 'Russia', 'Georgia']
            if country not in element_countries:
                element_countries.append(country)
                                    
            # Add [{'comp_var': 3452, 'name': 'amazon river'}] or [{'comp_var: 0': ''}] if it 
            # does not exist.
            if country not in element_dict:
                if element.find(comparison_var) is None or element.find(comparison_var).text == None:
                    element_dict[country] = [{'comp_var': 0, 'name': element.find('name').text}]
                else:
                    element_dict[country] = [{'comp_var': element.find(comparison_var).text, 'name': element.find('name').text}]
            else:
                # append to the list [{'comp_var': 3452, 'name': 'amazon river'}, {'comp_var': 284, 'random river'}]
                # because a country can have multiple airports/rivers/lakes
                
                if element.find(comparison_var) is None or element.find(comparison_var).text == None:
                    element_dict[country].append({'comp_var': 0, 'name': element.find('name').text})
                else:
                    element_dict[country].append({'comp_var': element.find(comparison_var).text, 'name': element.find('name').text})
                
    # this takes care of any countries that don't have the comparison variable. so it matches 
    # the list length of the original countries dataset when appending the biggest element column.
    for code in country_df['country_code'].tolist():
        if code not in element_dict:
            element_dict[code] = [{'comp_var': 0, 'name': 'NA'}]
    
    # sort all the lists so the comparison variable of type int or float is on top.
    for item in element_dict:
        try:
            element_dict[item] = sorted(element_dict[item], key=lambda k: int(k['comp_var']))[-1:]
        except:
            element_dict[item] = sorted(element_dict[item], key=lambda k: float(k['comp_var']))[-1:]
            
        # remove all other objects, and turn item into one dictionary rather than a list.
        element_dict[item] = element_dict[item][0]
        biggest_element.append(element_dict[item]['name'])
        element_max_value.append(element_dict[item]['comp_var'])

    country_df[col_name] = biggest_element
    country_df[col_name_two] = element_max_value

In [14]:
findGreatestValueOfElement('airport', 'elevation', 'highest_airport', 'airport_elevation')
findGreatestValueOfElement('river', 'length', 'longest_river', 'river_length')
findGreatestValueOfElement('lake', 'area', 'biggest_lake', 'lake_area')

In [18]:
def findTopValue(df, column_measurement):
    try:
        df[column_measurement] = df[column_measurement].astype('int')
    except:
        df[column_measurement] = df[column_measurement].astype('float')
        
    max_value = df.sort_values(by=column_measurement, ascending=False).head(1)
    return max_value

# Biggest airport, largest lake, and longest river

In [19]:
findTopValue(country_df, 'airport_elevation')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
23,Russia,R,7.08,143666931,Russian,79.8,114646200.0,Elbe,1091,Lake Genezareth,165,El Alto Intl,4063


In [20]:
findTopValue(country_df, 'river_length')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
72,Thailand,THA,9.86,65981659,Thai,75,49486244.25,Amazonas,6448,,0,Bellegarde,396


In [21]:
findTopValue(country_df, 'lake_area')

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
17,Switzerland,CH,3.73,8139631,German,65,5290760.15,Donau,2845,Caspian Sea,386400.0,Minsk 2,204


# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [35]:
country_df.sort_values('ethnic_pop', ascending=False).iloc[:10][['name', 'top_ethnicity', 'ethnic_pop']]

Unnamed: 0,name,top_ethnicity,ethnic_pop
55,China,Han Chinese,1245059000.0
67,India,Indo-Aryan,871815600.0
120,United States,European,254958100.0
202,Nigeria,African,162651600.0
65,Bangladesh,Bengali,146776900.0
98,Japan,Japanese,126534200.0
23,Russia,Russian,114646200.0
104,Philippines,Malay,88182650.0
87,Egypt,Eastern Hamitic,82830380.0
74,Vietnam,Viet/Kinh,76078380.0


# 10 countries with the lowest infant mortality rates

In [38]:
country_df.sort_values('infant_mortality').iloc[:10][['name', 'infant_mortality']]

Unnamed: 0,name,infant_mortality
38,Monaco,1.81
30,Romania,10.16
153,Fiji,10.2
69,Brunei,10.48
132,Grenada,10.5
237,Mauritius,10.59
124,Panama,10.7
243,Seychelles,10.77
102,United Arab Emirates,10.92
113,Barbados,10.93


In [39]:
country_df

Unnamed: 0,name,country_code,infant_mortality,recent_population,top_ethnicity,ethnic_perc,ethnic_pop,longest_river,river_length,biggest_lake,lake_area,highest_airport,airport_elevation
0,Albania,AL,13.19,2800138,Albanian,95,2.660131e+06,Thjorsa,230,Saimaa,4370.0,Kabul Intl,1792
1,Greece,GR,4.78,10816286,Greek,93,1.005915e+07,Glomma,604,Mjoesa-See,368.0,Tirana Rinas,38
2,Macedonia,MK,7.9,2059794,Macedonian,64.2,1.322388e+06,Dalaelv,520,Vaenern,5648.0,Tamanrasset,1377
3,Serbia,SRB,6.16,7120666,Serb,82.9,5.903032e+06,Kemijoki,550,Arresoe,40.2,Pago Pago Intl,10
4,Montenegro,MNE,,620029,Bosniak,8,4.960232e+04,Lena,4400,Loch Lomond,71.0,Lubango,1762
5,Kosovo,KOS,,1733872,Albanian,92,1.595162e+06,Thames,346,Bodensee,538.5,Wallblake,39
6,Andorra,AND,3.69,78115,African,5,3.905750e+03,Rhein,1324,Bodensee,538.5,V C Bird Intl,19
7,France,F,3.31,64933400,,,,Maas,925,Lac Leman,581.0,Salta,1246
8,Spain,E,3.33,46815916,Mediterranean Nordic,100,4.681592e+07,Rhein,1324,Lac Leman,581.0,Zvartnots,865
9,Austria,A,4.16,8499759,Austrian,91.1,7.743280e+06,Tajo,1007,Lago di Garda,370.0,Reina Beatrix Intl,18
