# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( 'C:/Users/cantd/OneDrive/Springboard/data_wrangling_xml/data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse('C:/Users/cantd/OneDrive/Springboard/data_wrangling_xml/data/mondial_database.xml')

In [6]:
#1. 10 countries with the lowest infant mortality rates

import pandas as pd
from pandas import DataFrame, Series

pop_dict = {} # create a dictionary to store the country (key), and most recent population and infant mortality rate (values)

for element in document.iterfind('country'): # for every country
    for population in element.iterfind('population'): # find all the population values
        try:
            pop_dict[element.find('name').text] = [int(population.text), float(element.find('infant_mortality').text)]
            # associate the population value and infant mortality rate with the country (in the dictionary)
        except:
            pass # if there are any issues, pass
        
pop_df = DataFrame.from_dict(pop_dict, orient = 'index') # create a dataframe from the dictionary (row by row)

pop_df.columns = ['population', 'infant mortality rate'] # set column names

pop_df.sort_values(by='infant mortality rate').head(10) # display the 10 lowest infant mortality rates

Unnamed: 0,population,infant mortality rate
Monaco,36845,1.81
Japan,127298000,2.13
Norway,5051275,2.48
Bermuda,64237,2.48
Singapore,5076700,2.53
Sweden,9555893,2.6
Czech Republic,10562214,2.63
Hong Kong,7071576,2.73
Macao,552503,3.13
Iceland,318452,3.15


In [7]:
#2. 10 cities with the largest population

pop_df.sort_values(by='population', ascending=False).head(10) # display the 10 largest populations

Unnamed: 0,population,infant mortality rate
China,1360720000,14.79
India,1210854977,43.19
United States,318857056,6.17
Indonesia,252124458,25.16
Brazil,202768562,19.21
Pakistan,173149306,57.48
Nigeria,164294516,74.09
Bangladesh,149772364,45.67
Russia,143666931,7.08
Japan,127298000,2.13


In [8]:
#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

"""build a list of ethnic groups"""
ethnic_list = [] # initialize list of all ethnic groups

for child in document.getroot(): # for each ethnic group
    try:
        if child.find('ethnicgroup').text not in ethnic_list: # if the ethnic group is not in the list
            ethnic_list.append(child.find('ethnicgroup').text) # add the ethnic group to the list
    except:
        pass # if there's an issue, move to the next item

"""build a dictionary containing the country, and associated ethnic groups and percentages"""    
group_dict = {} # initialize the dictionary to store the country, ethnic groups, and percentage of population

for element in document.iterfind('country'): # for each country
    group_list = [] # initialize a blank list to store the ethnic groups and their percentages
    for group in element.iterfind('ethnicgroup'): # for each ethnic group
        group_list.append([float(group.attrib['percentage']), group.text]) # add that group and its percentage to the list
        try:
            group_dict[element.find('name').text] = group_list # set the list of lists made above as the value for the 
                                                               # country (key) in the dictionary
        except:
            pass # if there's an issue, move on to the next item

"""add the ethnic groups to the dataframe (created in In [6]) using the list created above"""        
for group in ethnic_list: # for each ethnic group
    pop_df[group] = 0 # add a column filled with zero's

"""use the dictionary created above to set the values in the ethinic column to the percentage divided by 100"""
for country in group_dict: # for each country
    for group in group_dict[country]: # for each group in the country
        pop_df.loc[country, group[1]] = group[0]/100 # set the country, group value equal to the percentage/100
        
pop_df.fillna(0, inplace = True) # replace any missing values with zero

"""replace the ethnic group percentages with population totals"""
for column in pop_df.columns[2:]: # for each ethnic group column
    pop_df[column] = pop_df[column]*pop_df['population'] # set the values to the percentage * the country's population
   
"""create a dataframe with the totals of each ethnic group"""
ethic_pop_list = [] # initialize an ethnic population list

for column in pop_df.columns[2:]: # for each ethnic group
     ethic_pop_list.append([column, pop_df[column].sum()])  # add the group's name and the sum of all its populations 
                                                            # to the ethinic population list
        
ethic_pop_df = DataFrame(ethic_pop_list, columns = ['ethnic group', 'population']) # create a dataframe from the list

ethic_pop_df.sort_values(by='population', ascending=False).head(10) # display the 10 largest populations

Unnamed: 0,ethnic group,population
39,Han Chinese,1245059000.0
148,Indo-Aryan,871815600.0
58,European,494871800.0
70,African,318325100.0
48,Dravidian,302713700.0
66,Mestizo,157734400.0
46,Bengali,146776900.0
18,Russian,131857000.0
61,Japanese,126534200.0
51,Malay,121993300.0


In [9]:
#4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

def google(item, characteristic):
    """create a dictionary of item / characteristic pairs"""
    temp_dict = {} # create a dictionary to store the item and it's characteristics

    for element in document.iterfind(item): # for every item
        try:
            temp_dict[element.find('name').text] = int(element.find(characteristic).text)
                # associate the the item with the desired characteristic (in the dictionary)
        except:
            pass # if there are any issues, pass

    "make a dataframe from the dictionary"    
    df = DataFrame.from_dict(temp_dict, orient = 'index') # create a dataframe from the dictionary (row by row)

    df.columns = [characteristic] # set column name as characteristic

    """use the dataframe to find the top value, and make a single item list from it"""

    top_list = df.sort_values(by=characteristic, ascending=False).head(1).index.tolist()
    
    """find the countries associated with the item"""
    
    #country_code_list = []
    for element in document.iterfind(item): # for every item
        if element.find('name').text==top_list[0]: # if the item matches the name of the item
            country_code_list = element.attrib['country'].split() # add the name of the country code to the list
        
    for element in document.iterfind('country'): # for every country
        for code in country_code_list: # for every code in the country_code_list
            if element.attrib['car_code']==code: # if the country matches the country code
                print element.find('name').text # print it
                
    print 'The item you seek is called the ' + top_list[0] + ' and it is located in the countries listed above.'

In [10]:
# a) longest river

google('river', 'length')

Colombia
Brazil
Peru
The item you seek is called the Amazonas and it is located in the countries listed above.


In [11]:
# b) largest lake

google('lake', 'area')

Russia
Iran
Turkmenistan
Azerbaijan
Kazakhstan
The item you seek is called the Caspian Sea and it is located in the countries listed above.


In [12]:
# c) airport at highest elevation

google('airport', 'elevation')

Bolivia
The item you seek is called the El Alto Intl and it is located in the countries listed above.
