# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [None]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

In [None]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [4]:
document = ET.parse( './data/mondial_database.xml' )

### 1. 10 countries with the lowest infant mortality rates

In [30]:
# create dataframe to store country names and infant mortality rates
df = pd.DataFrame(columns=('country', 'infmort'))

In [31]:
# iterate through countries and add names and infant mortality rates to dataframe
i = 0
root = document.getroot()
for child in root.findall('country'):
    if child.find('infant_mortality') != None:
        infmort = float(child.find('infant_mortality').text)
        name = child.find('name').text
        df.loc[i] = name, infmort
        i += 1

In [32]:
# find 10 countries with lowest mortality rates
df.sort_values('infmort').head(10)

Unnamed: 0,country,infmort
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


### 2. 10 cities with the largest population

In [33]:
# create dataframe to store city names and populations
df = pd.DataFrame(columns=('city', 'pop'))

In [34]:
# iterate through countries and add city names and populations to dataframe
for child in document.iterfind('country'):
    if child.find('city') != None:
        for subchild in child.getiterator('city'):
            city = subchild.find('name').text
            if subchild.find('population') == None:
                pop = 0
            else:
                pop = float(subchild.find('population').text)
        df.loc[i] = city, pop
        i += 1

In [35]:
# identify 10 cities with largest populations
df.sort_values('pop', ascending=False).head(10)

Unnamed: 0,city,pop
270,Hong Kong,7055071.0
291,Singapore,1445929.0
258,Yerevan,1200000.0
236,Rīga,900000.0
364,Lomé,839566.0
293,Tainan,707658.0
283,Bayrūt,702000.0
306,Santiago,591985.0
292,Colombo,587647.0
312,Kingston,586930.0


### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [36]:
# create dataframe to store country names, ethnic groups, and populations
df = pd.DataFrame(columns=('ethgroup', 'pop'))

In [37]:
# iterate through countries and add ethnic groups and populations to dataframe
i = 0
root = document.getroot()
for child in root.findall('country'):
    if child.find('ethnicgroup') != None:
        ethgroup = child.find('ethnicgroup').text
        pop = float(child.find('population').text)
    
    df.loc[i] = ethgroup, pop
    i += 1

In [38]:
# add up populations of each ethnic group
df = df.groupby('ethgroup')[['pop']].sum() 

In [39]:
# identify 10 groups with largest overall populations
df.sort_values('pop', ascending=False).head(10)

Unnamed: 0_level_0,pop
ethgroup,Unnamed: 1_level_1
Han Chinese,543776080.0
Japanese,328797880.0
European,297887587.0
Dravidian,238396327.0
Javanese,217776576.0
Russian,102798657.0
German,96828641.0
Arab,64293999.0
African,51894052.0
Mestizo,50797340.0


### 4. Name and country of a) longest river, b) largest lake and c) airport at highest elevation

#### Country with longest river

Construct dictionary of country IDs and names

In [40]:
di = pd.DataFrame(columns=('countryID', 'country'))

i = 0
root = document.getroot()
for child in root.findall('country'):
    countryID = child.get('car_code')
    country = child.find('name').text
    di.loc[i] = countryID, country
    i += 1

# convert di to dictionary
di = di.groupby(['countryID']).first()
di = di['country'].sort_index()

Construct dataframe of country IDs, country names, river names, and river lengths

In [41]:
df = pd.DataFrame(columns=('countryID', 'country', 'river', 'length'))

In [42]:
# add country code, river name, and river length to dataframe:
i = 0
root = document.getroot()
for child in root.findall('river'):
    name = child.find('name').text
    countryID = child.get('country')
    # if length is missing, enter into dataframe as 0
    if child.find('length') == None:
        length = 0
    else:
        length = float(child.find('length').text)
    df.loc[i] = countryID, 'NA', name, length
    i += 1

Identify country with longest river

In [43]:
df = df.sort_values('length', ascending=False).head(1)

# Return name of country/countries
for i in df['countryID']:
    countryIDs = i.split()
    
for i in countryIDs:
    print(di[i])

Colombia
Brazil
Peru


#### Country with largest lake

Construct dataframe of country IDs, country names, lake names, and lake areas

In [44]:
df = pd.DataFrame(columns=('countryID', 'country', 'lake', 'area'))

In [45]:
# add country code, lake name, and lake area to dataframe:
i = 0
root = document.getroot()
for child in root.findall('lake'):
    name = child.find('name').text
    countryID = child.get('country')
    # if area is missing, enter into dataframe as 0
    if child.find('area') == None:
        area = 0
    else:
        area = float(child.find('area').text)
    df.loc[i] = countryID, 'NA', name, area
    i += 1

Identify country with largest lake

In [46]:
df = df.sort_values('area', ascending=False).head(1)

# Return name of country/countries
for i in df['countryID']:
    countryIDs = i.split()
    
for i in countryIDs:
    print(di[i])

Russia
Azerbaijan
Kazakhstan
Iran
Turkmenistan


#### Country with airport at highest elevation

Construct dataframe of country IDs, country names, airport names, and airport elevations

In [47]:
df = pd.DataFrame(columns=('countryID', 'country', 'airport', 'elevation'))

In [48]:
# add country code, airport name, and airport elevation to dataframe:
i = 0
root = document.getroot()
for child in root.findall('airport'):
    name = child.find('name').text
    countryID = child.get('country')
    if child.find('elevation').text == None:
        elev = 0
    else:
        elev = float(child.find('elevation').text)
    df.loc[i] = countryID, 'NA', name, elev
    i += 1

Identify country with highest airport

In [49]:
df = df.sort_values('elevation', ascending=False).head(1)

# Return name of country/countries
for i in df['countryID']:
    countryIDs = i.split()
    
for i in countryIDs:
    print(di[i])

Bolivia
