# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the example XML file; `document_tree` is reused by the example cells that follow.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [7]:
# Print the <name> text of every element that sits directly under the root.
root = document_tree.getroot()
for node in root:
    name_node = node.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [15]:
# Print each country on its own line, followed by a comma-separated list of its cities.
for element in document_tree.iterfind('country'):
    # The trailing comma that used to follow this print(...) was a Python 2
    # idiom; under Python 3 it only built a throwaway tuple, so it is dropped.
    print('* ' + element.find('name').text + ':')
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    # It walks the whole subtree of the country, not only its direct children.
    city_names = [city.find('name').text for city in element.iter('city')]
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [43]:
# Parse the exercise data file; this rebinds `document_tree`, replacing the example tree.
document_tree = ET.parse( './data/mondial_database.xml' )

In [141]:
#1
# Ten countries with the lowest infant mortality rate.
document_tree = ET.parse('./data/mondial_database.xml')
dic = {}
for child in document_tree.getroot():
    rate_node = child.find('infant_mortality')
    if rate_node is None:
        continue  # this element carries no <infant_mortality> figure
    rate = float(rate_node.text)
    dic[child.find('name').text] = rate
# One row per name, then display the ten smallest rates.
df = pd.DataFrame(list(dic.items()), columns=['Country', 'Mortality rate'])
df.sort_values(by='Mortality rate', ascending=True).head(10)

Unnamed: 0,Country,Mortality rate
57,Monaco,1.81
71,Japan,2.13
18,Norway,2.48
156,Bermuda,2.48
222,Singapore,2.53
210,Sweden,2.6
170,Czech Republic,2.63
10,Hong Kong,2.73
190,Macao,3.13
209,Iceland,3.15


In [140]:
#2
# Ten cities with the largest population, using each city's most recent figure.
rFile = './data/mondial_database.xml'  # relative path as in cell #1, not a machine-specific absolute one


def _latest_population(city):
    """Return the population of *city* for its most recent year, or None.

    Every direct <population> child is read and keyed by the integer value of
    its ``year`` attribute; if a year occurs twice the later element wins.
    A city without any <population> child yields None.
    """
    by_year = {int(p.attrib['year']): int(p.text) for p in city.findall('population')}
    if not by_year:
        return None
    return by_year[max(by_year)]


outDic = {}
document_tree = ET.parse(rFile)
for country in document_tree.findall('country'):
    countryName = country.find('name').text
    # Cities sit either directly under the country or one level down inside a
    # province; direct children are handled first, then the province cities.
    # findall() is used instead of getchildren(), which was removed in Python 3.9.
    for city in country.findall('city') + country.findall('province/city'):
        latestPop = _latest_population(city)
        if latestPop is not None:
            outDic[countryName + ": " + city.find('name').text] = latestPop

dfOut = pd.DataFrame(list(outDic.items()), columns=['City', 'Latest Population'])
dfOut.sort_values(by='Latest Population', ascending=False).head(10)

Unnamed: 0,City,Latest Population
1969,China: Shanghai,22315474
1218,Turkey: Istanbul,13710512
2320,India: Mumbai,12442373
2166,Russia: Moskva,11979529
1444,China: Beijing,11716620
3000,Brazil: São Paulo,11152344
346,China: Tianjin,11090314
2487,China: Guangzhou,11071424
2385,India: Delhi,11034555
938,China: Shenzhen,10358381


In [163]:
    #3
    # Ten ethnic groups with the largest overall population: each country's
    # latest population is multiplied by the group's percentage, and the
    # products are summed per group over all countries.
    rFile = './data/mondial_database.xml'  # relative path as in cell #1, not a machine-specific absolute one
    document_tree = ET.parse(rFile)  # parsed once; both passes below reuse this tree

    # Pass 1: latest population per country.
    popDic = {}
    for country in document_tree.findall('country'):
        # Collect ALL <population> children before picking the newest one.
        # Years are converted to int so the comparison is numeric, not lexicographic.
        yearPop = {int(p.attrib['year']): int(p.text) for p in country.findall('population')}
        if yearPop:
            popDic[country.find('name').text] = yearPop[max(yearPop)]
        # A country without population data is left out rather than silently
        # inheriting the figure computed for the previous country.

    # Pass 2: convert each ethnic group's percentage into a head count and accumulate.
    ethDic = {}
    for country in document_tree.findall('country'):
        countryName = country.find('name').text
        if countryName not in popDic:
            continue  # no population figure, so percentages cannot be scaled
        # findall() replaces getchildren(), which was removed in Python 3.9.
        for ethnicgroup in country.findall('ethnicgroup'):
            ethName = ethnicgroup.text
            ethPC = float(ethnicgroup.attrib['percentage']) / 100.0
            ethDic[ethName] = ethDic.get(ethName, 0.0) + (ethPC * popDic[countryName])

    dfOut = pd.DataFrame(list(ethDic.items()), columns=['Ethnicity', 'Latest Population'])
    dfOut.sort_values(by='Latest Population', ascending=False).head(10)

Unnamed: 0,Ethnicity,Latest Population
77,Han Chinese,1245059000.0
235,Indo-Aryan,871815600.0
131,European,494872200.0
52,African,318325100.0
107,Dravidian,302713700.0
149,Mestizo,157734400.0
262,Bengali,146776900.0
89,Russian,131857000.0
100,Japanese,126534200.0
260,Malay,121993600.0


In [10]:
    #4a
    # Longest river: name, length and the first country it lists.
    rFile = './data/mondial_database.xml'  # relative path as in cell #1, not a machine-specific absolute one
    document_tree = ET.parse(rFile)

    # Map each country's car_code attribute to its name; the river, lake and
    # airport elements refer to countries through these codes.
    countryDic = {}
    for country in document_tree.findall('country'):
        countryDic[country.attrib['car_code']] = country.find('name').text

    # Scan all rivers, remembering the one with the greatest length.
    rivName = None
    rivCountry = None
    longestRivLen = 0
    for river in document_tree.findall('river'):
        lengthNode = river.find('length')
        if lengthNode is None:
            continue  # river has no <length> element
        # float() first so a value written with a decimal point still parses;
        # the result is truncated to an int.
        rivLen = int(float(lengthNode.text))
        if rivLen > longestRivLen:
            rivName = river.find('name').text
            # The country attribute may hold several codes; use the first one.
            rivCountry = countryDic[river.attrib['country'].split()[0]]
            longestRivLen = rivLen

    if rivName is None:
        # Guard: without any measured river the summary line would hit unset names.
        print('No river with a length was found')
    else:
        print('Name: ' + rivName + ', length: ' + str(longestRivLen) + ', country: ' + rivCountry)

Name: Amazonas, length: 6448, country: Colombia


In [15]:
    #4b
    # Largest lake by area; uses document_tree and countryDic bound in cell #4a.
    largestLake = 0
    for lake in document_tree.findall('lake'):
        areaNode = lake.find('area')
        if areaNode is None:
            continue  # lake has no <area> element
        lakeArea = int(float(areaNode.text))
        if lakeArea <= largestLake:
            continue  # not a new maximum
        lakeName = lake.find('name').text
        # Associate the lake with the first country code in its list.
        lakeCountry = lake.attrib['country'].split()[0]
        largestLake = lakeArea

    summaryParts = ['Name: ', lakeName, ', area: ', str(largestLake), ', country: ', countryDic[lakeCountry]]
    print(''.join(summaryParts))

Name: Caspian Sea, area: 386400, country: Russia


In [16]:
    #4c
    # Airport at the highest elevation; uses document_tree and countryDic bound in cell #4a.
    highestAirport = 0
    for airport in document_tree.findall('airport'):
        elevationNode = airport.find('elevation')
        if elevationNode is not None:
            # An <elevation> element with no text yields None; fall back to '0'.
            elevationText = str(elevationNode.text or '0')
            airportHeight = int(float(elevationText))
            if highestAirport < airportHeight:
                airportName = airport.find('name').text
                # Use the first country code listed on the airport.
                airportCountry = airport.attrib['country'].split()[0]
                highestAirport = airportHeight

    summary = 'Name: ' + airportName + ', elevation: ' + str(highestAirport) + ', country: ' + countryDic[airportCountry]
    print(summary)

Name: El Alto Intl, elevation: 4063, country: Bolivia
