# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [8]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [9]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [10]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [11]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [12]:
from xml.etree import ElementTree as ET
document = ET.parse( './data/mondial_database.xml' )
root=document.getroot()
root.tag

'mondial'

To find the Top 10 countries with lowest infant mortality rates:

In [13]:
import pandas as pd
df=pd.DataFrame(columns=["names","infant_mortality"])

for country in document.findall('country'):
     names=country.find('name').text
     for child in country:
         if child.tag=='infant_mortality':
            infant_mortality=float(child.text)
     df.loc[len(df)] = [names,infant_mortality]
    
#print(mortality)
#df.loc[len(df)] = [names,mortality]
#df = df.append(pd.Series(names, index=['country_names']), ignore_index=True)
#df
#len(names)
#names=names[0:len(mortality)]
#df=pd.DataFrame({'country_names':names,'infant_mortality':mortality})
df.sort_values(by ='infant_mortality').head(10)




Unnamed: 0,names,infant_mortality
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


To find the Top 10 cities with largest population:

In [14]:
df2 = pd.DataFrame(columns = ["name","population"])
for country in document.findall('country'):
    for city in country.iter('city'):
        name=city.find('name').text
        pop=city.findall('population')
        #print pop We find many null values in the list and hence we can handle errors using try and except
        try:
            population1 = int(pop[-1].text)
        except:
            population1 = int(0)

        df2.loc[len(df2)] = [name,population1]

df2.sort_values(by= 'population', ascending= False).head(10)


Unnamed: 0,name,population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


To find the Top 10 ethnic groups with largest overall population:

In [15]:
# import pandas as pd
df3 = pd.DataFrame(columns = ["name","population","ethnicgroup","percentage","total_estimation"])
for country in document.findall('country'):
    for city in country.iter('city'):
        name=city.find('name').text
        pop=city.findall('population')
        #print pop We find many null values in the list and hence we can handle errors using try and except
        try:
            population1 = int(pop[-1].text)
        except:
            population1 = int(0)

        #df2.loc[len(df2)] = [name,population1]
        for ethnicgroup in country.findall('ethnicgroup'):
            eg = ethnicgroup.text
            percent = float(ethnicgroup.attrib['percentage'])
            percent = (percent/100)
            total_estimate=percent*population1
            df3.loc[len(df3)] = [name, population1, eg, percent,total_estimate]


#df3.sort_values(by='total_estimation', ascending= False).head(10)
df3.groupby('ethnicgroup').sum().sort_values(by ='total_estimation', ascending = False)


Unnamed: 0_level_0,population,percentage,total_estimation
ethnicgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Han Chinese,325121466.0,280.9050,2.974861e+08
European,404398052.0,438.2042,1.766052e+08
Indo-Aryan,128759604.0,70.5600,9.270691e+07
Mestizo,116967288.0,159.5340,6.429845e+07
Russian,129331943.0,152.2250,6.368289e+07
African,295952487.0,164.6470,6.155356e+07
Japanese,48068708.0,69.5800,4.778030e+07
Mulatto,130939182.0,91.8860,4.287923e+07
Turkish,88870042.0,78.4072,3.996573e+07
Arab,83413197.0,59.2040,3.489736e+07


Aiport with Highest elevation:

In [16]:

elevation_high=int(0)
for air in document.findall('airport'):
    name1=air.find('name').text
    country1=air.get('country')
    for child in air:
        if child.tag=='elevation':
            try:
                air_elevation=float(child.text)
            except:
                air_elevation= int(0)
            #print(air_elevation)
            if air_elevation>elevation_high:
                elevation_high=air_elevation
                airport_name=name1
                country_name=country1
                
print(elevation_high)
print(airport_name)
print(country_name)
#print("The airport name is"airport_name + ' '+country_name)

4063.0
El Alto Intl
BOL


Longest River

In [17]:
river_length=int(0)                
for riv in document.findall('river'):
    name2=riv.find('name').text
    country2=riv.get('country')
    for child in riv:
        if child.tag=='length':
            try:
                river_length_in=float(child.text)
            except:
                river_length_in= int(0)
            #print(air_elevation)
            if river_length_in>river_length:
                river_length=river_length_in
                river_name=name2
                country_river=country2
                
print(river_length)
print(river_name)
print(country_river)

6448.0
Amazonas
CO BR PE


Largest Lake:

In [18]:
lake_area=int(0)                
for lake in document.findall('lake'):
    name3=lake.find('name').text
    country3=lake.get('country')
    for child in lake:
        if child.tag=='area':
            try:
                lake_area_in=float(child.text)
            except:
                lake_area_in= int(0)
            #print(air_elevation)
            if lake_area_in>lake_area:
                lake_area=lake_area_in
                lake_name=name3
                country_name=country3
                
print(lake_area)
print(lake_name)
print(country_name)

386400.0
Caspian Sea
R AZ KAZ IR TM
