# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [4]:
document = ET.parse( './data/mondial_database.xml' )

In [7]:
#first look at document_tree to get a general idea of the file layout
document_tree.findall('country')

[<Element 'country' at 0x3f3fa20>,
 <Element 'country' at 0x40a6b38>,
 <Element 'country' at 0x4067278>,
 <Element 'country' at 0x40f4048>,
 <Element 'country' at 0x4102048>,
 <Element 'country' at 0x4102b00>,
 <Element 'country' at 0x41112b0>]

In [8]:
root = document_tree.getroot()

In [9]:
#see what variable names are
for node in root.getiterator():
    print node.tag, node.attrib, node.text, node.tail

mondial {} 
    None
country {'memberships': 'org-BSEC org-CEI org-CD org-SELEC org-CE org-EAPC org-EBRD org-EITI org-FAO org-IPU org-IAEA org-IBRD org-ICC org-ICAO org-ICCt org-Interpol org-IDA org-IFRCS org-IFC org-IFAD org-ILO org-IMO org-IMF org-IOC org-IOM org-ISO org-OIF org-ITU org-ITUC org-IDB org-MIGA org-NATO org-OSCE org-OPCW org-OAS org-OIC org-PCA org-UN org-UNCTAD org-UNESCO org-UNIDO org-UPU org-WCO org-WFTU org-WHO org-WIPO org-WMO org-UNWTO org-WTO', 'area': '28750', 'car_code': 'AL', 'capital': 'cty-Albania-Tirane'} 
       
   
name {} Albania 
      
population {'measured': 'est.', 'year': '1950'} 1214489 
      
population {'measured': 'est.', 'year': '1960'} 1618829 
      
population {'measured': 'est.', 'year': '1970'} 2138966 
      
population {'measured': 'est.', 'year': '1980'} 2734776 
      
population {'measured': 'est.', 'year': '1990'} 3446882 
      
population {'year': '1997'} 3249136 
      
population {'measured': 'est.', 'year': '2000'} 3304948 
  

In [15]:
import pandas as pd

In [16]:
#find infant mortality rates in small file
def get_inf_mort(xml):
    InfMortList = []
    for country in xml.findall('country'):
        name = country.find('name').text
        try:
            inf_mort = float(country.find('infant_mortality').text)
        except AttributeError:
            inf_mort = None
        InfMortList.append([name, inf_mort])
    
    #create data frame
    InfMortDF = pd.DataFrame(InfMortList, columns=['Country', 'Infant Mortality'])
    InfMortDF.sort_values("Infant Mortality", inplace=True)
    return InfMortDF.head(10)
        
get_inf_mort(document_tree)

Unnamed: 0,Country,Infant Mortality
6,Andorra,3.69
1,Greece,4.78
3,Serbia,6.16
2,Macedonia,7.9
0,Albania,13.19
4,Montenegro,
5,Kosovo,


In [17]:
#now find bottom 10 infant mortality rates in large file
get_inf_mort(document)

Unnamed: 0,Country,Infant Mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [32]:
#top 10 cities with the largest population - latest estimate
def get_cities(xml):
    CityList = []
    ec = 0 #error counter
    for country in xml.findall('country'):
        for city in country.findall('city'):
            names = city.iterfind('name')
            name1 = next(names).text
            
            #get an alternative name if one exists
            try: name2 = next(names).text
            except StopIteration: name2 = ""
            
            #get population value
            try:
                """Iterate over all population elements.  For each one, convert the 'year' attribute
                into an integer.  Pick the population value corresponding to the maximum converted year
                value, and convert it to a float."""
                pop = float(max(city.iterfind('population'), key=lambda p: int(p.attrib['year'])).text)
            except ValueError: #no population values so arg of "max" function is empty iterator
                pop = None
                ec += 1
            CityList.append([name1, name2, pop])
    
    CityDF = pd.DataFrame(CityList, columns=['City Name', 'Alt Name', 'Population'])
    CityDF.sort_values('Population', inplace=True, ascending=False)
    print CityDF.shape, ec
    return CityDF.head(10)

get_cities(document_tree)

(14, 3) 0


Unnamed: 0,City Name,Alt Name,Population
8,Beograd,,1639121
6,Skopje,,514967
0,Tirana,Tirane,418495
9,Novi Sad,,335701
10,Niš,,257867
12,Prishtine,Pristina,198214
11,Podgorica,,150977
2,Durrës,,113249
7,Kumanovo,,107745
3,Vlorë,,79513


In [33]:
#now get top 10 cities in big document
get_cities(document)

(430, 3) 36


Unnamed: 0,City Name,Alt Name,Population
176,Seoul,,9708483
164,Al Qahirah,Cairo,8471859
80,Bangkok,,7506700
128,Hong Kong,,7055071
92,Ho Chi Minh,Saigon,5968384
212,Singapore,,5076700
163,Al Iskandariyah,Alexandria,4123869
216,New Taipei,,3939305
177,Busan,Pusan,3403135
107,Pyongyang,,3255288


In [37]:
#find top 10 ethnic groups by population across all countries
def get_ethnic(xml):
    EthnicList = []
    ec = 0 #error counter
    for country in xml.findall('country'):
        cname = country.find('name')
        
        #get most recent country population
        try:
            """Iterate over all population elements.  For each one, convert the 'year' attribute
            into an integer.  Pick the population value corresponding to the maximum converted year
            value, and convert it to a float."""
            cpop = float(max(country.iterfind('population'), key=lambda p: int(p.attrib['year'])).text)
            
            #calculate population of each group in the country
            for ethnic in country.findall('ethnicgroup'):
                ename = ethnic.text
                epct = 0.01 * float(ethnic.attrib['percentage'])
                epop = int(epct * cpop) #round to nearest integer
                EthnicList.append([cname, ename, epop])        
        
        except ValueError: #no population values so arg of "max" function is empty iterator
            ec += 1        
    
    #put each country and ethnic group in a data frame
    EthnicDF = pd.DataFrame(EthnicList, columns=['Country', 'Ethnic Group', 'Population'])
    print EthnicDF.shape, ec
    
    #sum by ethnic group
    return EthnicDF.groupby('Ethnic Group').sum().sort_values('Population', ascending=False).head(10)

#test with small doc
get_ethnic(document_tree)

(25, 3) 0


Unnamed: 0_level_0,Population
Ethnic Group,Unnamed: 1_level_1
Greek,10143149
Serb,6138517
Albanian,4805362
Macedonian,1322387
Montenegrin,330697
Hungarian,277705
Bosniak,177773
Roma,99689
Serbian,86693
Turkish,80331


In [38]:
#now find top 10 ethnic groups in big doc
get_ethnic(document)

(628, 3) 0


Unnamed: 0_level_0,Population
Ethnic Group,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


In [39]:
"""Comment: ethnic groups don't seem to be entirely consistent here.  For instance one might expect 
'Russian' to be a subset of 'European'.  Ideally we would have different levels of ethnic groups identified
in the XML file, but this doesn't appear possible with the data we have, so in a real 
data science exercise, we would explore the smaller groups to see which are subsets of the larger ones
to avoid reporting misleading numbers."""

"Comment: ethnic groups don't seem to be entirely consistent here.  For instance one might expect \n'Russian' to be a subset of 'European'.  Ideally we would have different levels of ethnic groups identified\nin the XML file, but this doesn't appear possible with the data we have, so in a real \ndata science exercise, we would explore the smaller groups to see which are subsets of the larger ones\nto avoid reporting misleading numbers."

In [54]:
#find longest river, largest lake, and highest-elevation airport
#in small file, there are no examples of river lengths, lake sizes, or airport elevations
#need to look at large file to see format...

#create dictionary of country abbreviations
AbbrDict = {}
for country in document.findall('country'):
    AbbrDict[country.attrib['car_code']] = country.find('name').text

def highest_airport(xml):
    AirportList = []
    ec = 0 #error counter
    for airport in xml.findall('airport'):
        try:
            arpt = {
                    'name': airport.find('name').text,
                    'country': AbbrDict[airport.attrib['country']],
                    #'latitude': float(airport.find('latitude').text),
                    #'longitude': float(airport.find('longitude').text),
                    'elevation': int(airport.find('elevation').text),
                    #'GMToffset': float(airport.find('gmtOffset').text)
                }
        except TypeError:
            ec += 1
        except AttributeError:
            ec += 1
        AirportList.append(arpt)
        
    print len(AirportList), ec
    return max(AirportList, key=lambda a: a['elevation'])

highest_airport(document)

1315 26


{'country': 'Bolivia', 'elevation': 4063, 'name': 'El Alto Intl'}

In [55]:
def largest_lake(xml):
    LakeList = []
    ec = 0 #error counter
    for lake in xml.findall('lake'):
        countries = []
        for loc in lake.findall('located'): #lakes can be in multiple countries
            c = AbbrDict[loc.attrib['country']]
            countries.append(c)
        try:
            lk = {
                    'name': lake.find('name').text,
                    'country': countries,
                    'area': float(lake.find('area').text)
                    }
        except TypeError:
            ec += 1
        except AttributeError:
            ec += 1
        LakeList.append(lk)
            
    print len(LakeList), ec
    return max(LakeList, key=lambda x: x['area'])

largest_lake(document)

141 2


{'area': 386400.0,
 'country': ['Russia', 'Kazakhstan', 'Iran', 'Turkmenistan'],
 'name': 'Caspian Sea'}

In [57]:
def longest_river(xml):
    RiverList = []
    ec = 0 #error counter
    for river in xml.findall('river'):
        countries = []
        for loc in river.findall('located'): #lakes can be in multiple countries
            c = AbbrDict[loc.attrib['country']]
            countries.append(c)
        try:
            riv = {
                    'name': river.find('name').text,
                    'country': countries,
                    'length': float(river.find('length').text)
                    }
        except TypeError:
            ec += 1
        except AttributeError:
            ec += 1
        RiverList.append(riv)
            
    print len(RiverList), ec
    return max(RiverList, key=lambda x: x['length'])

longest_river(document)

238 5


{'country': ['Colombia', 'Brazil', 'Peru'],
 'length': 6448.0,
 'name': 'Amazonas'}

In [61]:
#since river and lake functions are basically identical, we should be able to combine them
def water_size(xml, water_type, water_metric):
    WaterList = []
    ec = 0 #error counter
    for water in xml.findall(water_type):
        countries = []
        for loc in water.findall('located'): #lakes can be in multiple countries
            c = AbbrDict[loc.attrib['country']]
            countries.append(c)
        try:
            H2O = {
                    'name': water.find('name').text,
                    'country': countries,
                    water_metric: float(water.find(water_metric).text)
                    }
        except TypeError:
            ec += 1
        except AttributeError:
            ec += 1
        WaterList.append(H2O)
            
    print len(WaterList), ec
    return max(WaterList, key=lambda x: x[water_metric])

water_size(document, water_type='river', water_metric='length')

238 5


{'country': ['Colombia', 'Brazil', 'Peru'],
 'length': 6448.0,
 'name': 'Amazonas'}

In [62]:
water_size(document, water_type='lake', water_metric='area')

141 2


{'area': 386400.0,
 'country': ['Russia', 'Kazakhstan', 'Iran', 'Turkmenistan'],
 'name': 'Caspian Sea'}