# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
document_tree

<xml.etree.ElementTree.ElementTree at 0x104a3b470>

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )
document

<xml.etree.ElementTree.ElementTree at 0x104a3e278>

In [7]:
root = document.getroot()
root

<Element 'mondial' at 0x1049279a8>

In [17]:
#Question 1
#create a dictionary with the infant mortality rates of all countries by looping through all countries where infant
#mortality is listed and then use heapq to list the lowest 10

mort_dict = {}
for element in root.iterfind('country'):
    if element.find('infant_mortality') is None:
        pass
    else:
        mort_dict[element.find('name').text] = float(element.find('infant_mortality').text)
        
from heapq import nsmallest
from operator import itemgetter

for country, mortality in nsmallest(10, mort_dict.items(), key=itemgetter(1)):
    print (country, mortality)
    


Monaco 1.81
Japan 2.13
Bermuda 2.48
Norway 2.48
Singapore 2.53
Sweden 2.6
Czech Republic 2.63
Hong Kong 2.73
Macao 3.13
Iceland 3.15


In [25]:
#Question 2
#create a dictionary of populations for all cities and then use heapq to list the top ten

city_dict = {}
# print names of all countries and their cities
for element in root.iterfind('country'):
    #print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        if subelement.find('population') is None:
            pass
        else:
            city_dict[subelement.find('name').text] = int(subelement.find('population').text)

from heapq import nlargest
from operator import itemgetter

for city, pop in nlargest(10, city_dict.items(), key=itemgetter(1)):
    print (city, pop)

Seoul 10229262
Mumbai 9925891
São Paulo 9412894
Jakarta 8259266
Shanghai 8205598
Ciudad de México 8092449
Moskva 8010954
Tokyo 7843000
Beijing 7362426
Delhi 7206704


In [92]:
#Question3 begin by creating a dictionary of the populations of all countries according to the latest population

#the latest population was found by creating a list of all population figure years for each country and selecting 
#the maximum for each

#the maximum year was then used to select a single population value for each country
country_pop_dict = {}
popul = []
for element in root.iterfind('country'):
    max_list=[]
    country = element.find('name').text
    for x in element.iter("population"):
        max_list.append(int(x.attrib['year']))
    pop_year = max(max_list)
    for x in element.iter("population"):
        if int(x.attrib['year']) == pop_year:
            population = x.text
    country_pop_dict[country] = int(population)
country_pop_dict


{'Afghanistan': 26023100,
 'Albania': 51152,
 'Algeria': 37062820,
 'American Samoa': 55519,
 'Andorra': 22256,
 'Angola': 965288,
 'Anguilla': 13037,
 'Antigua and Barbuda': 22219,
 'Argentina': 3049229,
 'Armenia': 1066264,
 'Aruba': 101484,
 'Australia': 381488,
 'Austria': 52100,
 'Azerbaijan': 290500,
 'Bahamas': 248948,
 'Bahrain': 1234596,
 'Bangladesh': 479837,
 'Barbados': 88529,
 'Belarus': 1905475,
 'Belgium': 97692,
 'Belize': 53532,
 'Benin': 9983884,
 'Bermuda': 64237,
 'Bhutan': 733004,
 'Bolivia': 205346,
 'Bosnia and Herzegovina': 93028,
 'Botswana': 100079,
 'Brazil': 2852372,
 'British Virgin Islands': 23161,
 'Brunei': 279924,
 'Bulgaria': 7284552,
 'Burkina Faso': 17322796,
 'Burundi': 8444784,
 'Cambodia': 14364931,
 'Cameroon': 1384286,
 'Canada': 35591,
 'Cape Verde': 131719,
 'Cayman Islands': 55691,
 'Central African Republic': 4349921,
 'Ceuta': 82376,
 'Chad': 11720781,
 'Chile': 212813,
 'China': 29700000,
 'Christmas Island': 2072,
 'Cocos Islands': 596,
 

In [95]:
# A new data structure then creates a dictionary of lists of ethnicities for each country, 
# within the values is a list of dictionaries where the keys are ethnicities and the values 
# are the percentages of ethnicities

ethnic_dict={}
for element in root.iterfind('country'):
    country = element.find('name').text
    l=[]
    if element.find('ethnicgroup') is None:
            pass
    else:
        for x in element.iter("ethnicgroup"):
            e={}
            e[x.text]=float(x.attrib['percentage'])
            l.append(e)
    ethnic_dict[country]=l
ethnic_dict
        
        #popul.append(element.find('population'))
        #for subelement in element.getiterator('e'):

{'Afghanistan': [{'Tajik': 25.0},
  {'Pashtun': 38.0},
  {'Uzbek': 6.0},
  {'Hazara': 19.0}],
 'Albania': [{'Albanian': 95.0}, {'Greek': 3.0}],
 'Algeria': [{'European': 1.0}, {'Arab-Berber': 99.0}],
 'American Samoa': [{'Caucasian': 2.0}, {'Tongan': 4.0}, {'Samoan': 89.0}],
 'Andorra': [{'Spanish': 43.0},
  {'Andorran': 33.0},
  {'Portuguese': 11.0},
  {'French': 2.0},
  {'African': 5.0}],
 'Angola': [{'European': 1.0},
  {'Ovimbundu': 37.0},
  {'Kimbundu': 25.0},
  {'Bakongo': 13.0}],
 'Anguilla': [{'Black': 90.1}, {'Mulatto': 4.6}, {'White': 3.7}],
 'Antigua and Barbuda': [],
 'Argentina': [{'European': 97.0}],
 'Armenia': [{'Armenian': 97.7}, {'Russian': 0.5}, {'Yezidi': 1.3}],
 'Aruba': [{'European/Caribbean Amerindian': 80.0}],
 'Australia': [{'European': 92.0}, {'Asian': 7.0}],
 'Austria': [{'Austrian': 91.1},
  {'Turkish': 1.6},
  {'Slovene': 1.0},
  {'Croat': 2.0},
  {'Serbs': 2.0},
  {'German': 0.9}],
 'Azerbaijan': [{'Azeri': 90.6},
  {'Dagestani': 2.2},
  {'Russian': 1.8},


In [109]:
# for each ethnicity in each country the percentage ethnicity is multiplied by the the population of that country
# unheap then lists the top ten ethnicities from the resulting dictionary of all with their associated total populations

ethnic_pop_dict={}
for country, list in ethnic_dict.items():
    for ethnicity in list:
        for ethnicity_name, percentage in ethnicity.items():
            if ethnicity_name in ethnic_pop_dict:
                ethnic_pop_dict[ethnicity_name] += int(percentage*country_pop_dict[country]/100)
            else:
                ethnic_pop_dict[ethnicity_name] = int(percentage*country_pop_dict[country]/100)

from heapq import nlargest
from operator import itemgetter

for ethnicity, pop in nlargest(10, ethnic_pop_dict.items(), key=itemgetter(1)):
    print (ethnicity, pop)
    

Malay 89414169
Eastern Hamitic 82830376
Viet/Kinh 76078375
Thai 51084156
Arab-Berber 50583950
Arab 42402733
African 40986968
Mangbetu-Azande 27986022
Han Chinese 27175500
Chinese 22357554


In [183]:
#Question 4 first create a dictionary of country codes with their associated country name
country_code_dict={}
for element in root.iterfind('country'):
    code = element.attrib['car_code']
    name = element.find('name').text
    country_code_dict[code]=name

In [255]:
# create a finder function which creates two dictionaries, one of the locations of the item type
# and another that lists the location of each item

def find_stuff(type, metric, location_type):
    metric_dict={}
    location_dict={}
    for element in root.iterfind(type):
        if element.find(metric) is None:
            pass
        elif element.find(location_type) is None:
            pass
        else:
            location = element.find(location_type).attrib['country']
            name = element.find('name').text
            size = float(element.find(metric).text)
            metric_dict[name]=size
            location_dict[name]=location
            
    #pick the biggest thing

    max_name = max(metric_dict, key=metric_dict.get)
    
    #display the biggest thing, its size and its location
        
    max_dict = {
    'name': max_name,
     metric : metric_dict[max_name],
     location_type: country_code_dict[location_dict[max_name]]
    }
    return max_dict




    




In [256]:
#4a
longest_river = str(find_stuff('river', 'length', 'source'))
print('longest river: ' + longest_river)
        
    
    

longest river: {'name': 'Amazonas', 'source': 'Peru', 'length': 6448.0}


In [257]:
#4b
biggest_lake = str(find_stuff('lake', 'area', 'located'))
print('biggest lake: ' + biggest_lake)

biggest lake: {'name': 'Caspian Sea', 'located': 'Russia', 'area': 386400.0}


In [269]:
#4c

#had trouble getting the finder function to work for airports so I rewrote the code a little to make it work

elevation_dict={}
location_dict={}
for element in root.iterfind('airport'):
    if element.find('elevation').text is None:
        pass
    else:
        location = element.attrib['country']
        name = element.find('name').text
        size = float(element.find('elevation').text)
        elevation_dict[name]=size
        location_dict[name]=location

location_dict
elevation_dict

max_name = max(elevation_dict, key=elevation_dict.get)

max_dict = {
'name': max_name,
'elevation' : elevation_dict[max_name],
'location': country_code_dict[location_dict[max_name]]
}

highest_airport = str(max_dict)
    
print('the highest airport is: ' + highest_airport)

the highest airport is: {'location': 'Bolivia', 'name': 'El Alto Intl', 'elevation': 4063.0}
