# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [156]:
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [157]:
document_tree = ET.parse('E:\Data Science\Assignments\data_wrangling_xml\data\mondial_database_less.xml' )

In [158]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [159]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [180]:
document = ET.parse( 'E:\Data Science\Assignments\data_wrangling_xml\data\mondial_database.xml' )

# 1. 10 countries with the lowest infant mortality rates

In [181]:
# root is an instance of Element class
root = document.getroot()
root

<Element 'mondial' at 0x000001EDDEE03638>

In [182]:
# Create an empty dictionary lim
lim = {}
# Loop through each subelement in element country
for el in document.iter('country'):
    # Get the name of each country
    cntry = el.find('name')
    # Get the infant_mortality of each country
    inf = el.find('infant_mortality')
    if cntry is not None:
        if inf is not None:
            # Create a key value pair and fill the dictionary 
            lim[cntry.text] = float(inf.text)
        #else:
            #lim[cntry.text] = np.nan

In [183]:
lim

{'Afghanistan': 117.23,
 'Albania': 13.19,
 'Algeria': 21.76,
 'American Samoa': 8.92,
 'Andorra': 3.69,
 'Angola': 79.99,
 'Anguilla': 3.4,
 'Antigua and Barbuda': 13.29,
 'Argentina': 9.96,
 'Armenia': 13.97,
 'Aruba': 11.74,
 'Australia': 4.43,
 'Austria': 4.16,
 'Azerbaijan': 26.67,
 'Bahamas': 12.5,
 'Bahrain': 9.68,
 'Bangladesh': 45.67,
 'Barbados': 10.93,
 'Belarus': 3.64,
 'Belgium': 4.18,
 'Belize': 20.31,
 'Benin': 57.09,
 'Bermuda': 2.48,
 'Bhutan': 37.89,
 'Bolivia': 38.61,
 'Bosnia and Herzegovina': 5.84,
 'Botswana': 9.38,
 'Brazil': 19.21,
 'British Virgin Islands': 13.45,
 'Brunei': 10.48,
 'Bulgaria': 15.08,
 'Burkina Faso': 76.8,
 'Burundi': 63.44,
 'Cambodia': 51.36,
 'Cameroon': 55.1,
 'Canada': 4.71,
 'Cape Verde': 24.28,
 'Cayman Islands': 6.21,
 'Central African Republic': 92.86,
 'Chad': 90.3,
 'Chile': 7.02,
 'China': 14.79,
 'Colombia': 15.02,
 'Comoros': 65.31,
 'Congo': 59.34,
 'Cook Islands': 14.33,
 'Costa Rica': 8.7,
 'Cote dIvoire': 60.16,
 'Croatia': 5

In [184]:
# Create a DataFrame (infdf) from the dictionary (lim)
infdf = pd.DataFrame(list(lim.items()), columns=['Country', 'Infant_Mortality'])

In [185]:
infdf

Unnamed: 0,Country,Infant_Mortality
0,Albania,13.19
1,Greece,4.78
2,Macedonia,7.90
3,Serbia,6.16
4,Andorra,3.69
5,France,3.31
6,Spain,3.33
7,Austria,4.16
8,Czech Republic,2.63
9,Germany,3.46


In [186]:
# Sort the countries with lowest infant mortality rates and display the first 'ten' in the list
infdf = infdf.sort_values('Infant_Mortality').head(10)
infdf

Unnamed: 0,Country,Infant_Mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


# 2. 10 cities with the largest population

In [188]:
# Looking carefully at the XML Document tree, one can notice that some cities are subelements to the subelement province.
# To retrieve those cities and population I defined the path "./country/province/city"
#Create a dictionary 'provdict'
provdict = {}
for cty in document.findall("./country/province/city"):
    # Retreive all the names of the cities, and corresponding population figures
    name = cty.find('name')
    pop = cty.findall('population')
    if pop:
        # To retrieve the most recent year's population
        provdict[name.text] = int(pop[len(pop)-1].text)    

In [189]:
provdict

{'Kavala': 58790,
 'Athina': 664046,
 'Peiraias': 163688,
 'Peristeri': 139981,
 'Acharnes': 106943,
 'Patra': 213984,
 'Kerkyra': 102071,
 'Ioannina': 112486,
 'Thessaloniki': 325182,
 'Iraklio': 173993,
 'Chania': 108642,
 'Rhodes': 115490,
 'Lamia': 75315,
 'Chalkida': 102223,
 'Larissa': 162591,
 'Volos': 144449,
 'Karyes': 233,
 'Strasbourg': 272222,
 'Mulhouse': 110351,
 'Bordeaux': 239399,
 'Clermont-Ferrand': 140957,
 'Caen': 108793,
 'Rennes': 208033,
 'Brest': 309764,
 'Dijon': 151672,
 'Orléans': 114185,
 'Tours': 134633,
 'Reims': 180752,
 'Ajaccio': 66245,
 'Besançon': 115879,
 'Rouen': 111553,
 'Le Havre': 174156,
 'Paris': 2249975,
 'Boulogne-Billancourt': 116220,
 'Argenteuil': 104282,
 'Montreuil': 103068,
 'Saint-Paul': 103916,
 'Montpellier': 264538,
 'Nîmes': 144940,
 'Perpignan': 118238,
 'Limoges': 137758,
 'Metz': 119962,
 'Nancy': 105382,
 'Toulouse': 447340,
 'Lille': 227533,
 'Nantes': 287845,
 'Angers': 148803,
 'Le Mans': 143240,
 'Amiens': 133327,
 'Poitier

In [190]:
# Create DataFrame 'cpdf' from dictionary 'provdict'
cpdf = pd.DataFrame(list(provdict.items()), columns=['City', 'Population'])

In [191]:
#Create a dictionary 'dict'
dict = {}
for city in document.findall("./country/city"):
    # Retreive all the names of the cities, and thier respective population figures
    name = city.find('name')
    pop = city.findall('population')
    if pop:
        # To retrieve the most recent year's population
        dict[name.text] = int(pop[len(pop)-1].text)

In [192]:
dict

{'Abomey-Calavi': 307745,
 'Abu Dhabi': 552000,
 'Aden': 570551,
 'Akureyri': 17490,
 'Al Ain': 348000,
 'Al Fashir': 141884,
 'Al Fayyum': 316772,
 'Al Gazira': 211362,
 'Al Gedaref': 191164,
 'Al Iskandariyah': 4123869,
 "Al Isma'iliyah": 300449,
 'Al Jizah': 2681863,
 'Al Mahallah al Kubra': 442884,
 'Al Mansurah': 437311,
 'Al Manāmah': 143035,
 'Al Minya': 235234,
 'Al Qahirah': 8471859,
 'Al Rayyan': 272860,
 'Al Sharjah': 519000,
 'Al Ubayyiḑ': 229425,
 'Al Uqsur': 451318,
 'Algiers': 2364230,
 'Amman': 1812941,
 'Andorra la Vella': 22256,
 'Anju': 240117,
 'Annaba': 342703,
 'Ansan': 722598,
 'Antananarivo': 710236,
 'Antsirabe': 126062,
 'Antsiranana': 59040,
 'Anyang': 603184,
 'Apia': 36735,
 'Apopa': 143718,
 'As Seeb': 302992,
 'As Suways': 510935,
 'Asmara': 380568,
 'Asunción': 518792,
 'Aswan': 265004,
 'Asyut': 386086,
 'Auckland': 419418,
 'Az Zaqaziq': 302611,
 'Baku': 2150800,
 'Bandar Seri Begawan': 279924,
 'Bangkok': 7506700,
 'Bangui': 622771,
 'Banha': 158389,


In [193]:
cpdf1 = pd.DataFrame(list(dict.items()), columns=['City', 'Population'])

In [194]:
cpdf.append(cpdf1)

Unnamed: 0,City,Population
0,Kavala,58790
1,Athina,664046
2,Peiraias,163688
3,Peristeri,139981
4,Acharnes,106943
5,Patra,213984
6,Kerkyra,102071
7,Ioannina,112486
8,Thessaloniki,325182
9,Iraklio,173993


In [195]:
cpdf.sort_values('Population', ascending=False).head(10)

Unnamed: 0,City,Population
1189,Shanghai,22315474
673,Istanbul,13710512
1350,Mumbai,12442373
421,Moskva,11979529
1188,Beijing,11716620
2269,São Paulo,11152344
1190,Tianjin,11090314
915,Guangzhou,11071424
1396,Delhi,11034555
918,Shenzhen,10358381


# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [244]:
c_p = {}
for e in document.iter('country'):
    cntry = e.find('name')
    pop = e.find('.//population[last()]')
    if cntry != None:
        if pop != None:
            c_p[cntry.text] = int(pop.text)

In [245]:
c_p

{'Afghanistan': 26023100,
 'Albania': 2800138,
 'Algeria': 37062820,
 'American Samoa': 55519,
 'Andorra': 78115,
 'Angola': 24383301,
 'Anguilla': 13037,
 'Antigua and Barbuda': 81799,
 'Argentina': 42669500,
 'Armenia': 3026879,
 'Aruba': 101484,
 'Australia': 23135281,
 'Austria': 8499759,
 'Azerbaijan': 9356500,
 'Bahamas': 353658,
 'Bahrain': 1234596,
 'Bangladesh': 149772364,
 'Barbados': 277821,
 'Belarus': 9460692,
 'Belgium': 11099554,
 'Belize': 312971,
 'Benin': 9983884,
 'Bermuda': 64237,
 'Bhutan': 733004,
 'Bolivia': 10027262,
 'Bosnia and Herzegovina': 3791622,
 'Botswana': 2038228,
 'Brazil': 202768562,
 'British Virgin Islands': 23161,
 'Brunei': 393372,
 'Bulgaria': 7284552,
 'Burkina Faso': 17322796,
 'Burundi': 8444784,
 'Cambodia': 14364931,
 'Cameroon': 19406100,
 'Canada': 35158304,
 'Cape Verde': 491875,
 'Cayman Islands': 55691,
 'Central African Republic': 4349921,
 'Ceuta': 82376,
 'Chad': 11720781,
 'Chile': 16341929,
 'China': 1360720000,
 'Christmas Island

In [240]:
co_etg = []
for el in document.iterfind('country'):
    co = el.find('name').text
    for et in el.iterfind('ethnicgroup'):
        etg_name = et.text
        etg_prcnt = et.attrib['percentage']

In [234]:
co_etg.append([co,etg_name,etg_prcnt])

In [235]:
co_etg

[['Seychelles', 'Seychellois', '100']]