# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [50]:
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [51]:
document_tree = ET.parse('~\data\mondial_database_less.xml' )

In [52]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [53]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [54]:
document = ET.parse( 'E:\Data Science\Assignments\data_wrangling_xml\data\mondial_database.xml' )

# 1. 10 countries with the lowest infant mortality rates

In [55]:
# root is an instance of Element class
root = document.getroot()
root

<Element 'mondial' at 0x00000154B5DCB318>

In [56]:
# Create an empty dictionary lim
lim = {}
# Loop through each subelement in element country
for el in document.iter('country'):
    # Get the name of each country
    cntry = el.find('name')
    # Get the infant_mortality of each country
    inf = el.find('infant_mortality')
    # If 'country name' and 'infant_mortality' is not null, then fill the dictionary 'lim'
    if cntry is not None:
        if inf is not None:
            # Create a key value pair and fill the dictionary 
            lim[cntry.text] = float(inf.text)

In [57]:
# Create a DataFrame (infdf) from the dictionary (lim)
infdf = pd.DataFrame(list(lim.items()), columns=['Country', 'Infant_Mortality'])

In [58]:
infdf.head()

Unnamed: 0,Country,Infant_Mortality
0,Albania,13.19
1,Greece,4.78
2,Macedonia,7.9
3,Serbia,6.16
4,Andorra,3.69


In [59]:
# Sort the countries with lowest infant mortality rates and display the first 'ten' in the list
infdf = infdf.sort_values('Infant_Mortality').head(10)
infdf

Unnamed: 0,Country,Infant_Mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


# 2. 10 cities with the largest population

In [60]:
# Looking carefully at the XML Document tree, one can notice that some cities are subelements to the subelement province.
# To retrieve those cities and population I defined the XML path "./country/province/city"
#Create a dictionary 'provdict'
provdict = {}
for cty in document.findall("./country/province/city"):
    # Retreive all the names of the cities, and corresponding population figures
    name = cty.find('name')
    pop = cty.findall('population')
    if pop:
        # To retrieve the most recent year's population
        provdict[name.text] = int(pop[len(pop)-1].text)    

In [61]:
# Create DataFrame 'cpdf' from dictionary 'provdict'
cpdf = pd.DataFrame(list(provdict.items()), columns=['City', 'Population']).head()

In [62]:
#Create a dictionary 'dict'
dict = {}
for city in document.findall("./country/city"):
    # Retreive all the names of the cities, and thier respective population figures
    name = city.find('name')
    pop = city.findall('population')
    if pop:
        # To retrieve the most recent year's population
        dict[name.text] = int(pop[len(pop)-1].text)

In [63]:
# Create a dataframe cpdf1 from the dictionary dict
cpdf1 = pd.DataFrame(list(dict.items()), columns=['City', 'Population'])

In [64]:
cpdf.append(cpdf1).head()

Unnamed: 0,City,Population
0,Kavala,58790
1,Athina,664046
2,Peiraias,163688
3,Peristeri,139981
4,Acharnes,106943


In [65]:
# Sort the values and displauy the top ten cities with highest population
cpdf.sort_values('Population', ascending=False).head(10)

Unnamed: 0,City,Population
1,Athina,664046
2,Peiraias,163688
3,Peristeri,139981
4,Acharnes,106943
0,Kavala,58790


# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [66]:
# Define a dictionary c_p
c_p = {}
# Define a loop to iterate over countries
for e in document.iter('country'):
    # Get all the names of countries
    cntry = e.find('name')
    # Pick the most recent population figure
    pop = e.find('.//population[last()]')
    # If 'country name' and 'population' is not null, then fill the dictionary 'c_p'
    if cntry != None:
        if pop != None:
            # Create the dictionary with the key(country name) and value(population)
            c_p[cntry.text] = int(pop.text)

In [67]:
# Create a dataframe cpdf from the dictionay c_p
cpdf = pd.DataFrame(list(c_p.items()), columns=['Country', 'Population'])
cpdf.head()

Unnamed: 0,Country,Population
0,Albania,2800138
1,Greece,10816286
2,Macedonia,2059794
3,Serbia,7120666
4,Montenegro,620029


In [68]:
# Define a list co_etg
co_etg = []
# Loop over the countries
for el in document.iterfind('country'):
    # Get the names of each country
    co = el.find('name').text
    # Loop over the element ethnicgroup
    for et in el.iterfind('ethnicgroup'):
        # Get the name of each ethnic_group
        etg_name = et.text
        # Get the attribute 'percentage' value and make the dtype as float
        etg_prcnt = float(et.attrib['percentage'])
        # Create the list co_etg by appending the required data
        co_etg.append([co,etg_name,etg_prcnt])

In [69]:
# Create a data frame co_etgdf from the list co_etg with defined columns
co_etgdf = pd.DataFrame(co_etg, columns = ['Country', 'Ethnic_group', 'Ethnic_group_prcnt'])

In [70]:
co_etgdf.head()

Unnamed: 0,Country,Ethnic_group,Ethnic_group_prcnt
0,Albania,Albanian,95.0
1,Albania,Greek,3.0
2,Greece,Greek,93.0
3,Macedonia,Macedonian,64.2
4,Macedonia,Albanian,25.2


In [71]:
# Merge the two data frames 'co_etgdf' and 'cpdf' 
largest_ethnic_groups = pd.merge(co_etgdf, cpdf, on='Country')

In [72]:
largest_ethnic_groups.head()

Unnamed: 0,Country,Ethnic_group,Ethnic_group_prcnt,Population
0,Albania,Albanian,95.0,2800138
1,Albania,Greek,3.0,2800138
2,Greece,Greek,93.0,10816286
3,Macedonia,Macedonian,64.2,2059794
4,Macedonia,Albanian,25.2,2059794


In [73]:
# Calculate the population of each ethnic group of the overall population
largest_ethnic_groups['Ethnic_group_pop'] = (largest_ethnic_groups['Population']*largest_ethnic_groups['Ethnic_group_prcnt'])/100

In [74]:
# Define the data type of Ethnic Group population as integer
largest_ethnic_groups.Ethnic_group_pop = largest_ethnic_groups.Ethnic_group_pop.astype(int)

In [75]:
# Rearrange the columns of the data frame 
largest_ethnic_groups = largest_ethnic_groups[['Country', 'Ethnic_group', 'Population','Ethnic_group_prcnt', 'Ethnic_group_pop']]

In [76]:
# Sort the values of 'Ethnic_group_pop' and display the top ten 'Ethnic group population
largest_ethnic_groups.sort_values('Ethnic_group_pop', ascending=False).head(10)

Unnamed: 0,Country,Ethnic_group,Population,Ethnic_group_prcnt,Ethnic_group_pop
176,China,Han Chinese,1360720000,91.5,1245058800
221,India,Indo-Aryan,1210854977,72.0,871815583
220,India,Dravidian,1210854977,25.0,302713744
345,United States,European,318857056,79.96,254958101
520,Nigeria,African,164294516,99.0,162651570
212,Bangladesh,Bengali,149772364,98.0,146776916
299,Japan,Japanese,127298000,99.4,126534212
93,Russia,Russian,143666931,79.8,114646210
278,Indonesia,Javanese,252124458,45.0,113456006
461,Brazil,European,202768562,53.7,108886717


# 4. Name and Country of a) longest river, b) largest lake and c) airport at highest elevation

In [77]:
# Define a dictionary 'cntry'
cntry = {}
# Loop over all the countries
for el in document.iterfind('country'):
    # Find the name of each country
    co_name = el.find('name').text
    # Retrieve the code of each country which is present in the attribute "car_code"
    co_code = el.attrib['car_code']
    # Create the dictionary 'cntry'
    cntry[co_code] = co_name    

In [78]:
# Create a data frame 'cntry_df' from the dictionary 'cntry'
cntry_df = pd.DataFrame(list(cntry.items()), columns=['Country_Code', 'Country_Name'])

In [79]:
cntry_df.head()

Unnamed: 0,Country_Code,Country_Name
0,AL,Albania
1,GR,Greece
2,MK,Macedonia
3,SRB,Serbia
4,MNE,Montenegro


In [80]:
# Define a list 'r_list'
r_list = []
# Loop over each river and retrieve the 'river name', 'length'
for el in document.iterfind('river'):
    r_name = el.find('name').text
    ln = el.find('length')
    # If the length of the river is not null, get the text of the subelement 'length'
    if ln != None:
        r_ln = ln.text
        # Loop over all the countries and split the text if they are multiple countries
        for se in el.attrib['country'].split():
            r_cntry = cntry[se]
            r_list.append([se, r_cntry, r_name, ln.text])

In [81]:
# Create a dataframe 'rdf' from the list 'r_list'
rdf = pd.DataFrame(r_list, columns=['Country_Code', 'Country_Name', 'River_Name', 'River_Length'])

In [82]:
# Define the data type of 'River_Length' as float
rdf.River_Length = rdf.River_Length.astype(float)

In [83]:
# Country with the Largest River
rdf.sort_values('River_Length', ascending=False).head(1)

Unnamed: 0,Country_Code,Country_Name,River_Name,River_Length
300,PE,Peru,Amazonas,6448.0


In [84]:
# Define a list 
l_list = []
# Loop over all the lakes and find thier names and areas
for el in document.iterfind('lake'):
    l_name = el.find('name').text
    l_area = el.find('area')
    if l_area != None:
        l_ar = l_area.text
        # Loops over all the countries and splits in case of multiple countries
        for se in el.attrib['country'].split():
            l_co = cntry[se]
            l_list.append([se, l_co, l_name, l_ar])

In [85]:
# Create a data frame ldf from the list l_list
ldf = pd.DataFrame(l_list, columns=['Country_Code', 'Country_Name', 'Lake_Name', 'Lake_Area'])

In [86]:
# Change the data type of 'Lake_Area' to float
ldf.Lake_Area = ldf.Lake_Area.astype(float)

In [87]:
# Country with the largest lake
ldf.sort_values('Lake_Area', ascending=False).head(1)

Unnamed: 0,Country_Code,Country_Name,Lake_Name,Lake_Area
68,R,Russia,Caspian Sea,386400.0


In [88]:
# define a list
a_list = []
# Loop over all the airports and find thier names and elevation
for e in document.iterfind('airport'):
    a_name = e.find('name').text
    a_elev = e.find('elevation')
    if a_elev != None:
        air_elev = a_elev.text
        # Loops over all the countries and splits in case of multiple values
        for se in e.attrib['country'].split():
            a_co = cntry[se]
            a_list.append([se, a_co, a_name, air_elev])            

In [89]:
# Create a data frame a_df from the list a_list
a_df = pd.DataFrame(a_list, columns = ['Country_Code', 'Country_Name', 'Airport_Name', 'Airport_Elevation'])

In [90]:
# Covert the data type of 'Airport_Elevation' to float
a_df.Airport_Elevation = a_df.Airport_Elevation.astype(float)

In [91]:
# Display the country with highest elevation
a_df.sort_values('Airport_Elevation', ascending=False).head(1)

Unnamed: 0,Country_Code,Country_Name,Airport_Name,Airport_Elevation
80,BOL,Bolivia,El Alto Intl,4063.0
