In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# read data with elementTree
doc = ET.parse( './data/mondial_database.xml' )

In [7]:
## Q1 ## - Find 10 countries with the lowest infant moratilty rate

# utilizing the xml library ElementTree
# create empty list to append country and inf_moratility rate
country_rate = []
# iterate through countries
# exclude those that have no data for infant_mortality
for element in doc.iterfind('country'):    
    if element.find('infant_mortality') is not None:
        im_rate = element.find('infant_mortality').text
        cntry = element.find('name').text
        country_rate.append([cntry, im_rate])
# create pandas dataframe from list of lists
infMort = pd.DataFrame(country_rate, columns=['country', 'inf_mortality_rate'])
# sort ascending by infant_mortality rate and show lowest 10 with head method
print('10 Countries with the Lowest Infant Mortality Rate')
infMort.sort_values('inf_mortality_rate').head(10)

10 Countries with the Lowest Infant Mortality Rate


Unnamed: 0,country,inf_mortality_rate
36,Monaco,1.81
28,Romania,10.16
142,Fiji,10.2
63,Brunei,10.48
124,Grenada,10.5
221,Mauritius,10.59
116,Panama,10.7
227,Seychelles,10.77
94,United Arab Emirates,10.92
105,Barbados,10.93


In [26]:
## Q2 ## - Find 10 cities with highest Population

# create empty list to append country, city, & population
city_pop = []
# utilizing the xml library ElementTree
# iterate through countries then...
for element in doc.iterfind('country'): 
    cntry = element.find('name').text
    # ..iterate through each city
    for subel in element.getiterator('city'):
        city = subel.find('name').text
        # exclude those that have no data for city population
        if subel.find('population') is not None:
            pop = subel.find('population').text
            city_pop.append([cntry, city, pop])
# create pandas dataframe from list of lists
cityPop = pd.DataFrame(city_pop, columns=['country', 'city', 'population'])
# sort descending by city population and show largest 10 with head method
print('Top 10 Cities by Population')
cityPop.sort_values('population', ascending=False).head(10)

Top 10 Cities by Population


Unnamed: 0,country,city,population
176,Germany,Cottbus,99984
519,Russia,Perm,999157
173,Germany,Erlangen,99808
2752,Algeria,Laghouat,99536
2461,Brazil,Alagoinhas,99508
3035,Mozambique,Xai-Xai,99442
2539,Brazil,Camaragibe,99407
2857,Nigeria,Kaduna,993642
56,France,Nancy,99351
486,Russia,Kazan,992675


In [27]:
## Q3 ## - Find the 10 Ethnic Groups with the Highest Population

eth_pop = []
# create empty list to append ethnic groups,...
# ...their percentage size in the country, and...
# ...population from their country
# iterate over countries and find each ethnic group
# record ethnic group name and pop
for element in doc.iterfind('country'):
    for subel in element.iterfind('ethnicgroup'):
        pop = element.find('population').text
        ethnicity = subel.text
        ethPer = subel.attrib['percentage']
        eth_pop.append([ethnicity, float(ethPer), int(pop)])
        
# dataframe from lists
ethPop = pd.DataFrame(eth_pop, columns=['ethnicGroup', 'ethnicPer', 'countryPop'])
# compute estimate of ethnic Population for each country
ethPop['ethnicPop'] = ((ethPop.ethnicPer/100) * ethPop.countryPop).astype('int')
print('Top 10 World Ethnic Groups by Population across All included Countries')
ethPop.groupby('ethnicGroup').sum().sort_values('ethnicPop', ascending=False).head(10)
        

Top 10 World Ethnic Groups by Population across All included Countries


Unnamed: 0_level_0,ethnicPer,countryPop,ethnicPop
ethnicGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Han Chinese,91.5,543776080,497555113
European,970.82,362717873,192865792
Indo-Aryan,72.0,238396327,171645355
Russian,224.1,202263854,92758431
African,1868.55,357529690,86329356
Japanese,99.4,82199470,81706273
German,165.6,145710759,66232183
Dravidian,25.0,238396327,59599081
English,83.6,50616012,42314986
Mestizo,870.7,67185932,35542318


In [21]:
## Q4 ## - Function to return locale name, size, and countries data arrray
def localeSize(locale, metric):
    '''
    Function to return locale name, size, and countries data arrray.
    Accepts two string arguments for locale and size metric
    '''
    # create empty list placeholder
    locales = []
    # utilizing the xml library ElementTree
    # iterate through locales finding the size and countries...
    for element in doc.iterfind(locale): 
        if element.find(metric) is not None and element.find(metric).text is not None:
            locale = element.find('name').text
            size = element.find(metric).text
            countryId = element.attrib['country']
            # append name, size and countries list to output list
            locales.append([locale, size, countryId])
    return locales

In [22]:
## Q4 ## - Find the longest River and its countries
# call function with appropriate locale and metric
rivers = localeSize('river', 'length')
# convert size values to float
lengths = [float(r[1]) for r in rivers]
# find index of maximum River size
midx = lengths.index(max(lengths))
# use max index to access the name and countries in the rivers lists
print('{0} River running through countries {2} is the longest river: {1} km (in these data)'.format(rivers[midx][0], rivers[midx][1], rivers[midx][2]))

Amazonas River running through countries CO BR PE is the longest river: 6448 km (in these data)


In [28]:
## Q4 ## - Find the largest lake and its country
# call function with appropriate locale and metric
lakes = localeSize('lake', 'area')
# convert size values to float
areas = [float(a[1]) for a in lakes]
# find index of maximum Lake size
midx = areas.index(max(areas))
# use max index to access the name and countries in the lakes lists
print('{0} in countries {2} is the largest lake by area: {1} km^2 (in these data)'.format(lakes[midx][0], lakes[midx][1], lakes[midx][2]))

Caspian Sea in countries R AZ KAZ IR TM is the largest lake by area: 386400 km^2 (in these data)


In [25]:
## Q4 ## - Find the highest elevated Airport and its country
# call function with appropriate locale and metric
airports = localeSize('airport', 'elevation')
# convert size values to float
elevations = [float(a[1]) for a in airports]
# find index of maximum airport elevation
midx = elevations.index(max(elevations))
# use max index to access the name and countries in the airports lists
print('{0} Airport in countries {2} is the highest airport by elevation: {1} m (in these data)'.format(airports[midx][0], airports[midx][1], airports[midx][2]))

El Alto Intl Airport in countries BOL is the highest airport by elevation: 4063 m (in these data)
