In [4]:
from xml.etree import ElementTree as ET

In [5]:
document_tree = ET.parse( './data/mondial_database.xml' )

# 1. 10 countries with the lowest infant mortality rates

In [6]:
# 1. 10 countries with the lowest infant mortality rates
# print names of all countries and their cities

# importing itemgetter library to use in sorting
from operator import itemgetter

# retrieving all the Counties from xml document tree
countries = document_tree.findall(".//country")


countires_infantMortality_list = [(t.find('name').text, float(t.find('infant_mortality').text)) for t in countries if t.find('infant_mortality') is not None]

# sorting list for lowest to highest infant mortality rate
countires_infantMortality_list.sort(key=itemgetter(1))

# used slicing technique to get top 10 countries
countires_infantMortality_list[:10]
    

[('Monaco', 1.81),
 ('Japan', 2.13),
 ('Norway', 2.48),
 ('Bermuda', 2.48),
 ('Singapore', 2.53),
 ('Sweden', 2.6),
 ('Czech Republic', 2.63),
 ('Hong Kong', 2.73),
 ('Macao', 3.13),
 ('Iceland', 3.15)]

# 2. 10 cities with the largest population

In [7]:

# retrieving all the cities from xm document tree
cities = document_tree.findall(".//country/province/city")

# creating list with cities and their population
cities_list = list()
for t in cities:
    for i in t.findall('population'):
        if i.get('year') == '2011':
            #print("City Name: {0}, Population: {1}".format(t.find('name').text, i.text))
            cities_list.append((t.find('name').text, float(i.text)))

# sorting list for largest to smallest population         
cities_list.sort(key=itemgetter(1), reverse=True)

# used slicing technique to get top 10 cities
cities_list[:10]


[('Mumbai', 12442373.0),
 ('Delhi', 11034555.0),
 ('Bangalore', 8443675.0),
 ('London', 8250205.0),
 ('Tehran', 8154051.0),
 ('Dhaka', 7423137.0),
 ('Hyderabad', 6731790.0),
 ('Ahmadabad', 5577940.0),
 ('Luanda', 5000000.0),
 ('Chennai', 4646732.0)]

# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [8]:
# functions used in following calculations

def findUnique(egroup_list):
    # finding unique ethnic groups from list of tuples
    return set([x for (x,y) in egroup_list])


def CalculateSum(egroup_list, egroup_unique):
    ''' enumeration through unique ethnic groups to calculate sum of population 
    percentage in all countries for each ethnic group'''
    egroup_population_sum = list()
    for eg in egroup_unique:
        population_list = [float(y) for (x,y) in egroup_list if x == eg]
        egroup_population_sum.append((eg, sum(population_list)))
    
    return egroup_population_sum

In [9]:
''' Implemented this piece thinking that in question ask is to get ethnic 
    groups with highest population by percentage
    '''

# getting all ethnic groups
ethnic_groups = document_tree.findall(".//country/ethnicgroup")

# converting all the elements into list of tuple(city, population percentage)
egroup_list = [(egroup.text, egroup.get('percentage')) for egroup in ethnic_groups]

# find unique ethnic groups using findUnique def
egroup_unique = findUnique(egroup_list)

# calculating sum using CalculateSum def
egroup_population_sum = CalculateSum(egroup_list, egroup_unique)

# sorting ethnic group total in order of highest to lowest population percentage 
egroup_population_sum.sort(key=itemgetter(1), reverse=True)

# slicing to get to 10 ethnic groups
egroup_population_sum[:10]
    

[('African', 1868.55),
 ('European', 970.82),
 ('Mestizo', 870.7),
 ('Polynesian', 666.5),
 ('Arab', 638.3),
 ('Chinese', 472.90000000000003),
 ('Arab-Berber', 296.1),
 ('Black', 255.5),
 ('Malay', 242.3),
 ('Melanesian', 231.1)]

In [10]:
''' Implemented same question thinking that in question ask is to get ethnic 
    groups with highest population 
    example: Kosovo country has 92 % of Albanians that means 
        92 % of 1733872 (population from 2011 census) = 1595162 
        
    By calculating the population by percentage for each ethnic group 
    in each country and getting the top 10 highest populations by ethnic groups
    '''
ethgp_list = list()
for country in countries:
    country_population = 0
    for population in country.findall('population'):
        if population.get('year') == '2011':
            country_population = float(population.text)
            break

    for ethgrp in country.findall('ethnicgroup'):
        ethgp_list.append((ethgrp.text, float(ethgrp.get('percentage')) * country_population / 100))

# find unique ethnic groups using findUnique def
ethgroup_unique = findUnique(ethgp_list)

# calculating sum using CalculateSum def
ethgroup_population_sum = CalculateSum(ethgp_list, ethgroup_unique)

# sorting ethnic group total in order of highest to lowest population 
ethgroup_population_sum.sort(key=itemgetter(1), reverse=True)

# used slicing technique to get to 10 ethnic groups
ethgroup_population_sum[:10]
            

[('Indo-Aryan', 871815583.44),
 ('Dravidian', 302713744.25),
 ('African', 166391983.871),
 ('Bengali', 146776916.72),
 ('German', 74278485.28199998),
 ('English', 52820300.80799999),
 ('Mediterranean Nordic', 46815916.0),
 ('Persian', 38326331.19),
 ('Polish', 38018419.995),
 ('Mongol', 36325649.31)]

# 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

## Calculating Longest River

In [11]:
# retreiving rivers from xml document tree
rivers = document_tree.findall(".//river")

rivers_list = list()
for r in rivers:
    #print("river Name: {0}".format(r.find('name').text)) 
    if r.find('length') is None:
        continue
    #print("Length: {0}".format(r.find('length').text))
    rivers_list.append((r.find('name').text, float(r.find('length').text), r.get('country')))

rivers_list.sort(key=itemgetter(1), reverse=True)

rivers_list[:1]

[('Amazonas', 6448.0, 'CO BR PE')]

In [12]:
# creating country lookup with country name and country code

countries_lookup = {c.get('car_code'): c.find('name').text for c in countries}

# displaying first 2 key: value pairs of countries
{k: countries_lookup[k] for k in sorted(countries_lookup.keys())[:2]}

{'A': 'Austria', 'AFG': 'Afghanistan'}

In [13]:
# Longest river
longest_river = rivers_list[:1]
country_name_for_longest_river = list()
for cn in longest_river[0][2].split():
    country_name_for_longest_river.append(countries_lookup.get(cn))

print("Longest River Name: {0} \nCountries: {1}"
      .format(longest_river[0][0], country_name_for_longest_river))

Longest River Name: Amazonas 
Countries: ['Colombia', 'Brazil', 'Peru']


## Calculating Largest Lake

In [14]:
# retreiving lakes from xml document tree
lakes = document_tree.findall(".//lake")

lakes_list = list()
for l in lakes:
    #print("lake Name: {0}".format(r.find('name').text)) 
    if l.find('area') is None:
        continue
    #print("Area: {0}".format(r.find('area').text))
    lakes_list.append((l.find('name').text,
                       float(l.find('area').text), l.get('country')))

lakes_list.sort(key=itemgetter(1), reverse=True)

lakes_list[:1]

[('Caspian Sea', 386400.0, 'R AZ KAZ IR TM')]

In [15]:
# Largest lake
largest_lake = lakes_list[:1]
country_name_for_largest_lake = list()
for cn in largest_lake[0][2].split():
    country_name_for_largest_lake.append(countries_lookup.get(cn))

print("Largest Lake Name: {0} \nCountries: {1}"
      .format(largest_lake[0][0], country_name_for_largest_lake))

Largest Lake Name: Caspian Sea 
Countries: ['Russia', 'Azerbaijan', 'Kazakhstan', 'Iran', 'Turkmenistan']


## Calculating Highest Airport

In [16]:
# retreiving airports from xml document tree
airports = document_tree.findall(".//airport")

# creating airports list with names, country codes and elevation of each airport
airports_list = list()
for a in airports:
    #print("airport Name: {0}".format(a.find('name').text))
    # checking if the elevation tag exisits or not
    if a.find('elevation') is None:
        continue
    # checking if elevation element has text or not
    elif a.find('elevation').text is None:
        continue

    #print("Elevation: {0}".format(a.find('elevation').text))
    #print(a.find('name').text)
    airports_list.append((a.find('name').text, 
                          float(a.find('elevation').text), 
                          a.get('country')))

# sorting airports with highest to lowest elevation
airports_list.sort(key=itemgetter(1), reverse=True)

# used slicing technique to get airport with highest elevation
airports_list[:1]

[('El Alto Intl', 4063.0, 'BOL')]

In [17]:
# Highest elevation airport
highest_airport = airports_list[:1]
country_name_for_highest_airport = list()
for cn in highest_airport[0][2].split():
    country_name_for_highest_airport.append(countries_lookup.get(cn))

print("Highest Airport Name: {0} \nCountries: {1}"
      .format(highest_airport[0][0], country_name_for_highest_airport))

Highest Airport Name: El Alto Intl 
Countries: ['Bolivia']
