# XML / Mondial Exercise
## 1) Find the 10 countries with the lowest infant mortality rates

In [1]:
import pandas as pd
import numpy as np
from xml.etree import ElementTree as ET

In [2]:
document = ET.parse( './data/mondial_database.xml' )
document = document.getroot()

In [3]:
#iterate through each country to create dictionary for each country name (key) and corresponding infant mortality rate (value)
dict = {}

for element in document.iter('country'):
    x = element.find('name').text
    y = element.findtext('infant_mortality')
    dict[x] = y

In [4]:
#read dictionary into Pandas dataframe for manipulation
df = pd.DataFrame.from_dict(dict, orient = 'index')

In [5]:
#format and sort
df = df.astype(float)
df = df.dropna()
df = df.sort_values(0)

In [6]:
#list top 10
df.head(10)

Unnamed: 0,0
Monaco,1.81
Japan,2.13
Norway,2.48
Bermuda,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


## 2) Find the 10 cities with the largest population

In [7]:
#want to use lxml methods for this part
from lxml import etree

In [8]:
document = etree.parse( './data/mondial_database.xml' )
document = document.getroot()

In [9]:
dict = {}
#use Xpath to ensure that population is the most recent census
pops = document.findall('./country/city/population[last()]')

for pop in pops:
    b = float(pop.text)
    #use lxml iterancestors to work backwards and only include cities with population counts
    for city in pop.iterancestors('city'):
        a = (city.find('name').text)
    dict[a] = b

In [10]:
#as before, read dictionary into Pandas dataframe
df = pd.DataFrame.from_dict(dict, orient = 'index')

In [11]:
#format and sort
df = df.astype(float)
df = df.dropna()
df = df.sort_values(0, ascending = False)

In [12]:
#get the top 10
df.head(10)

Unnamed: 0,0
Seoul,9708483.0
Al Qahirah,8471859.0
Bangkok,7506700.0
Hong Kong,7055071.0
Ho Chi Minh,5968384.0
Singapore,5076700.0
Al Iskandariyah,4123869.0
New Taipei,3939305.0
Busan,3403135.0
Pyongyang,3255288.0


## 3) Find the 10 ethnic groups with largest overall populations

In [13]:
#create lists for name of each ethnic group, the percentage each group is of each countries' population, and the size of that population
a = []
b = []
c = []

for country in document.findall('country'):
    pops = country.findall('./population[last()]')
    for pop in pops:
        groups = country.findall('ethnicgroup')
        for group in groups:
            a.append(float(pop.text)), 
            b.append(group.text), 
            c.append(float(group.get('percentage')))

In [14]:
#read into Pandas dataframe for manipulation
df = pd.DataFrame({'pop': a, 'group': b, 'percent': c})

In [15]:
#create new column to give size of ethnic group population in each country
df['number'] = (df['percent'] / 100) * df['pop']

In [16]:
#remove now unecessary columns
del df['percent']
del df['pop']

In [17]:
#group the data by ethnic groups
df = df.groupby(df['group']).sum()

In [18]:
#get the top 10
df.sort_values('number', ascending = False).head(10)

Unnamed: 0_level_0,number
group,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0
