# This notebook explores and cleans WHO and UNICEF infant mortality data.  The goal is to get a feel for the data and to see how the two sources compare.  We may well find that the data are essentially the same.

In [1]:
import pandas as pd

## Load the data

In [5]:
# Read both raw CSV exports in one pass: the WHO file first, then the UNICEF file.
who, unicef = (
    pd.read_csv(csv_path)
    for csv_path in (
        "WHO_data/WHO 2017-2019.csv",
        "UNICEF_data/UNICEF-IMR-2018.csv",
    )
)

## Step 1: Examine and clean the WHO data

In [13]:
# Peek at the first five rows of the raw WHO frame.
who.head(n=5)

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,UNREGION,WORLDBANKINCOMEGROUP,COUNTRY,SEX,Display Value,Numeric,Low,High,Comments
0,MDG_0000000001,PUBLISHED,2018,EMR,,,AFG,BTSX,48.0,48.04333,39.9443,56.32006,
1,MDG_0000000001,PUBLISHED,2018,EMR,,,AFG,FMLE,44.6,44.55944,36.87615,52.43261,
2,MDG_0000000001,PUBLISHED,2018,EMR,,,AFG,MLE,51.3,51.3471,42.55961,60.39946,
3,MDG_0000000001,PUBLISHED,2018,AFR,,,AGO,BTSX,51.9,51.87144,28.2057,87.37608,
4,MDG_0000000001,PUBLISHED,2018,AFR,,,AGO,FMLE,46.3,46.32707,25.06762,78.38194,


In [32]:
# Columns kept from the raw WHO frame; every other column is dropped
# when this list is used to subset it.
who_cols = (
    ['GHO', 'YEAR', 'REGION', 'COUNTRY', 'SEX']      # what/when/where/who each row describes
    + ['Display Value', 'Numeric', 'Low', 'High']    # the reported value columns
)

### Comparing the numbers in the WHO dataset with UNICEF, it is safe to say that BTSX is the code for the combined IMR of both sexes (male and female): the BTSX values match UNICEF's `_T: Total` values

In [37]:
# Keep only the columns of interest, then restrict the frame to the
# combined-sex (BTSX) rows for 2018.
who = who[who_cols]

# Both masks are built from the same frame and share its index.
sex_filter = who['SEX'] == 'BTSX'
year_filter = who['YEAR'] == 2018

# Apply the two conditions together in a single selection.  Filtering by
# sex first and then indexing the shrunken frame with `year_filter` would
# use a mask whose index no longer matches the frame, which makes pandas
# emit "Boolean Series key will be reindexed to match DataFrame index".
# `.copy()` gives an independent frame so later column assignments do not
# operate on a slice of the original.
who = who.loc[sex_filter & year_filter].copy()
who.head()

Unnamed: 0,GHO,YEAR,REGION,COUNTRY,SEX,Display Value,Numeric,Low,High
0,MDG_0000000001,2018,EMR,AFG,BTSX,48.0,48.04333,39.9443,56.32006
3,MDG_0000000001,2018,AFR,AGO,BTSX,51.9,51.87144,28.2057,87.37608
6,MDG_0000000001,2018,EUR,ALB,BTSX,8.5,8.47357,8.06406,8.91675
9,MDG_0000000001,2018,EUR,AND,BTSX,2.9,2.94139,0.86802,10.54429
12,MDG_0000000001,2018,EMR,ARE,BTSX,6.5,6.50609,5.75564,7.35071


## Step 2: Examine UNICEF Data

In [16]:
# Peek at the first five rows of the raw UNICEF frame.
unicef.head(n=5)

Unnamed: 0,DATAFLOW,REF_AREA:Geographic area,INDICATOR:Indicator,SEX:Sex,TIME_PERIOD:Time period,OBS_VALUE:Observation Value,UNIT_MULTIPLIER:Unit multiplier,UNIT_MEASURE:Unit of measure,OBS_STATUS:Observation Status,OBS_CONF:Observation confidentaility,...,WGTD_SAMPL_SIZE:Weighted Sample Size,OBS_FOOTNOTE:Observation footnote,SERIES_FOOTNOTE:Series footnote,DATA_SOURCE:Data Source,SOURCE_LINK:Citation of or link to the data source,CUSTODIAN:Custodian,TIME_PERIOD_METHOD:Time period activity related to when the data are collected,REF_PERIOD:Reference Period,COVERAGE_TIME:The period of time for which data are provided,AGE:Current age
0,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AFG: Afghanistan,CME_MRY0: Infant mortality rate,F: Female,2018,44.55944,,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,,...,,,,,,,,,,_T: Total
1,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AFG: Afghanistan,CME_MRY0: Infant mortality rate,M: Male,2018,51.347098,,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,,...,,,,,,,,,,_T: Total
2,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AFG: Afghanistan,CME_MRY0: Infant mortality rate,_T: Total,2018,48.043335,,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,,...,,,,,,,,,,_T: Total
3,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ALB: Albania,CME_MRY0: Infant mortality rate,F: Female,2018,7.5519,,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,,...,,,,,,,,,,_T: Total
4,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ALB: Albania,CME_MRY0: Infant mortality rate,M: Male,2018,9.34091,,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,,...,,,,,,,,,,_T: Total


## The data and column names in UNICEF are much more specific, which is both good and bad: they are descriptive, but harder to work with.  We're going to reformat some of them to make it easier to join the data, and we will keep only the columns we find relevant for the analysis.

In [23]:
# Split the combined "CODE: Name" column (e.g. "AFG: Afghanistan") into a
# code column and a name column.
#   * n=1 splits on the first colon only, so a label that itself contains a
#     colon cannot produce a third column.
#   * .str.strip() removes the space that follows the colon, which would
#     otherwise be left at the start of every name.
area_parts = unicef['REF_AREA:Geographic area'].str.split(":", n=1, expand=True)
unicef['REF_AREA'] = area_parts[0].str.strip()
unicef['Geographic area'] = area_parts[1].str.strip()

In [25]:
# Columns kept from the UNICEF frame, in display order.  'REF_AREA' and
# 'Geographic area' are the two columns derived from 'REF_AREA:Geographic area'.
unicef_cols = [
    'DATAFLOW',
    'REF_AREA',
    'Geographic area',
    'INDICATOR:Indicator',
    'SEX:Sex',
    'TIME_PERIOD:Time period',
    'OBS_VALUE:Observation Value',
    'UNIT_MEASURE:Unit of measure',
    'OBS_STATUS:Observation Status',
    'LOWER_BOUND:Lower Bound',
    'UPPER_BOUND:Upper Bound',
]

In [28]:
# Narrow the UNICEF frame to the chosen columns; copy so the result is
# independent of the wider frame it was taken from.
unicef = unicef.loc[:, unicef_cols].copy()

In [35]:
# Keep only the rows whose sex label is "_T: Total".
unicef_filter = unicef["SEX:Sex"].eq("_T: Total")
unicef = unicef.loc[unicef_filter]
unicef.head(n=5)

Unnamed: 0,DATAFLOW,REF_AREA,Geographic area,INDICATOR:Indicator,SEX:Sex,TIME_PERIOD:Time period,OBS_VALUE:Observation Value,UNIT_MEASURE:Unit of measure,OBS_STATUS:Observation Status,LOWER_BOUND:Lower Bound,UPPER_BOUND:Upper Bound
2,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AFG,Afghanistan,CME_MRY0: Infant mortality rate,_T: Total,2018,48.043335,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,39.944302,56.320057
5,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ALB,Albania,CME_MRY0: Infant mortality rate,_T: Total,2018,8.473569,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,8.064059,8.916751
8,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,DZA,Algeria,CME_MRY0: Infant mortality rate,_T: Total,2018,20.429315,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,19.960411,20.895252
11,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AND,Andorra,CME_MRY0: Infant mortality rate,_T: Total,2018,2.941386,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,0.868024,10.544288
14,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AGO,Angola,CME_MRY0: Infant mortality rate,_T: Total,2018,51.87144,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,28.205699,87.376078


In [30]:
# Show the UNICEF frame as it currently stands; the rendered output elides
# the middle rows and shows only the first and last few.
unicef

Unnamed: 0,DATAFLOW,REF_AREA,Geographic area,INDICATOR:Indicator,SEX:Sex,TIME_PERIOD:Time period,OBS_VALUE:Observation Value,UNIT_MEASURE:Unit of measure,OBS_STATUS:Observation Status,LOWER_BOUND:Lower Bound,UPPER_BOUND:Upper Bound
2,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AFG,Afghanistan,CME_MRY0: Infant mortality rate,_T: Total,2018,48.043335,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,39.944302,56.320057
5,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ALB,Albania,CME_MRY0: Infant mortality rate,_T: Total,2018,8.473569,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,8.064059,8.916751
8,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,DZA,Algeria,CME_MRY0: Infant mortality rate,_T: Total,2018,20.429315,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,19.960411,20.895252
11,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AND,Andorra,CME_MRY0: Infant mortality rate,_T: Total,2018,2.941386,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,0.868024,10.544288
14,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,AGO,Angola,CME_MRY0: Infant mortality rate,_T: Total,2018,51.871440,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,28.205699,87.376078
...,...,...,...,...,...,...,...,...,...,...,...
680,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,YEM,Yemen,CME_MRY0: Infant mortality rate,_T: Total,2018,43.639697,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,29.245932,63.183587
683,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ZMB,Zambia,CME_MRY0: Infant mortality rate,_T: Total,2018,43.364637,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,36.935594,50.962951
686,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,ZWE,Zimbabwe,CME_MRY0: Infant mortality rate,_T: Total,2018,39.273756,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,32.471826,47.225792
691,UNICEF:GLOBAL_DATAFLOW(1.0): Cross-sector indi...,UNICEF_SSA,sub-Saharan Africa,CME_MRY0: Infant mortality rate,_T: Total,2018,53.021824,D_PER_1000_B: Deaths per 1000 live births,A: Normal value,49.846879,57.927077
