## Datasets

In [1]:
import pandas as pd
import data_cleaner as clean

In [2]:
fertility = pd.read_csv('../data/UNICEF-FERTILITY-RATE-2018.csv')
pregnant_women_dr_visits2018_4x = pd.read_csv("../data/UNICEF-PREGNANT-WOMEN-WHO-VISIT-DOCTOR-AT-LEAST-FOUR-TIMES-%UNIT-2018.csv")
imr2018 = pd.read_csv("../data/UNICEF-IMR-2018.csv")
pregnant_women_dr_visits2018_1x = pd.read_csv("../data/UNICEF-PREGNANT-WOMEN-WHO-VISIT-DOCTOR-AT-LEAST-ONCE-%UNIT-2018.csv")
infant_death2018 = pd.read_csv("../data/UNICEF-INFANT-DEATH-2018.csv")
low_birth_wt2015 = pd.read_csv("../data/UNICEF-PREVALENCE-OF-LOW-BIRTH-WEIGHT-2015.csv")
#links = pd.read_excel("../data/UNICEF-LINKS.xlsx")
vaccine_coverage2018 = pd.read_csv("../data/UNICEF-VACCINE-COVERAGE-BY-ANTIGEN-2018.csv")
maternal_death2017 = pd.read_csv("../data/UNICEF-MATERNAL-DEATH-2017.csv")
#unicef_who_joint_immun_report2016 = pd.read_excel("../data/UNICEF-WHO-IMMUNIZATION-JOINT-REPORT-2016.xls")
births2018 = pd.read_csv("../data/UNICEF-NUMBER-OF-BIRTH-2018.csv")

In [3]:
datasets = [
    fertility,  # DATAFLOW, INDICATOR:Indicator, SEX:Sex, TIME_PERIOD:Time period, OBS_VALUE:Observation Value, UNIT_MULTIPLIER:Unit multiplier, UNIT_MEASURE: Unit of measure, DATA_SOURCE:Data Source, REF_AREA, Geographic area
    pregnant_women_dr_visits2018_4x,
    pregnant_women_dr_visits2018_1x,
    imr2018,  # DATAFLOW, REF_AREA, Geographic area, INDICATOR:Indicator, SEX:Sex, TIME_PERIOD:Time period, OBS_VALUE:Observation Value, UNIT_MULTIPLIER:Unit multiplier, UNIT_MEASURE:Unit of measure, OBS_STATUS:Obs Status...
    infant_death2018,
    low_birth_wt2015,
    vaccine_coverage2018,
    maternal_death2017,
    births2018
]

In [4]:
for df in datasets:
    df.dropna(how="all", axis=1, inplace=True)
    if "REF_AREA:Geographic area" in df.columns:
        clean.split_cols(df, "REF_AREA:Geographic area")
    #print(df.describe())

## CLEAN & JOIN DATAFRAMES INTO ONE TABLE WITH ALL DATA

In [5]:
# Filter for only total readings (don't split by male and female for now)
gender_filter = imr2018['SEX:Sex'] == '_T: Total'
imr2018 = imr2018[gender_filter]

In [6]:
# Join Fertility & IMR
fertility.rename(columns = {"OBS_VALUE:Observation Value":"Total_Fertility_Rate_%"}, inplace=True)
fertility_selection = fertility[["Total_Fertility_Rate_%", "REF_AREA", "Geographic area"]]
imr2018.rename(columns = {"OBS_VALUE:Observation Value": "Infant_Mortality_Rate"}, inplace=True)
imr2018.rename(columns={"LOWER_BOUND:Lower Bound": "IMR_Lower_Bound"}, inplace=True)
imr2018.rename(columns={"UPPER_BOUND:Upper Bound": "IMR_Upper_Bound"}, inplace=True)

imr2018_selection = imr2018[["REF_AREA", "Geographic area", "Infant_Mortality_Rate", 
                             "IMR_Lower_Bound", "IMR_Upper_Bound", "SEX:Sex"]]

joined = imr2018_selection.merge(fertility_selection, how="left", sort=False, on=["REF_AREA", "Geographic area"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
joined.head()

Unnamed: 0,REF_AREA,Geographic area,Infant_Mortality_Rate,IMR_Lower_Bound,IMR_Upper_Bound,SEX:Sex,Total_Fertility_Rate_%
0,AFG,Afghanistan,48.043335,39.944302,56.320057,_T: Total,4.473
1,ALB,Albania,8.473569,8.064059,8.916751,_T: Total,1.617
2,DZA,Algeria,20.429315,19.960411,20.895252,_T: Total,3.023
3,AND,Andorra,2.941386,0.868024,10.544288,_T: Total,
4,AGO,Angola,51.87144,28.205699,87.376078,_T: Total,5.519


In [8]:
# Filter only for combined readings (don't split by Male/Female)
gender_filter = infant_death2018['SEX:Sex'] == '_T: Total'
infant_death2018 = infant_death2018[gender_filter]

In [9]:
# Join Infant Deaths
infant_death2018.rename(columns = {"OBS_VALUE:Observation Value": "Number_of_Infant_Deaths"}, inplace=True)
infant_death2018.rename(columns={"LOWER_BOUND:Lower Bound": "Infant_Death_Lower_Bound"}, inplace=True)
infant_death2018.rename(columns={"UPPER_BOUND:Upper Bound": "Infant_Death_Upper_Bound"}, inplace=True)
infant_death2018_selection = infant_death2018[["Number_of_Infant_Deaths", "Infant_Death_Lower_Bound", "Infant_Death_Upper_Bound", "SEX:Sex", "REF_AREA", "Geographic area"]]
joined = joined.merge(infant_death2018_selection, how='left', sort=False, on=['REF_AREA', "Geographic area", "SEX:Sex"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
joined.head()

Unnamed: 0,REF_AREA,Geographic area,Infant_Mortality_Rate,IMR_Lower_Bound,IMR_Upper_Bound,SEX:Sex,Total_Fertility_Rate_%,Number_of_Infant_Deaths,Infant_Death_Lower_Bound,Infant_Death_Upper_Bound
0,AFG,Afghanistan,48.043335,39.944302,56.320057,_T: Total,4.473,57394,47776,67195
1,ALB,Albania,8.473569,8.064059,8.916751,_T: Total,1.617,289,275,304
2,DZA,Algeria,20.429315,19.960411,20.895252,_T: Total,3.023,20875,20397,21350
3,AND,Andorra,2.941386,0.868024,10.544288,_T: Total,,2,1,6
4,AGO,Angola,51.87144,28.205699,87.376078,_T: Total,5.519,64019,34947,107229


In [11]:
joined.describe()

Unnamed: 0,Infant_Mortality_Rate,IMR_Lower_Bound,IMR_Upper_Bound,Total_Fertility_Rate_%,Number_of_Infant_Deaths,Infant_Death_Lower_Bound,Infant_Death_Upper_Bound
count,231.0,231.0,231.0,184.0,231.0,231.0,231.0
mean,21.747269,17.173082,28.176669,2.734054,113230.2,105022.1,124329.6
std,19.135792,15.15749,26.145611,1.270329,409737.3,390004.0,438517.5
min,1.552569,0.825851,1.906501,1.11,0.0,0.0,1.0
25%,6.152161,5.323662,7.427986,1.74125,291.5,264.5,318.5
50%,14.852143,11.651276,19.461561,2.273,3192.0,2775.0,3990.0
75%,33.17007,26.747144,42.518052,3.622,31476.5,25961.5,35915.5
max,83.441195,73.039362,138.356631,6.913,4010099.0,3880481.0,4215378.0


In [12]:
# Join # of Births
births2018.rename(columns={"OBS_VALUE": "Number_of_Births_Thousands"}, inplace=True)
births2018_selection = births2018[["Geographic area", "Number_of_Births_Thousands"]]
joined = joined.merge(births2018_selection, how='left', sort=False, on="Geographic area") # -- FIXME THIS ISN'T ADDING CORRECTLY

In [13]:
joined.describe()

Unnamed: 0,Infant_Mortality_Rate,IMR_Lower_Bound,IMR_Upper_Bound,Total_Fertility_Rate_%,Number_of_Infant_Deaths,Infant_Death_Lower_Bound,Infant_Death_Upper_Bound,Number_of_Births_Thousands
count,231.0,231.0,231.0,184.0,231.0,231.0,231.0,184.0
mean,21.747269,17.173082,28.176669,2.734054,113230.2,105022.1,124329.6,759.858902
std,19.135792,15.15749,26.145611,1.270329,409737.3,390004.0,438517.5,2335.010181
min,1.552569,0.825851,1.906501,1.11,0.0,0.0,1.0,1.475
25%,6.152161,5.323662,7.427986,1.74125,291.5,264.5,318.5,46.314
50%,14.852143,11.651276,19.461561,2.273,3192.0,2775.0,3990.0,166.243
75%,33.17007,26.747144,42.518052,3.622,31476.5,25961.5,35915.5,635.25575
max,83.441195,73.039362,138.356631,6.913,4010099.0,3880481.0,4215378.0,24164.357


## Datasets to hold off on or explore separately

Because of how few datapoints are present in the pregnant women doctor visits datasets, we will hold off joining them for now with the larger dataset, and focus on the other datasets for now.  We may come back to these, but they have very little coverage compared to the rest of the data (31 / ~230), which will throw off the rest of the analyses

In [14]:
# # Join # of DR Visits
# pregnant_women_dr_visits2018_4x.rename(columns={"OBS_VALUE:Observation Value": "Women_4+_dr_visits_%"}, inplace = True)
# pregnant_women_dr_visits2018_4x_selection = pregnant_women_dr_visits2018_4x[["Women_4+_dr_visits_%","REF_AREA", "Geographic area"]]
# pregnant_women_dr_visits2018_1x.rename(columns={"OBS_VALUE:Observation Value": "Women_1+_dr_visit_%"}, inplace = True)
# pregnant_women_dr_visits2018_1x_selection = pregnant_women_dr_visits2018_1x[["Women_1+_dr_visit_%", "REF_AREA", "Geographic area"]]
# joined = joined.merge(pregnant_women_dr_visits2018_4x_selection, how='left', sort=False, on=['REF_AREA', "Geographic area"])
# joined = joined.merge(pregnant_women_dr_visits2018_1x_selection, how='left', sort=False, on=['REF_AREA', "Geographic area"])

We're going to hold off and do vaccine coverage separately as well because it has 2410 rows as opposed to the 231 we have been working with, which will once again massively throw off the joining of datasets

In [15]:
# # Join Vaccine coverage
# vaccine_coverage2018.rename(columns = {"INDICATOR:Indicator":"Vaccine_Type", "OBS_VALUE:Observation Value":"%_vaccinated", "AGE:Current age":"Vaccinated_current_age_2018"}, inplace=True)
# vaccine_coverage2018_selection = vaccine_coverage2018[["Vaccine_Type", "%_vaccinated", "Vaccinated_current_age_2018", "SEX:Sex", "REF_AREA", "Geographic area"]]
# joined = joined.merge(vaccine_coverage2018_selection, how='left', sort=False, on=["SEX:Sex", "REF_AREA", "Geographic area"])

We're holding off on Maternal death because it is 2017 data, and should be compared with 2017 IMR data, not 2018.

In [17]:
# # Join Maternal death
# maternal_death2017.rename(columns = {"OBS_VALUE:Observation Value":"Est_Maternal_Deaths"}, inplace=True)
# maternal_death2017_selection = maternal_death2017[["Est_Maternal_Deaths", "REF_AREA", "Geographic area"]]
# joined = joined.merge(maternal_death2017_selection, how='left', on=['REF_AREA', "Geographic area"])

We're holding off on low birth weight data because it is 2015 data, and should be compared with 2015 IMR data, not 2018

In [19]:
# # Join Low Birth Weight
# low_birth_wt2015.rename(columns = {"OBS_VALUE:Observation Value": "Low_birth_weight_%", "LOWER_BOUND:Lower Bound": "Low_birth_weight_lower", "UPPER_BOUND:Upper Bound":"Low_birth_weight_upper"}, inplace=True)
# low_birth_wt2015_selection = low_birth_wt2015[["Low_birth_weight_%","Low_birth_weight_lower", "Low_birth_weight_upper", "REF_AREA", "Geographic area"]]
# joined = joined.merge(low_birth_wt2015_selection, how='left', on=['REF_AREA','Geographic area'])