## Load packages 

In [1]:
import pandas as pd
import altair as alt
from vega_datasets import data
import pandas_profiling
import seaborn as sns

# Need to enable this to allow work with larger datasets (https://altair-viz.github.io/user_guide/faq.html)
alt.data_transformers.enable('json')

# Need to enable this to render in notebook
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

## Loading datasets

In [12]:
# World Bank data
raw_df = pd.read_csv('../data/GDIMMay2018.csv')
raw_df.head()

Unnamed: 0,countryname,wbcode,iso3,region,incgroup2,incgroup4,fragile,survey,year,status,...,Cores2125_MAcatC1,Shortfall0611_obs,Shortfall0611_IGP,Shortfall1217_obs,Shortfall1217_IGP,IGEincome,S1,S2,S3,MLD_psu
0,Afghanistan,AFG,AFG,South Asia,Developing economies,Low income,1,NRVA,1980,Co-residents only,...,,25103.0,0.086197,18054.0,0.345224,,,,,0.1
1,Afghanistan,AFG,AFG,South Asia,Developing economies,Low income,1,NRVA,1980,Co-residents only,...,,12107.0,0.083271,8538.0,0.389952,,,,,0.1
2,Afghanistan,AFG,AFG,South Asia,Developing economies,Low income,1,NRVA,1980,Co-residents only,...,,12996.0,0.089161,9516.0,0.307687,,,,,0.1
3,Afghanistan,AFG,AFG,South Asia,Developing economies,Low income,1,NRVA,1980,Co-residents only,...,,25396.0,0.050447,18387.0,0.218062,,,,,0.1
4,Afghanistan,AFG,AFG,South Asia,Developing economies,Low income,1,NRVA,1980,Co-residents only,...,,12246.0,0.047961,8677.0,0.230909,,,,,0.1


In [13]:
# country code data to be able to plot on choropleth
country_code = pd.read_csv('https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv')

# rename column so it can be joined later 
country_code = country_code.rename(columns={"Three_Letter_Country_Code": 'iso3'})
country_code.head()

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,iso3,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


## Wrangling

In [14]:
# drop rows where we don't have education mobility info, and drop unneeded columns
raw_df = raw_df.dropna(subset=['IGP', 'year'])
# keep relevant columns, group by variable categories and report mean IGP (education persistance)
ed_mob_df = ed_mob_df.groupby(['countryname', 'iso3', 'region', 'incgroup2', 'incgroup4', 'year', 'child']).agg({'IGP': 'mean'})
ed_mob_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,IGP
countryname,iso3,region,incgroup2,incgroup4,year,child,Unnamed: 7_level_1
Afghanistan,AFG,South Asia,Developing economies,Low income,1980,all,0.621688
Afghanistan,AFG,South Asia,Developing economies,Low income,1980,daughter,0.669084
Afghanistan,AFG,South Asia,Developing economies,Low income,1980,son,0.622239
Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,1940,all,0.593197
Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,1940,daughter,0.555591
...,...,...,...,...,...,...,...
"Yemen, Rep.",YEM,Middle East & North Africa,Developing economies,Lower middle income,1980,daughter,0.584839
"Yemen, Rep.",YEM,Middle East & North Africa,Developing economies,Lower middle income,1980,son,0.250859
Zambia,ZMB,Sub-Saharan Africa,Developing economies,Lower middle income,1980,all,0.408004
Zambia,ZMB,Sub-Saharan Africa,Developing economies,Lower middle income,1980,daughter,0.419393


In the next cell, I convert the `IGP` (intergenerational persistance) to a new 'education mobility index' (`EMI`) where 1 represents a high mobility and 0 represents the lowest mobility

In [16]:
#find max and min IGP, then calculate EMI by mapping min IGP to 1 and max IGP to 0
max_igp = ed_mob_df['IGP'].max()
min_igp = ed_mob_df['IGP'].min()
#scale 
ed_mob_df['EMI'] = (1 / (min_igp - max_igp))* (ed_mob_df['IGP'] - max_igp)
ed_mob_df.reset_index(inplace=True)
ed_mob_df.head()

Unnamed: 0,index,countryname,iso3,region,incgroup2,incgroup4,year,child,IGP,EMI
0,0,Afghanistan,AFG,South Asia,Developing economies,Low income,1980,all,0.621688,0.723749
1,1,Afghanistan,AFG,South Asia,Developing economies,Low income,1980,daughter,0.669084,0.708083
2,2,Afghanistan,AFG,South Asia,Developing economies,Low income,1980,son,0.622239,0.723567
3,3,Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,1940,all,0.593197,0.733167
4,4,Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,1940,daughter,0.555591,0.745598


Make each year a column so can merge with country codes and assign NaN values a unique plotting value

In [26]:
ed_mob_wide = pd.pivot_table(ed_mob_df,
                             values=['EMI'],
                             index=['countryname', 'iso3', 'region', 'incgroup2', 'incgroup4', 'child'],
                             columns='year')
#save a version with NaN in case want to use it later
ed_mob_wide_NaN = ed_mob_wide

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,EMI,EMI,EMI,EMI,EMI
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,year,1940,1950,1960,1970,1980
countryname,iso3,region,incgroup2,incgroup4,child,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Afghanistan,AFG,South Asia,Developing economies,Low income,all,,,,,0.723749
Afghanistan,AFG,South Asia,Developing economies,Low income,daughter,,,,,0.708083
Afghanistan,AFG,South Asia,Developing economies,Low income,son,,,,,0.723567
Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,all,0.733167,0.791035,0.804456,0.780659,0.797828
Albania,ALB,Europe & Central Asia,Developing economies,Upper middle income,daughter,0.745598,0.761773,0.795958,0.770796,0.793776
...,...,...,...,...,...,...,...,...,...,...
"Yemen, Rep.",YEM,Middle East & North Africa,Developing economies,Lower middle income,daughter,,,,,0.735930
"Yemen, Rep.",YEM,Middle East & North Africa,Developing economies,Lower middle income,son,,,,,0.846327
Zambia,ZMB,Sub-Saharan Africa,Developing economies,Lower middle income,all,,,,,0.794382
Zambia,ZMB,Sub-Saharan Africa,Developing economies,Lower middle income,daughter,,,,,0.790618


## Plotting