# Revolutions — World Bank data cleaning
*Prepared by Matthew Chiu*

**This is a data cleaning notebook for the *Revolutions* project.**
- It cleans the data from the World Bank (a full list of variables is available separately).
- The data is taken from the Redl & Hlashtwayo (2021) paper: the variables are lifted from there.
- Year, countries and the specific series can all be updated as necessary.

In [5]:
!pip install wbgapi
import wbgapi as wb
import pandas as pd
def read_csv(filename):
    return pd.read_csv(filename).to_dict('records')



In [2]:
# test dataframe
test_df = wb.data.DataFrame('SP.POP.TOTL', wb.region.members('AFR'), range(2010, 2020, 2))
test_df.to_csv(r'/Users/matthewchiu/Desktop/LSESU DSS Revolutions project/economic_data.csv')

In [6]:
WB_database_codes = read_csv('WB_categories.csv')
data_categories = []
i = 0
for element in WB_database_codes:
    data_categories.append(WB_database_codes[i]['Code'])
    i += 1
print(data_categories)

# missing (to add: Life expectancy at birth, female less male; Unemployment, youth male less female)

['SN.ITK.MSFI.ZS', 'SI.POV.MDIM', 'SI.POV.MDIM.XQ', 'SI.POV.LMIC.GP', 'SI.POV.MDIM.IT', 'EN.ATM.CO2E.PP.GD.KD', 'FR.INR.RINR', 'GC.XPN.INTP.RV.ZS', 'FR.INR.LNDP', 'NY.GDP.DEFL.KD.ZG', 'BX.KLT.DINV.WD.GD.ZS', 'NY.GDP.MKTP.KD', 'NY.GDP.PCAP.KD', 'GC.DOD.TOTL.GD.ZS', 'BN.CAB.XOKA.GD.ZS', 'MS.MIL.XPND.GD.ZS', 'GC.TAX.TOTL.GD.ZS', 'PX.REX.REER', 'PA.NUS.FCRF', 'PA.NUS.PPPC.RF', 'EG.ELC.ACCS.ZS', 'SH.H2O.BASW.ZS', 'SH.STA.BASS.ZS', 'EG.CFT.ACCS.ZS', 'SH.MED.CMHW.P3', 'SH.XPD.CHEX.PP.CD', 'SH.MED.BEDS.ZS', 'SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN', 'SH.MED.NUMW.P3', 'SH.MED.PHYS.ZS', 'SI.POV.GINI', 'SI.DST.10TH.10', 'SI.DST.FRST.10', 'SE.ENR.PRSC.FM.ZS', 'FX.OWN.TOTL.40.ZS', 'SE.ADT.1524.LT.FM.ZS', 'SL.UEM.1524.MA.ZS', 'SL.UEM.1524.FE.ZS', 'SL.WAG.0714.ZS', 'SL.UEM.TOTL.ZS', 'SL.UEM.INTM.ZS', 'ST.INT.ARVL', 'ST.INT.DPRT', 'SE.ADT.LITR.ZS', 'SE.ADT.1524.LT.ZS', 'SE.TER.CUAT.BA.MA.ZS', 'SE.TER.CUAT.BA.ZS', 'SE.TER.CUAT.BA.FE.ZS', 'EN.URB.LCTY', 'EN.URB.MCTY.TL.ZS', 'EN.POP.DNST', 'EN.POP.SLUM.UR

In [7]:
# getting data from WB
full_dataset_WB = wb.data.DataFrame(data_categories, time=range(2010,2021), labels=True, columns='series')

In [64]:
# drop regional statistics (only keep individual country statistics)
full_dataset_WB_edited = full_dataset_WB.drop(["AFE", "AFW", "ARB", "CSS", "CEB", "EAR", "EAS", "EAP", "TEA", "EMU", "ECS", "ECA", "TEC", "EUU", "FCS", "HPC", "HIC", "IBD", "IBT", "IDB", "IDX", "IDA", "LTE", "LCN", "LAC", "TLA", "LDC", "LMY", "LIC", "LMC", "MEA", "MNA", "TMN", "MIC", "NAC", "INX", "OED", "OSS", "PSS", "PST", "PRE", "SST", "SAS", "TSA", "SSF", "SSA", "TSS", "UMC", "WLD"],axis=0)
full_dataset_WB_edited

Unnamed: 0_level_0,Unnamed: 1_level_0,Country,Time,BN.CAB.XOKA.GD.ZS,BX.KLT.DINV.WD.GD.ZS,EG.CFT.ACCS.ZS,EG.ELC.ACCS.ZS,EN.ATM.CO2E.PP.GD.KD,EN.POP.DNST,EN.POP.SLUM.UR.ZS,EN.URB.LCTY,...,SL.UEM.INTM.ZS,SL.UEM.TOTL.ZS,SL.WAG.0714.ZS,SM.POP.TOTL.ZS,SN.ITK.MSFI.ZS,SP.DYN.LE00.FE.IN,SP.DYN.LE00.MA.IN,SP.URB.TOTL.IN.ZS,ST.INT.ARVL,ST.INT.DPRT
economy,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ZWE,YR2020,Zimbabwe,2020,5.096583,0.699034,30.4,52.747669,,40.505793,21.56821,1529920.0,...,,7.898,,,73.0,63.862,58.129,32.242,639000.0,
ZWE,YR2019,Zimbabwe,2019,4.216114,1.142806,30.1,46.781475,0.347597,39.691374,,1521311.0,...,9.12,7.370,,,69.8,63.661,58.644,32.210,2294000.0,3275000.0
ZWE,YR2018,Zimbabwe,2018,-4.039183,2.101721,29.9,45.572647,0.342751,38.909614,22.16356,1515016.0,...,,6.784,,,66.7,63.594,58.968,32.209,2580000.0,2288000.0
ZWE,YR2017,Zimbabwe,2017,-1.542184,1.746885,29.8,44.178635,0.300613,38.131320,,1509901.0,...,,6.279,,,67.0,62.956,58.208,32.237,2423000.0,2768000.0
ZWE,YR2016,Zimbabwe,2016,-3.394138,1.669274,29.8,42.561729,0.333455,37.359969,22.75892,1504803.0,...,,5.796,,,66.7,62.416,57.956,32.296,2168000.0,3192000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AFG,YR2014,Afghanistan,2014,-15.731394,0.209119,26.1,89.500000,0.069557,50.160542,69.68829,3632269.0,...,10.62,7.910,,,,64.274,60.812,24.587,,
AFG,YR2013,Afghanistan,2013,-24.775921,0.234926,24.8,68.290649,0.087705,48.359028,,3543232.0,...,,7.949,,,,64.027,60.791,24.373,,
AFG,YR2012,Afghanistan,2012,-25.491342,0.281255,23.0,69.099998,0.124932,46.711251,67.24795,3456496.0,...,,8.019,,,,63.514,60.317,24.160,,
AFG,YR2011,Afghanistan,2011,-12.352229,0.286818,21.8,43.222019,0.155682,44.844851,,3371653.0,...,,7.947,17.37,,,62.993,59.825,23.948,,


In [65]:
# Creating a moving average
full_dataset_WB_edited["Time"] = pd.to_datetime(full_dataset_WB_edited["Time"], format='%Y')
full_dataset_WB_edited.set_index("Time", inplace=True) # sets time column as the index
df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('M', loffset='-1y').mean().ffill().rolling(12).mean()) # creates the rolling average. (with a one year offset)
full_dataset_WB_edited.to_csv(r'full_dataset_WB_edited.csv')
df_monthly.to_csv(r'df_monthly.csv')
df_monthly


>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('M', loffset='-1y').mean().ffill().rolling(12).mean()) # creates the rolling average

>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('M', loffset='-1y').mean().ffill().rolling(12).mean()) # creates the rolling average

>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  df_monthly = full_dataset_WB_edited.groupby("Countr

Unnamed: 0_level_0,Unnamed: 1_level_0,BN.CAB.XOKA.GD.ZS,BX.KLT.DINV.WD.GD.ZS,EG.CFT.ACCS.ZS,EG.ELC.ACCS.ZS,EN.ATM.CO2E.PP.GD.KD,EN.POP.DNST,EN.POP.SLUM.UR.ZS,EN.URB.LCTY,EN.URB.MCTY.TL.ZS,FR.INR.LNDP,...,SL.UEM.INTM.ZS,SL.UEM.TOTL.ZS,SL.WAG.0714.ZS,SM.POP.TOTL.ZS,SN.ITK.MSFI.ZS,SP.DYN.LE00.FE.IN,SP.DYN.LE00.MA.IN,SP.URB.TOTL.IN.ZS,ST.INT.ARVL,ST.INT.DPRT
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,2009-12-31,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,2009-12-31,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,2009-12-31,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,2009-12-31,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,2009-12-31,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2018-12-31,2.152290,1.382534,30.050000,46.479268,0.346385,39.495934,22.163560,1.519737e+06,9.947158,11.054792,...,9.165,7.223500,,2.556383,69.025000,63.644250,58.725000,32.209750,2.365500e+06,3028250.0
Zimbabwe,2018-12-31,2.840231,1.302625,30.066667,46.580004,0.346789,39.561081,22.163560,1.520262e+06,9.934054,11.772083,...,9.150,7.272333,,2.556383,69.283333,63.649833,58.698000,32.209833,2.341667e+06,3110500.0
Zimbabwe,2018-12-31,3.528173,1.222715,30.083333,46.680739,0.347193,39.626227,22.163560,1.520786e+06,9.920950,12.489375,...,9.135,7.321167,,2.556383,69.541667,63.655417,58.671000,32.209917,2.317833e+06,3192750.0
Zimbabwe,2018-12-31,4.216114,1.142806,30.100000,46.781475,0.347597,39.691374,22.163560,1.521311e+06,9.907847,13.206667,...,9.120,7.370000,,2.556383,69.800000,63.661000,58.644000,32.210000,2.294000e+06,3275000.0


In [15]:
wb.topic.info()

id,value
1.0,Agriculture & Rural Development
2.0,Aid Effectiveness
3.0,Economy & Growth
4.0,Education
5.0,Energy & Mining
6.0,Environment
7.0,Financial Sector
8.0,Health
9.0,Infrastructure
10.0,Social Protection & Labor
