# Revolutions — World Bank data cleaning
*Prepared by Matthew Chiu*

**This is a data cleaning notebook for the *Revolutions* project.**
- It cleans the data from the World Bank (a full list of variables is available separately).
- The data is taken from the Redl & Hlashtwayo (2021) paper: the variables are lifted from there.
- Year, countries and the specific series can all be updated as necessary.

In [211]:
!pip install wbgapi
import wbgapi as wb
import pandas as pd
import csv
def read_csv(filename):
    return pd.read_csv(filename).to_dict('records')

# Reading matching data
with open("country_codes.csv", 'r') as file:
    next(file)
    data = csv.reader(file)
    countries_df = pd.DataFrame(data)
countries_df.columns = column_names = ["Country", "geo", "economy", "number"]
countries_df = countries_df.drop(columns="number")
countries_df



Unnamed: 0,Country,geo,economy
0,Afghanistan,AF,AFG
1,Albania,AL,ALB
2,Algeria,DZ,DZA
3,American Samoa,AS,ASM
4,Andorra,AD,AND
...,...,...,...
244,Western Sahara,EH,ESH
245,Yemen,YE,YEM
246,Zambia,ZM,ZMB
247,Zimbabwe,ZW,ZWE


In [2]:
# test dataframe
test_df = wb.data.DataFrame('SP.POP.TOTL', wb.region.members('AFR'), range(2010, 2020, 2))
test_df.to_csv(r'/Users/matthewchiu/Desktop/LSESU DSS Revolutions project/economic_data.csv')

In [6]:
WB_database_codes = read_csv('WB_categories.csv')
data_categories = []
i = 0
for element in WB_database_codes:
    data_categories.append(WB_database_codes[i]['Code'])
    i += 1
print(data_categories)

# missing (to add: Life expectancy at birth, female less male; Unemployment, youth male less female)

['SN.ITK.MSFI.ZS', 'SI.POV.MDIM', 'SI.POV.MDIM.XQ', 'SI.POV.LMIC.GP', 'SI.POV.MDIM.IT', 'EN.ATM.CO2E.PP.GD.KD', 'FR.INR.RINR', 'GC.XPN.INTP.RV.ZS', 'FR.INR.LNDP', 'NY.GDP.DEFL.KD.ZG', 'BX.KLT.DINV.WD.GD.ZS', 'NY.GDP.MKTP.KD', 'NY.GDP.PCAP.KD', 'GC.DOD.TOTL.GD.ZS', 'BN.CAB.XOKA.GD.ZS', 'MS.MIL.XPND.GD.ZS', 'GC.TAX.TOTL.GD.ZS', 'PX.REX.REER', 'PA.NUS.FCRF', 'PA.NUS.PPPC.RF', 'EG.ELC.ACCS.ZS', 'SH.H2O.BASW.ZS', 'SH.STA.BASS.ZS', 'EG.CFT.ACCS.ZS', 'SH.MED.CMHW.P3', 'SH.XPD.CHEX.PP.CD', 'SH.MED.BEDS.ZS', 'SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN', 'SH.MED.NUMW.P3', 'SH.MED.PHYS.ZS', 'SI.POV.GINI', 'SI.DST.10TH.10', 'SI.DST.FRST.10', 'SE.ENR.PRSC.FM.ZS', 'FX.OWN.TOTL.40.ZS', 'SE.ADT.1524.LT.FM.ZS', 'SL.UEM.1524.MA.ZS', 'SL.UEM.1524.FE.ZS', 'SL.WAG.0714.ZS', 'SL.UEM.TOTL.ZS', 'SL.UEM.INTM.ZS', 'ST.INT.ARVL', 'ST.INT.DPRT', 'SE.ADT.LITR.ZS', 'SE.ADT.1524.LT.ZS', 'SE.TER.CUAT.BA.MA.ZS', 'SE.TER.CUAT.BA.ZS', 'SE.TER.CUAT.BA.FE.ZS', 'EN.URB.LCTY', 'EN.URB.MCTY.TL.ZS', 'EN.POP.DNST', 'EN.POP.SLUM.UR

In [7]:
# getting data from WB
full_dataset_WB = wb.data.DataFrame(data_categories, time=range(2010,2021), labels=True, columns='series')

In [230]:
# drop regional statistics (only keep individual country statistics)
full_dataset_WB_edited = full_dataset_WB.drop(["AFE", "AFW", "ARB", "CSS", "CEB", "EAR", "EAS", "EAP", "TEA", "EMU", "ECS", "ECA", "TEC", "EUU", "FCS", "HPC", "HIC", "IBD", "IBT", "IDB", "IDX", "IDA", "LTE", "LCN", "LAC", "TLA", "LDC", "LMY", "LIC", "LMC", "MEA", "MNA", "TMN", "MIC", "NAC", "INX", "OED", "OSS", "PSS", "PST", "PRE", "SST", "SAS", "TSA", "SSF", "SSA", "TSS", "UMC", "WLD"],axis=0)
full_dataset_WB_edited

# Creating a moving average
full_dataset_WB_edited["Time"] = pd.to_datetime(full_dataset_WB_edited["Time"], format='%Y')
full_dataset_WB_edited = pd.DataFrame.merge(full_dataset_WB_edited, countries_df, on=["Country"], how="left")
full_dataset_WB_edited.to_csv(r'full_dataset_WB_edited.csv')

In [231]:
full_dataset_WB_edited.set_index("Time", inplace=True) # sets time column as the index
df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('MS', loffset='6m').mean().ffill().rolling(12).mean()) # creates the rolling average. (with a one year offset)

# Match on countries
df_monthly = df_monthly.reset_index() # reset index
df_monthly_merged = pd.merge(df_monthly_merged, df_monthly, on=["Country", "Time"], how="left")
df_monthly_merged = pd.DataFrame.merge(df_monthly, countries_df, on="Country", how="left")
df_monthly_merged["Time"] = pd.to_datetime(df_monthly_merged["Time"]) + pd.Timedelta(days=1) # shift by one day


# Export
df_monthly_merged.to_csv(r'df_monthly.csv')
df_monthly_merged


>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('MS', loffset='6m').mean().ffill().rolling(12).mean()) # creates the rolling average. (with a one year offset)

>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  df_monthly = full_dataset_WB_edited.groupby("Country").apply(lambda x: x.resample('MS', loffset='6m').mean().ffill().rolling(12).mean()) # creates the rolling average. (with a one year offset)

>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

 

Unnamed: 0,Country,Time,BN.CAB.XOKA.GD.ZS,BX.KLT.DINV.WD.GD.ZS,EG.CFT.ACCS.ZS,EG.ELC.ACCS.ZS,EN.ATM.CO2E.PP.GD.KD,EN.POP.DNST,EN.POP.SLUM.UR.ZS,EN.URB.LCTY,...,SL.WAG.0714.ZS,SM.POP.TOTL.ZS,SN.ITK.MSFI.ZS,SP.DYN.LE00.FE.IN,SP.DYN.LE00.MA.IN,SP.URB.TOTL.IN.ZS,ST.INT.ARVL,ST.INT.DPRT,geo,economy
0,Afghanistan,2010-07-01,,,,,,,,,...,,,,,,,,,AF,AFG
1,Afghanistan,2010-08-01,,,,,,,,,...,,,,,,,,,AF,AFG
2,Afghanistan,2010-09-01,,,,,,,,,...,,,,,,,,,AF,AFG
3,Afghanistan,2010-10-01,,,,,,,,,...,,,,,,,,,AF,AFG
4,Afghanistan,2010-11-01,,,,,,,,,...,,,,,,,,,AF,AFG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26252,Zimbabwe,2020-03-01,2.152290,1.382534,30.050000,46.479268,0.346385,39.495934,22.163560,1.519737e+06,...,,2.556383,69.025000,63.644250,58.725000,32.209750,2.365500e+06,3028250.0,ZW,ZWE
26253,Zimbabwe,2020-04-01,2.840231,1.302625,30.066667,46.580004,0.346789,39.561081,22.163560,1.520262e+06,...,,2.556383,69.283333,63.649833,58.698000,32.209833,2.341667e+06,3110500.0,ZW,ZWE
26254,Zimbabwe,2020-05-01,3.528173,1.222715,30.083333,46.680739,0.347193,39.626227,22.163560,1.520786e+06,...,,2.556383,69.541667,63.655417,58.671000,32.209917,2.317833e+06,3192750.0,ZW,ZWE
26255,Zimbabwe,2020-06-01,4.216114,1.142806,30.100000,46.781475,0.347597,39.691374,22.163560,1.521311e+06,...,,2.556383,69.800000,63.661000,58.644000,32.210000,2.294000e+06,3275000.0,ZW,ZWE


In [15]:
wb.topic.info()

id,value
1.0,Agriculture & Rural Development
2.0,Aid Effectiveness
3.0,Economy & Growth
4.0,Education
5.0,Energy & Mining
6.0,Environment
7.0,Financial Sector
8.0,Health
9.0,Infrastructure
10.0,Social Protection & Labor
