In [1]:
import pandas as pd
import numpy as np

In [36]:
education = pd.read_csv("../Data/WorldBank/Education_WB/edData.csv")
educationSimple = pd.read_csv("../Data/WorldBank/WB_Edu_simple/646c5220-17f9-41f4-b39f-0f8c34130df6_Data.csv")
gdpInvestment = pd.read_csv("../Data/WorldBank/GDP-Investment_WB/9570f3c5-6012-43e4-980d-8ea2ece70445_Data.csv")
childMortality = pd.read_csv("../Data/WorldBank/childMortality_WB/8a867181-d49e-48fc-b846-4dc5e7550807_Data.csv")
taxes = pd.read_csv("../Data/WorldBank/WB_taxesOfGDP/a77adba3-031c-495c-a100-610c369cdff7_Data.csv")

In [37]:
def drop_non_year_column(titles):
    # Get Years in dataset
    drops = []
    for title in titles:
        if not title.isdigit() : drops.append(title)
    return drops

def cut_year_title_down(title):
    # world bank columns end with [YRxxxx] EG: 1970 [YR1970]
    # This can be cut a way easily
    if title[0].isdigit() : title = title[0:4]
    return title

def transpose_world_bank_data(worldBankData):
    result = []
    
    worldBankData = pd.DataFrame(worldBankData)
    worldBankData.columns = [cut_year_title_down(x) for x in worldBankData.columns]

    years = worldBankData.columns.drop(labels=drop_non_year_column(worldBankData.columns))
    grouped = worldBankData.groupby("Series Code")
    for (name, group) in grouped.__iter__():
        transposed = group.transpose()
        Resgrouped = []
        
        for (country) in transposed[:3]:
            countryData = transposed[country]
            for year in years:
                Resgrouped.append({
                    'year':int(year),
                    'country':countryData["Country Code"],
                    name:countryData[year],
                    })

        if len(result) > 0: 
            result = pd.DataFrame(result)
            new = pd.DataFrame(Resgrouped)
            result= result.merge(new, on=["year","country"], how="outer", sort=True)
        else:
            result = pd.DataFrame(Resgrouped)
    return result

In [38]:
education_data = transpose_world_bank_data(education)

gdpInvestment_data = transpose_world_bank_data(gdpInvestment)
total = education_data.merge(gdpInvestment_data, on=["year","country"], how="outer", sort=True)

educationSimple_data = transpose_world_bank_data(educationSimple)
total = total.merge(educationSimple_data, on=["year","country"], how="outer", sort=True)

childMortality_data = transpose_world_bank_data(childMortality)
total = total.merge(childMortality_data, on=["year","country"], how="outer", sort=True)

taxes_data = transpose_world_bank_data(taxes)
total = total.merge(taxes_data, on=["year","country"], how="outer", sort=True)

In [39]:
total = total.loc[~total.index.duplicated(),:].copy()

In [40]:
total.columns

Index(['year', 'country', 'NY.GDP.MKTP.CD_x', 'NY.GDP.MKTP.KD',
       'NY.GDP.MKTP.PP.CD', 'NY.GDP.MKTP.PP.KD', 'NY.GDP.PCAP.CD',
       'NY.GDP.PCAP.KD', 'NY.GDP.PCAP.PP.CD', 'NY.GDP.PCAP.PP.KD',
       'SE.XPD.TOTL.GD.ZS', 'UIS.ILLPOP.AG25T64', 'UIS.ILLPOP.AG25T64.F',
       'UIS.ILLPOP.AG25T64.M', 'UIS.ILLPOPF.AG25T64', 'UIS.LP.AG15T24',
       'UIS.LP.AG15T24.F', 'UIS.LP.AG15T24.M', 'UIS.LP.AG15T99',
       'UIS.LP.AG15T99.F', 'UIS.LP.AG15T99.M', 'UIS.LP.AG65', 'UIS.LP.AG65.F',
       'UIS.LP.AG65.M', 'UIS.LPP.AG15T24', 'UIS.LPP.AG15T99', 'UIS.LPP.AG65',
       'UIS.PLILLITP', 'UIS.PLILLITP.F', 'UIS.PLILLITP.M', 'UIS.XGDP.0.FSGOV',
       'UIS.XGDP.1.FSGOV', 'UIS.XGDP.2.FSGOV', 'UIS.XGDP.23.FSGOV',
       'UIS.XGDP.2T4.V.FSGOV', 'UIS.XGDP.3.FSGOV', 'UIS.XGDP.4.FSGOV',
       'UIS.XGDP.56.FSGOV', 'UIS.XUNIT.GDPCAP.02.FSGOV',
       'UIS.XUNIT.GDPCAP.1.FSGOV', 'UIS.XUNIT.GDPCAP.1.FSHH',
       'UIS.XUNIT.GDPCAP.2.FSGOV', 'UIS.XUNIT.GDPCAP.23.FSGOV',
       'UIS.XUNIT.GDPCAP.23.FSHH'

In [42]:
columnNames = {
    'NY.GDP.MKTP.CD_x':'GDP-currentDollar-outdated',
    'NY.GDP.MKTP.CD':'GDP-currentDollar',
    'NY.GDP.PCAP.CD':'GDP-PerCapita',
    'NY.GDP.PCAP.CD_x':'GDP-PerCapita-outdated',
    'NY.GDP.MKTP.KD':'GDP-In2015Dollars',
    'NY.GDP.PCAP.KD':'GDP-perCapital2015Dollar',
    'NY.GDP.PCAP.PP.KD':'GDP-perCapital2017Dollar',
    'NY.GDP.MKTP.PP.CD':'GDP-InCurrentDollars',
    'NY.GDP.MKTP.PP.KD':'GDP-In2017Dollars',
    'NY.GDP.PCAP.PP.CD':'GDP-perCapital-current',
    'NY.GDS.TOTL.ZS':'grossDomesticSavings',
    'SP.DYN.IMRT.IN':'childMortalityRatePer1000',
    'GC.TAX.TOTL.GD.ZS':'taxRevenueOfGDP',
    'NY.ADJ.ICTR.GN.ZS':'adjustedSavingsGrossSaving',
    'SE.XPD.TOTL.GD.ZS':'expenseOnEduOfGDP',
    'BN.KLT.DINV.CD':'foreigDirectInvestment',
    'BX.KLT.DINV.WD.GD.ZS':'foreigDirectInvestmentOfGDP',
    'GC.NFN.TOTL.GD.ZS':'netInvestmentOfGDP',
    
    'SE.ADT.LITR.ZS': 'adultIllit',
    'UIS.XGDP.56.FSGOV':'tertEduOfGDP',
    'UIS.XGDP.23.FSGOV':'seconEduOfGDP',
    'UIS.XGDP.1.FSGOV':'primerEduOfGDP',

    "UIS.LPP.Ag15t99":	"Adul_illit_pop_prec-female",
    "UIS.LP.Ag15t99":	"Adul_illit_pop_prec-both",
    "UIS.LP.Ag15t99.F":	"Adul_illit_pop_prec-female",
    "UIS.LP.Ag15t99.M":	"Adul_illit_pop_prec-male",
    "UIS.LP.Ag65.F":	"Elderly_illit_pop-female",
    "UIS.LP.Ag65.M":	"Elderly_illit_pop-male",
    "UIS.ILLPOPF.AG25T64":	"Illit_pop_workForce-female",
    "UIS.ILLPOP.AG25T64":	"Illit_pop_workForce",
    "UIS.ILLPOP.AG25T64.F":	"Illit_pop_workForce-female",
    "UIS.ILLPOP.AG25T64.M":	"Illit_pop_workForce-male",
    "UIS.PLILLITP":	"Partic_lit_prog-illit_pop",
    "UIS.PLILLITP.F":	"Partic_lit_prog_illit_pop-female",
    "UIS.PLILLITP.M":	"Partic_lit_prog_illit_pop-male",
    "UIS.LPP.Ag15t24":	"Youth_illit_pop_prec-female",
    "UIS.LP.Ag15t24":	"Youth_illit_pop",
    "UIS.LP.Ag15t24.F":	"Youth_illit_pop_female",
    "UIS.LP.Ag15t24.M":	"Youth_illit_pop_male",
    
    "UIS.XGDP.2.FSGOV":	"Gov_expe_second_edu-of-GDP",
    "UIS.XGDP.4.FSGOV":	"Gov_expe_secodPost_edu-of-GDP",
    "UIS.XGDP.0.FSGOV":	"Gov_expe_prePrim_edu-of-GDP",
    "UIS.XGDP.1.FSGOV":	"Gov_expe_prim_edu-of-GDP",

    "UIS.XGDP.2T4.V.FSGOV": "Gov_expe_secondTer-edu-ofGDP",
    "UIS.XGDP.23.FSGOV": "Gov_expe_second_educ-perc-GDP",
    "UIS.XGDP.56.FSGOV": "Gov_expe_tert_edu-perc-GDP",

    "UIS.XGDP.3.FSGOV": "Gov_expe on upper secondary education as a percentage of GDP (%)",
    "UIS.XUNIT.GDPCAP.2.FSGOV":	"inti_gov_fund- lower secondary student as a percentage of GDP per capita",
    "UIS.XUNIT.GDPCAP.02.FSGOV": "inti_gov_fund- pre-primary student as a percentage of GDP per capita",
    "UIS.XUNIT.GDPCAP.1.FSGOV":	"inti_gov_fund- primary student as a percentage of GDP per capita",
    "UIS.XUNIT.GDPCAP.23.FSGOV": "inti_gov_fund- secondary student as a percentage of GDP per capita",
    "UIS.XUNIT.GDPCAP.5T8.FSGOV": "inti_gov_fund- tertiary student as a percentage of GDP per capita",
    "UIS.XUNIT.GDPCAP.3.FSGOV": "inti_gov_fund- upper secondary student as a percentage of GDP per capita",

    "UIS.XUNIT.GDPCAP.23.FSHH": "inti_house-fund_seco_stud-per-GDP",
    "UIS.XUNIT.GDPCAP.1.FSHH": "inti_house-fund_prim_stud-per-GDP",
    "UIS.XUNIT.GDPCAP.5T8.FSHH": "inti_house-fund_tert_stud-per-GDP",

}
total = total.rename(columns=columnNames)

In [46]:
total = total.replace("..",np.nan)

In [47]:
total

Unnamed: 0,year,country,GDP-currentDollar-outdated,GDP-In2015Dollars,GDP-InCurrentDollars,GDP-In2017Dollars,GDP-PerCapita,GDP-perCapital2015Dollar,GDP-perCapital-current,GDP-perCapital2017Dollar,...,adultIllit,SP.POP.TOTL,UIS.LR.AG25T64,UIS.LR.AG25T64.URB,UIS.LR.AG65,UIS.LR.AG65T99.RUR,UIS.YADULT.PROFILITERACY,UIS.YADULT.PROFILITERACY.GPIA,childMortalityRatePer1000,taxRevenueOfGDP
0,1960,ABW,,,,,,,,,...,,,,,,,,,,
1,1960,AFE,,,,,,,,,...,,,,,,,,,,
2,1960,AFG,,,,,,,,,...,,,,,,,,,,
3,1960,AFW,,,,,,,,,...,,,,,,,,,,
4,1960,AGO,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16742,2021,XKX,,,,,,,,,...,,,,,,,,,,
16743,2021,YEM,,,,,,,,,...,,,,,,,,,,
16744,2021,ZAF,,,,,,,,,...,,,,,,,,,,
16745,2021,ZMB,,,,,,,,,...,,,,,,,,,,


In [48]:
# total.isna().sum().sort_values(ascending=False) / 16747
largeNan = total.isna().sum()[(total.isna().sum()/16747) > 0.9].keys()
doppredColumns = total.drop(columns=largeNan, axis=0)

doppredColumns.isna().sum().sort_values(ascending=False) / 16747

UIS.XUNIT.GDPCAP.02.FSGOV      0.865827
UIS.LR.AG65                    0.850302
UIS.LP.AG65                    0.849824
UIS.LPP.AG65                   0.849824
UIS.LP.AG65.M                  0.848988
UIS.LR.AG25T64                 0.848749
UIS.LP.AG65.F                  0.848331
Illit_pop_workForce            0.847614
Illit_pop_workForce-female     0.847614
Illit_pop_workForce-female     0.847555
SE.ADT.LITR.FE.ZS              0.847435
Illit_pop_workForce-male       0.847376
UIS.LPP.AG15T24                0.847316
UIS.LP.AG15T24.F               0.847316
UIS.LP.AG15T24                 0.847316
UIS.LP.AG15T24.M               0.847197
adultIllit                     0.847137
UIS.LP.AG15T99                 0.846122
UIS.LPP.AG15T99                0.846122
UIS.LP.AG15T99.F               0.846062
UIS.LP.AG15T99.M               0.845883
seconEduOfGDP                  0.821043
primerEduOfGDP                 0.820923
UIS.XGDP.0.FSGOV               0.815250
tertEduOfGDP                   0.803308


In [13]:
total.isna().sum()["GDP-PerCapita"]

6982

In [9]:
for column in total.columns:
    if column != "country":
        total[column] = pd.to_numeric(total[column])
        if column != 'year':
            total[column+'interPolation'] = total.groupby('country').apply(lambda group: group.interpolate())[column]

In [11]:
total["noGDPDummy"] = total["GDP-PerCapita"].isna()
total["noTaxDummy"] = total["taxRevenueOfGDP"].isna()

In [12]:
total.to_csv("data.csv", index=False)