> In this notebook, we import and format the World Bank data for modeling. Features from this data are used to model both "school resources" (via education expenditure indicators) and "learning outcomes" (via literacy rates).

In [56]:
import pandas as pd
import numpy as np

In [57]:
# Load the raw World Bank indicator export (one row per country/indicator,
# one column per year). The path is relative.
world_bank = pd.read_csv(
    '../data/world_bank/API_4_DS2_en_csv_v2_103930.csv'
)

In [58]:
# Countries to keep for modeling, one per line. Several of these spellings
# differ from the ones in the World Bank table and are reconciled further down.
countries = [
    'Albania',
    'Algeria',
    'Argentina',
    'Australia',
    'Austria',
    'Belgium',
    'Brazil',
    'Bulgaria',
    'Canada',
    'Chile',
    'China',
    'Colombia',
    'Costa Rica',
    'Croatia',
    'Cyprus',
    'Czechia',
    'Denmark',
    'Dominican Republic',
    'Estonia',
    'Finland',
    'France',
    'Georgia',
    'Germany',
    'Greece',
    'Hungary',
    'Iceland',
    'Indonesia',
    'Ireland',
    'Israel',
    'Italy',
    'Japan',
    'Jordan',
    'Kazakhstan',
    'Latvia',
    'Lebanon',
    'Lithuania',
    'Luxembourg',
    'Malaysia',
    'Malta',
    'Mexico',
    'Montenegro',
    'Netherlands',
    'New Zealand',
    'North Macedonia',
    'Norway',
    'Peru',
    'Poland',
    'Portugal',
    'Qatar',
    'Republic of Korea',
    'Republic of Moldova',
    'Romania',
    'Russian Federation',
    'Singapore',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
    'Switzerland',
    'Thailand',
    'Trinidad and Tobago',
    'Tunisia',
    'Turkey',
    'United Arab Emirates',
    'United Kingdom of Great Britain and Northern Ireland',
    'United States of America',
    'Uruguay',
    'Viet Nam',
]

In [59]:
# List the names from `countries` that do not appear in the World Bank table.
# The set of table names is built once, so each membership test is a hash
# lookup instead of a fresh `.unique()` scan per country.
world_bank_names = set(world_bank['Country Name'])
[country for country in countries if country not in world_bank_names]

['Czechia',
 'Republic of Korea',
 'Republic of Moldova',
 'Slovakia',
 'United Kingdom of Great Britain and Northern Ireland',
 'United States of America',
 'Viet Nam']

In [60]:
# Maps the World Bank spelling (key) to the spelling used in `countries` (value).
rename_dict = {
    'Czech Republic': 'Czechia',
    'Korea, Rep.': 'Republic of Korea',
    'Moldova': 'Republic of Moldova',
    'Slovak Republic': 'Slovakia',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Vietnam': 'Viet Nam',
}

In [61]:
# Harmonise the World Bank spellings with the names used in `countries`.
# A single vectorised replace rewrites values that exactly match a key of
# `rename_dict` and leaves every other name untouched; it avoids a Python-level
# pass over every row with a full-table mask per match.
world_bank['Country Name'] = world_bank['Country Name'].replace(rename_dict)

In [62]:
# Re-run the coverage check after renaming: an empty list means every entry of
# `countries` now has a matching 'Country Name' in the World Bank table.
# The set of table names is built once rather than once per country.
world_bank_names = set(world_bank['Country Name'])
[country for country in countries if country not in world_bank_names]

[]

In [63]:
# Every third year from 2000 up to and including 2015 (the stop value 2016 is exclusive).
years = list(range(2000, 2016, 3))
years

[2000, 2003, 2006, 2009, 2012, 2015]

In [64]:
# Display the column labels. The year columns are labelled with strings
# ('1960' ... '2018'), which is why later lookups index with str(year).
world_bank.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', 'Unnamed: 63'],
      dtype='object')

In [65]:
# Collect every indicator whose name mentions expenditure. Matching on
# 'xpenditure' catches both 'Expenditure ...' and '... expenditure ...'.
expenditure_features = list(
    filter(lambda name: 'xpenditure' in name,
           world_bank['Indicator Name'].unique())
)
expenditure_features

['Government expenditure on education, total (% of GDP)',
 'Government expenditure on education, total (% of government expenditure)',
 'Expenditure on tertiary education (% of government expenditure on education)',
 'Government expenditure per student, tertiary (% of GDP per capita)',
 'Expenditure on secondary education (% of government expenditure on education)',
 'Government expenditure per student, secondary (% of GDP per capita)',
 'Expenditure on primary education (% of government expenditure on education)',
 'Government expenditure per student, primary (% of GDP per capita)',
 'Current education expenditure, total (% of total expenditure in public institutions)',
 'Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)',
 'Current education expenditure, secondary (% of total expenditure in secondary public institutions)',
 'Current education expenditure, primary (% of total expenditure in primary public institutions)']

In [66]:
# All indicators that mention literacy ('iteracy' matches either capitalisation).
literacy_features = [feature for feature in world_bank['Indicator Name'].unique()
                     if 'iteracy' in feature]

# Keep only the "total" rates, selected by name rather than by list position
# so the choice does not depend on the row order of the input file.
literacy_totals = [feature for feature in literacy_features if ' total ' in feature]

# Sanity check: exactly the adult-total and youth-total rates are expected.
assert len(literacy_totals) == 2, f"unexpected literacy totals: {literacy_totals}"
literacy_totals

['Literacy rate, adult total (% of people ages 15 and above)',
 'Literacy rate, youth total (% of people ages 15-24)']

In [67]:
# Final list of modeling indicators: the expenditure indicators first,
# followed by the literacy totals.
features = [*expenditure_features, *literacy_totals]
features

['Government expenditure on education, total (% of GDP)',
 'Government expenditure on education, total (% of government expenditure)',
 'Expenditure on tertiary education (% of government expenditure on education)',
 'Government expenditure per student, tertiary (% of GDP per capita)',
 'Expenditure on secondary education (% of government expenditure on education)',
 'Government expenditure per student, secondary (% of GDP per capita)',
 'Expenditure on primary education (% of government expenditure on education)',
 'Government expenditure per student, primary (% of GDP per capita)',
 'Current education expenditure, total (% of total expenditure in public institutions)',
 'Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)',
 'Current education expenditure, secondary (% of total expenditure in secondary public institutions)',
 'Current education expenditure, primary (% of total expenditure in primary public institutions)',
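 'Literacy rate, adult total (% of people ages 15 and above)',
 'Literacy rate, youth total (% of people ages 15-24)']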

In [68]:
# Build the modeling table: one row per (year, country), labelled
# "<year>_<country>", and one column per entry of `features`.
#
# The long-format World Bank table is indexed once up front, so each value is
# a dictionary lookup rather than two full-table boolean masks, and the result
# is created with a single DataFrame constructor call instead of being grown
# with pd.concat inside the loop.

# Year columns are labelled with strings; de-duplicate while keeping order.
year_columns = list(dict.fromkeys(str(year) for year in years))

# Keep only the rows and year columns that are needed. If a (country,
# indicator) pair occurs more than once, the first occurrence is used.
selected = (
    world_bank.loc[
        world_bank['Country Name'].isin(countries)
        & world_bank['Indicator Name'].isin(features),
        ['Country Name', 'Indicator Name'] + year_columns,
    ]
    .drop_duplicates(subset=['Country Name', 'Indicator Name'])
    .set_index(['Country Name', 'Indicator Name'])
)

# {(country, indicator): {year_column: value}}
values_by_pair = selected.to_dict(orient='index')

row_labels = []
rows = []
for year in years:
    for country in countries:
        row = []
        for feature in features:
            pair = (country, feature)
            if pair not in values_by_pair:
                # Fail loudly and say which combination is absent.
                raise KeyError(
                    f"no World Bank row for country {country!r} "
                    f"and indicator {feature!r}"
                )
            # Missing observations stay NaN, exactly as stored in the table.
            row.append(values_by_pair[pair][str(year)])
        row_labels.append(f"{year}_{country}")
        rows.append(row)

world_bank_df = pd.DataFrame(rows, index=row_labels, columns=features)

In [69]:
# Persist the modeling table; the "<year>_<country>" index is written as the
# first column under the header 'Year_Country'.
world_bank_df.to_csv(
    '../data/world_bank/world_bank.csv',
    index_label='Year_Country',
)