# Data Cleaning

In [1648]:
# import modules
import numpy as np
import pandas as pd
import json

# read in country map
with open('./data/country_map.json') as out:
    country_map = json.load(out)

In [1649]:
# define some useful helper functions

# resuable function to print dimensions in a more descriptive manner
def print_shape(df):
    print("Data dimensions: ", df.shape[0], "rows and", df.shape[1], "columns")

In [1650]:
# import main data file
raw_data = pd.read_csv("./data/country_profile_variables_new.csv")
raw_data.head()

Unnamed: 0,country,Region,Surface area (km2),Population in thousands (2017),"Population density (per km2, 2017)","Sex ratio (m per 100 f, 2017)",GDP: Gross domestic product (million current US$),"GDP growth rate (annual %, const. 2005 prices)",GDP per capita (current US$),Economy: Agriculture (% of GVA),...,Mobile-cellular subscriptions (per 100 inhabitants),Individuals using the Internet (per 100 inhabitants),Threatened species (number),Forested area (% of land area),CO2 emission estimates (million tons/tons per capita),"Energy production, primary (Petajoules)",Energy supply per capita (Gigajoules),"Pop. using improved drinking water (urban/rural, %)","Pop. using improved sanitation facilities (urban/rural, %)",Net Official Development Assist. received (% of GNI)
0,Afghanistan,SouthernAsia,652864,35530,54.4,106.3,20270,-2.4,623.2,23.3,...,61.6,8.3,42,2.1,9.8/0.3,63,5,78.2/47.0,45.1/27.0,21.43
1,Albania,SouthernEurope,28748,2930,106.9,101.9,11541,2.6,3984.2,22.4,...,106.4,63.3,130,28.2,5.7/2.0,84,36,94.9/95.2,95.5/90.2,2.96
2,Algeria,NorthernAfrica,2381741,41318,17.3,102.0,164779,3.8,4154.1,12.2,...,113,38.2,135,0.8,145.4/3.7,5900,55,84.3/81.8,89.8/82.2,0.05
3,American Samoa,Polynesia,199,56,278.2,103.6,-99,-99.0,-99.0,-99.0,...,...,-99.0,92,87.9,-99,-99,-99,100.0/100.0,62.5/62.5,-99.0
4,Andorra,SouthernEurope,468,77,163.8,102.3,2812,0.8,39896.4,0.5,...,88.1,96.9,13,34.0,0.5/6.4,1,119,100.0/100.0,100.0/100.0,-99.0


In [1651]:
# check data shape, columns, column types, etc
# print raw data dimens
print_shape(raw_data)

Data dimensions:  229 rows and 49 columns


# Dropping non UN member states and territories

In [1653]:
raw_data['in_un'] = raw_data.apply(lambda x: True if country_map[x.country][1] else False, axis=1)
raw_data = raw_data[raw_data["in_un"] == True]
raw_data.drop(columns="in_un", inplace=True)
print_shape(raw_data)

Data dimensions:  191 rows and 49 columns


# DATA CLEANING - Handling data types & dirty values

In [1654]:
# first lets check the column data types 
#to figure out which columns are objects and could potentially be converted to numbers
print(raw_data.dtypes)

country                                                        object
Region                                                         object
Surface area (km2)                                             object
Population in thousands (2017)                                  int64
Population density (per km2, 2017)                            float64
Sex ratio (m per 100 f, 2017)                                 float64
GDP: Gross domestic product (million current US$)               int64
GDP growth rate (annual %, const. 2005 prices)                 object
GDP per capita (current US$)                                  float64
Economy: Agriculture (% of GVA)                                object
Economy: Industry (% of GVA)                                  float64
Economy: Services and other activity (% of GVA)               float64
Employment: Agriculture (% of employed)                        object
Employment: Industry (% of employed)                           object
Employment: Services

In [1655]:
# first will first split the columns with format "number/number" 
# female / male columns
# Labour force participation (female/male pop. %)
# Life expectancy at birth (females/males, years)
# Education: Primary gross enrol. ratio (f/m per 100 pop.)
# Education: Secondary gross enrol. ratio (f/m per 100 pop.)
# Education: Tertiary gross enrol. ratio (f/m per 100 pop.)

print("OG RAW DATA: ")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(raw_data)

def format_check(x, fb, delimiter):
    if (isinstance(x, str) and "/" in x):
        return x.split(delimiter)[fb]
    return x
    
def do_splitting(df, colName, colALabel="a", colBLabel="b", delimiter="/"):
        ogCol = df[colName]
        colA = ogCol.apply(lambda x : format_check(x, 0, delimiter))
        colB = ogCol.apply(lambda x : format_check(x, 1, delimiter))
        df[colName + "-" + colALabel] = colA
        df[colName + "-" + colBLabel] = colB

def split_cols(df, columns, colALabel="a", colBLabel="b", delimiter="/"):
    if isinstance(columns, str):
        do_splitting(df, columns, colALabel, colBLabel, delimiter)
    elif isinstance(columns, list):
        for colName in columns:
            do_splitting(df, colName, colALabel, colBLabel, delimiter)
    
    df.drop(columns=columns, inplace=True)
    

# female male cols
female_male_cols = ['Labour force participation (female/male pop. %)',
'Life expectancy at birth (females/males, years)',
'Education: Primary gross enrol. ratio (f/m per 100 pop.)',
'Education: Secondary gross enrol. ratio (f/m per 100 pop.)',
'Education: Tertiary gross enrol. ratio (f/m per 100 pop.)']

split_cols(raw_data, female_male_cols, "female", "male", "/")



# other cols
split_cols(raw_data, "Population age distribution (0-14 / 60+ years, %)", "0-14 years", "60+ years")
split_cols(raw_data, "International migrant stock (000/% of total pop.)", "in thousands", "as % of total pop")
split_cols(raw_data, "CO2 emission estimates (million tons/tons per capita)", "million tons", "tons per capita")
split_cols(raw_data, "Pop. using improved drinking water (urban/rural, %)", "Urban", "Rural")
split_cols(raw_data, "Pop. using improved sanitation facilities (urban/rural, %)", "Urban", "Rural")



# # removing dirty values
raw_data = raw_data.replace(to_replace = "...", value = -99)
raw_data = raw_data.replace(to_replace ="^~", value = 0, regex=True)
raw_data = raw_data.replace(to_replace ="-~0.0", value = 0)

# convert entire data frame to numeric
raw_data = raw_data.convert_dtypes()
raw_data.dtypes

temp_df = raw_data.drop(columns=["country", "Region"])
temp_df = temp_df.apply(pd.to_numeric)
temp_df["country"] = raw_data["country"]
temp_df["Region"] = raw_data["Region"]
raw_data = temp_df
raw_data = raw_data.fillna(-99)

print("TRANSFORMED RAW DATA: ")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(raw_data)


raw_data.to_csv('out.csv')


OG RAW DATA: 
                                       country             Region  \
0                                  Afghanistan       SouthernAsia   
1                                      Albania     SouthernEurope   
2                                      Algeria     NorthernAfrica   
4                                      Andorra     SouthernEurope   
5                                       Angola       MiddleAfrica   
7                          Antigua and Barbuda          Caribbean   
8                                    Argentina       SouthAmerica   
9                                      Armenia        WesternAsia   
11                                   Australia            Oceania   
12                                     Austria      WesternEurope   
13                                  Azerbaijan        WesternAsia   
14                                     Bahamas          Caribbean   
15                                     Bahrain        WesternAsia   
16                  

# Filling missing data from external sources

In [1656]:
# locate and count -99 values
count = 0
for (col, data) in raw_data.iteritems():
    if col in ["country", "Region"]:
        continue
    df = raw_data.loc[raw_data[col] == -99]
    if not df.empty:
        count += df.shape[0]
        print(df[["country", col]])
        print(df.shape)
        print("\n")
print("Total count: ", count)

    country  Surface area (km2)
193   Sudan                 -99
(1, 59)


    country  Economy: Agriculture (% of GVA)
133  Monaco                            -99.0
(1, 59)


                              country  Employment: Agriculture (% of employed)
4                             Andorra                                    -99.0
7                 Antigua and Barbuda                                    -99.0
59                           Dominica                                    -99.0
83                            Grenada                                    -99.0
108                          Kiribati                                    -99.0
117                     Liechtenstein                                    -99.0
126                  Marshall Islands                                    -99.0
132  Micronesia (Federated States of)                                    -99.0
133                            Monaco                                    -99.0
141                             Naur

In [1657]:
# patching government expenditure on healthcare missing data from an external source
# external source: https://data.worldbank.org/indicator/SE.XPD.TOTL.GD.ZS?end=2017&start=2017

# load education expenditure data from secondary source
edu_exp = pd.read_csv('./data/government_expenditure_on_eduction_of_gdp.csv')

# drop data for all years except 2017
edu_exp = edu_exp[["Country Name", "2017"]]

# locate countries in raw_data with -99 for education expenditure
countries_with_missing_data = raw_data[raw_data["Education: Government expenditure (% of GDP)"] == -99]["country"]

def find_and_replace(countries_with_missing_data, new_data, valueCol, col):
    num_rows = countries_with_missing_data.shape[0]
    count = 0
    for index1, country in countries_with_missing_data.iteritems():
        new_data_country = country_map[country][0]
        row = new_data[new_data["Country Name"] == new_data_country]
        arr = row[valueCol].values
        print("len arr: ", len(arr), arr)
        if len(arr) == 0:
            continue
        elif np.isnan(arr).any():
            continue
        else:
            print("replacing with: ", arr[0])
            raw_data.at[index1, col] = arr[0]
            countries_with_missing_data.pop(index1)
            count += 1
    print(count, "/ ", num_rows, " countries with missing data for ", valueCol, " found and corrected")
    print("Reamaing countries: ")
    print(countries_with_missing_data)


find_and_replace(countries_with_missing_data, edu_exp, "2017", "Education: Government expenditure (% of GDP)")

len arr:  1 [6.50538015]
replacing with:  6.50538015365601
len arr:  1 [2.46688008]
replacing with:  2.46688008308411
len arr:  1 [2.44847989]
replacing with:  2.44847989082336
len arr:  1 [2.41849995]
replacing with:  2.41849994659424
len arr:  1 [nan]
len arr:  1 [7.0999999]
replacing with:  7.09999990463257
len arr:  1 [3.66744995]
replacing with:  3.66744995117188
len arr:  1 [4.4429698]
replacing with:  4.44296979904175
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [3.74855995]
replacing with:  3.74855995178223
len arr:  1 [4.76674986]
replacing with:  4.7667498588562
len arr:  1 [3.92026997]
replacing with:  3.92026996612549
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [3.47220993]
replacing with:  3.47220993041992
len arr:  1 [3.17379999]
replacing with:  3.17379999160767
len arr:  1 [1.59118998]
replacing with:  1.5911899805069
len arr:  1 [nan]
len arr:  1 [3.22853994]
replacing with:  3.22853994369507
len arr:  1 [10.86316013]
replacing with:  10.863160

In [1659]:
# columns that can be partially filled with external data

# Agriculural production index
# SHOUT: maybe set the remaining four countries to median
'''
Found for two countries.
Data: https://data.worldbank.org/indicator/AG.PRD.FOOD.XD
Estimated using: Food production index (2014-2016 = 100) year: 2018
South Sudan 98.52
Sudan 117.34
'''
raw_data.loc[raw_data["country"] == "South Sudan","Agricultural production index (2004-2006=100)"] = 99 # 98.52 round off to 99 since dt is int
raw_data.loc[raw_data["country"] == "Sudan","Agricultural production index (2004-2006=100)"] = 117 # rounded off from 117.34

# food production index
'''
Found for two countries.
Data: https://data.worldbank.org/indicator/AG.PRD.FOOD.XD
Estimated using: Food production index (2014-2016 = 100) year: 2018
South Sudan 98.52
Sudan 117.34
'''
raw_data.loc[raw_data["country"] == "South Sudan","Food production index (2004-2006=100)"] = 99
raw_data.loc[raw_data["country"] == "Sudan","Food production index (2004-2006=100)"] = 117

# Fertility rate, total (live births per woman)
'''
Fertility rate, total (live births per woman)
Data: https://data.worldbank.org/indicator/SP.DYN.TFRT.IN
Estimated using: Fertility rate, total (births per woman)
Dominica 1.9, year 2003
San Marino 1.26, year 2012
'''
raw_data.loc[raw_data["country"] == "Dominica","Fertility rate, total (live births per woman)"] = 1.9
raw_data.loc[raw_data["country"] == "San Marino","Fertility rate, total (live births per woman)"] = 1.26

# Balance of payments, current account (million US$)
'''
Estimated from: https://data.worldbank.org/indicator/BN.CAB.XOKA.CD?end=2020&locations=&most_recent_year_desc=true&start=2005
                  country  Balance of payments, current account (million US$)
47                Comoros                                                -22971469.912 (2017)
48                  Congo                                                -3.594 billion (2016)
67               Ethiopia                                                -5.929 billion (2017)
75                  Gabon                                                140995975.045 (2015)
76                 Gambia                                                -95242046.929 (2017)
198  Syrian Arab Republic                                                -367388473.023 (2010)
206   Trinidad and Tobago                                                1.409 billion (2017)
'''
MILLION = 1000000
BILLION = 1000000000
raw_data.loc[raw_data["country"] == "Comoros","Balance of payments, current account (million US$)"] = -22971469.912/MILLION
raw_data.loc[raw_data["country"] == "Congo","Balance of payments, current account (million US$)"] = (-3.594*BILLION)/MILLION
raw_data.loc[raw_data["country"] == "Ethiopia","Balance of payments, current account (million US$)"] = (-5.929*BILLION)/MILLION
raw_data.loc[raw_data["country"] == "Gabon","Balance of payments, current account (million US$)"] = 140995975.045 /MILLION
raw_data.loc[raw_data["country"] == "Gambia","Balance of payments, current account (million US$)"] = -95242046.929 /MILLION
raw_data.loc[raw_data["country"] == "Syrian Arab Republic","Balance of payments, current account (million US$)"] = -367388473.023 /MILLION
raw_data.loc[raw_data["country"] == "Trinidad and Tobago","Balance of payments, current account (million US$)"] = ( 1.409*BILLION)/MILLION

# infant mortality rate
'''
    Estimated from: https://data.worldbank.org/indicator/SP.DYN.IMRT.IN?locations=TV&most_recent_year_desc=true
    country  Infant mortality rate (per 1000 live births
211  Tuvalu                                        20.6 (2017)
'''
raw_data.loc[raw_data["country"] == "Tuvalu","Infant mortality rate (per 1000 live births"] = 20.6


# health expenditure
'''
Estimated from: https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS?most_recent_year_desc=true
                                   country      Health: Total expenditure (% of GDP)  
55   Democratic People's Republic of Korea      no data found
117                          Liechtenstein      no data found
140                                Namibia      8.5 (2019)
187                                Somalia      no data found
'''
raw_data.loc[raw_data["country"] == "Namibia","Health: Total expenditure (% of GDP)"] = 8.5

In [1660]:
# physicians
'''
                                country  Health: Physicians (per 1000 pop.)
2                               Algeria                               -99.0
5                                Angola                               -99.0
14                              Bahamas                               -99.0
17                             Barbados                               -99.0
20                               Belize                               -99.0
..                                  ...                                 ...
206                 Trinidad and Tobago                               -99.0
211                              Tuvalu                               -99.0
212                              Uganda                               -99.0
219                             Uruguay                               -99.0
222  Venezuela (Bolivarian Republic of)                               -99.0
(64 rows)
'''

# patching physicians missing data from an external source
# external source: https://data.worldbank.org/indicator/SH.MED.PHYS.ZS?end=2017&most_recent_year_desc=true&start=2017

# load physician data from secondary source
physicians = pd.read_csv('./data/physicians_per_1000_people.csv')

# drop data for all years except 2017
physicians = physicians[["Country Name", "2017"]]

# locate countries in raw_data with -99 for education expenditure
countries_with_missing_data = raw_data[raw_data["Health: Physicians (per 1000 pop.)"] == -99]["country"]
print("before: ", raw_data[raw_data["Health: Physicians (per 1000 pop.)"] == -99].shape)
find_and_replace(countries_with_missing_data, physicians, "2017", "Health: Physicians (per 1000 pop.)")
print("now: ", raw_data[raw_data["Health: Physicians (per 1000 pop.)"] == -99].shape)

before:  (67, 59)
len arr:  1 [nan]
len arr:  1 [0.2146]
replacing with:  0.2146
len arr:  1 [2.7673]
replacing with:  2.7673
len arr:  1 [1.9387]
replacing with:  1.9387
len arr:  1 [2.4878]
replacing with:  2.4878
len arr:  1 [1.1229]
replacing with:  1.1229
len arr:  1 [0.1001]
replacing with:  0.1001
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [4.8822]
replacing with:  4.8822
len arr:  1 [3.7157]
replacing with:  3.7157
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [1.1189]
replacing with:  1.1189
len arr:  1 [nan]
len arr:  1 [0.4017]
replacing with:  0.4017
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [0.6819]
replacing with:  0.6819
len arr:  1 [0.1138]
replacing with:  0.1138
len arr:  1 [0.092]
replacing with:  0.092
len arr:  1 [1.4067]
replacing with:  1.4067
len arr:  1 [0.28]
replacing with:  0.28
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [nan]
len arr:  1 [0.3089]
replacing with:  0.3089
len arr:  1 [nan]
len arr:  1 [nan

In [1661]:
# c02 emissions
'''
                 country        CO2 emission estimates (million tons/tons per capita)
7    Antigua and Barbuda        
133               Monaco        
154                Palau        
175           San Marino        
211               Tuvalu        
(5, 59)
'''

# SHOUT: TBC



'\n                 country        CO2 emission estimates (million tons/tons per capita)\n7    Antigua and Barbuda        \n133               Monaco        \n154                Palau        \n175           San Marino        \n211               Tuvalu        \n(5, 59)\n'

In [1662]:
# labour force participation females
'''
estimated from: https://data.worldbank.org/indicator/SL.TLF.CACT.FE.NE.ZS?most_recent_year_desc=true
estimated with indicator: Labor force participation rate, female (% of female population ages 15+) (national estimate)
                   country      Labour force participation (female/male pop. %)-female (year)
7      Antigua and Barbuda      66.24 (2001)
59                Dominica      45.06 (2001)
108               Kiribati      28.69 (2019)
133                 Monaco      39.80 (2016)
154                  Palau      55.56 (2014)
170  Saint Kitts and Nevis      64.42 (2001)
175             San Marino      87.51 (2019)
189            South Sudan      71.58 (2008)
211                 Tuvalu      38.34 (2016)
(9, 59)
'''
new_data = {
    'Antigua and Barbuda':       66.24,
               'Dominica':       45.06,
               'Kiribati':       28.69,
                 'Monaco':       39.80,
                  'Palau':       55.56,
  'Saint Kitts and Nevis':       64.42,
             'San Marino':       87.51,
            'South Sudan':       71.58,
                 'Tuvalu':       38.34
}

for key in new_data:
    raw_data.loc[raw_data["country"] == key,"Labour force participation (female/male pop. %)-female"] = new_data[key]


# labour force participation males
'''
estimated from: https://data.worldbank.org/indicator/SL.TLF.CACT.MA.NE.ZS
estimated with indicator: Labor force participation rate, male (% of male population ages 15+) (national estimate)
                              country       Labour force participation (female/male pop. %)-male (year)
4                             Andorra       no data
7                 Antigua and Barbuda       78.23 (2001)
59                           Dominica       70.221 (2001)
83                            Grenada       77.10 (2015)
108                          Kiribati       43.14 (2019)
132  Micronesia (Federated States of)       67.85 (2014)
133                            Monaco       57.19 (2016)
154                             Palau       73.27 (2014)
170             Saint Kitts and Nevis       73.05 (2001)
175                        San Marino       82.00 (2015)
189                       South Sudan       76.71 (2008)
211                            Tuvalu       71.20 (2016)
(12, 59)
'''

new_data = {
                 'Antigua and Barbuda':       78.23,
                           'Dominica':       70.221,
                            'Grenada':       77.10,
                          'Kiribati':       43.14,
  'Micronesia (Federated States of)':       67.85,
                            'Monaco':       57.19,
                             'Palau':       73.27,
             'Saint Kitts and Nevis':       73.05,
                        'San Marino':       82.00,
                       'South Sudan':       76.71,
                            'Tuvalu':       71.20
}

for key in new_data:
    raw_data.loc[raw_data["country"] == key,"Labour force participation (female/male pop. %)-male"] = new_data[key]

In [1663]:
# life expectancy at birth
'''
estimated from: https://data.worldbank.org/indicator/SP.DYN.LE00.FE.IN
estimated with indicator: Life expectancy at birth, female (years)
country  Life expectancy at birth (females/males, years)-female (year)
59   Dominica                                              78.8 (2001)     
211    Tuvalu                                              no data    
(2, 59)
'''
raw_data.loc[raw_data["country"] == 'Dominica',"Life expectancy at birth (females/males, years)-female"] = 78.8

'''
estimated from: https://data.worldbank.org/indicator/SP.DYN.LE00.MA.IN
estimated with indicator: Life expectancy at birth, male (years)
                   country      Life expectancy at birth (females/males, years)-male (year)
4                  Andorra      no data
59                Dominica      74.5 (2002)
117          Liechtenstein      80.70 (2019)
133                 Monaco      no data
170  Saint Kitts and Nevis      68.80 (2002)
211                 Tuvalu      no data
(6, 59)
'''
raw_data.loc[raw_data["country"] == "Dominica","Life expectancy at birth (females/males, years)-male"] = 74.5
raw_data.loc[raw_data["country"] == "Liechtenstein","Life expectancy at birth (females/males, years)-male"] = 80.7
raw_data.loc[raw_data["country"] == "Saint Kitts and Nevis","Life expectancy at birth (females/males, years)-male"] = 68.8

In [1664]:
# education
'''
estimated from: https://data.worldbank.org/indicator/SE.PRM.ENRR.FE
estimated with indicator: School enrollment, primary, female (% gross)
                 country        Education: Primary gross enrol. ratio (f/m per 100 pop.)-female (year)
14               Bahamas        95.54 (2006)
87         Guinea-Bissau        114.46 (2010)
98                  Iraq        99.11 (2007)
103              Jamaica        90.83 (2007)
116                Libya        106.6 (2006)
146            Nicaragua        120.12 (2010)
171          Saint Lucia        102.36 (2020)
187              Somalia        16.63 (2007)
206  Trinidad and Tobago        104.35 (2010)
(9, 59)
'''
new_data = {
               'Bahamas':        95.54,
         'Guinea-Bissau':        114.46,
                  'Iraq':        99.11,
              'Jamaica':        90.83,
                'Libya':        106.6,
            'Nicaragua':        120.12,
          'Saint Lucia':        102.36,
              'Somalia':        16.63,
  'Trinidad and Tobago':        104.35
}

for key in new_data:
    raw_data.loc[raw_data["country"] == key,"Education: Primary gross enrol. ratio (f/m per 100 pop.)-female"] = new_data[key]

'''
estimated from: https://data.worldbank.org/indicator/SE.PRM.ENRR.MA
estimated with indicator: School enrollment, primary, male (% gross)
                    country        Education: Primary gross enrol. ratio (f/m per 100 pop.)-male (year)
4                   Andorra        no data in 21st century
14                  Bahamas        97.57 (2006)
26   Bosnia and Herzegovina        no data
87            Guinea-Bissau        122.93 (2010)
90                    Haiti        no data in 21st century
98                     Iraq        177.76 (2007)
103                 Jamaica        90.42 (2007)
116                   Libya        111.31 (2006)
133                  Monaco        no data
146               Nicaragua        121.13 (2010)
171             Saint Lucia        99.86 (2020)
182               Singapore        100.70 (2019)
187                 Somalia        30.06 (2007)
206     Trinidad and Tobago        108.02 (2010)   
(14, 59)
'''
new_data = {
                  'Bahamas':       97.57,
            'Guinea-Bissau':       122.93,
                     'Iraq':       177.76,
                 'Jamaica':        90.42, 
                   'Libya':        111.31 ,
               'Nicaragua':        121.13,
             'Saint Lucia':        99.86,
               'Singapore':        100.70,
                 'Somalia':        30.06,
     'Trinidad and Tobago':        108.02   
}

for key in new_data:
    raw_data.loc[raw_data["country"] == key,"Education: Primary gross enrol. ratio (f/m per 100 pop.)-male"] = new_data[key]

In [1665]:
# secondary education
# retrivied from: https://data.worldbank.org/indicator/SE.SEC.ENRR.MA
# indicator: School enrollment, secondary, male (% gross)
enr_sec_male = pd.read_csv("./data/school_enrollment_secondary_male_percentage_gross.csv")
enr_sec_male.head()

# drop column years 1999 and earlier
years = [str(x) for x in range(2000, 2021)]
years = ["Country Name"] + years
enr_sec_male = enr_sec_male[years]
# there are 27 missing values for Education: Secondary gross enrol. ratio (f/m per 100 pop.)-male
countries_with_missing_data = raw_data[raw_data["Education: Secondary gross enrol. ratio (f/m per 100 pop.)-male"] == -99]["country"]
count = 0

def find_and_replace_traverse_year(countries_with_missing_data, new_data, col):
    num_rows = countries_with_missing_data.shape[0]
    count = 0
    for index1, country in countries_with_missing_data.iteritems():
        new_data_country = country_map[country][0]
        if not new_data_country:
            continue
        row = new_data[new_data["Country Name"] == new_data_country]
        reversed_years = years[::-1]
        # find the most recent year in enr_sec_male with a value available for secondary gross enrolment
        for year in reversed_years:
            if (year == "Country Name"):
                break
            if len(row[year].values) == 0:
                continue
            elif np.isnan(row[year].values).any():
                continue
            else:
                raw_data.at[index1, col] = row[year].values[0]
                count += 1
                countries_with_missing_data.pop(index1)
                break
        
    print(count, "/ ", num_rows, " countries with missing ", col, " data found and corrected")
    print("Reamaing countries: ")
    print(countries_with_missing_data)

find_and_replace_traverse_year(countries_with_missing_data, enr_sec_male, "Education: Secondary gross enrol. ratio (f/m per 100 pop.)-male")


19 /  26  countries with missing  Education: Secondary gross enrol. ratio (f/m per 100 pop.)-male  data found and corrected
Reamaing countries: 
4                     Andorra
26     Bosnia and Herzegovina
75                      Gabon
90                      Haiti
133                    Monaco
223                  Viet Nam
227                    Zambia
Name: country, dtype: string


In [1666]:
# retrivied from: https://data.worldbank.org/indicator/SE.SEC.ENRR.MA
# indicator: School enrollment, secondary, female (% gross)
enr_sec_female = pd.read_csv("./data/school_enrollment_secondary_female_percentage_gross.csv")
enr_sec_female.head()

# drop column years 1999 and earlier
years = [str(x) for x in range(2000, 2021)]
years = ["Country Name"] + years
enr_sec_female = enr_sec_female[years]

# there are 27 missing values for Education: Secondary gross enrol. ratio (f/m per 100 pop.)-female
countries_with_missing_data = raw_data[raw_data["Education: Secondary gross enrol. ratio (f/m per 100 pop.)-female"] == -99]["country"]
find_and_replace_traverse_year(countries_with_missing_data, enr_sec_female, "Education: Secondary gross enrol. ratio (f/m per 100 pop.)-female")

19 /  26  countries with missing  Education: Secondary gross enrol. ratio (f/m per 100 pop.)-female  data found and corrected
Reamaing countries: 
4                     Andorra
26     Bosnia and Herzegovina
75                      Gabon
90                      Haiti
133                    Monaco
223                  Viet Nam
227                    Zambia
Name: country, dtype: string


In [1667]:
# tertiary education
# retrivied from: https://data.worldbank.org/indicator/SE.TER.ENRR.FE
# indicator: School enrollment, tertiary, female (% gross)
enr_sec_female = pd.read_csv("./data/tertiary_enrollment_female_percentage_gross.csv")
enr_sec_female.head()

# drop column years 1999 and earlier
years = [str(x) for x in range(2000, 2021)]
years = ["Country Name"] + years
enr_sec_female = enr_sec_female[years]
enr_sec_female

# there are 27 missing values for Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-female
countries_with_missing_data = raw_data[raw_data["Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-female"] == -99]["country"]
find_and_replace_traverse_year(countries_with_missing_data, enr_sec_female, "Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-female")

27 /  40  countries with missing  Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-female  data found and corrected
Reamaing countries: 
4                               Andorra
14                              Bahamas
24     Bolivia (Plurinational State of)
59                             Dominica
90                                Haiti
108                            Kiribati
132    Micronesia (Federated States of)
133                              Monaco
141                               Nauru
156                    Papua New Guinea
186                     Solomon Islands
187                             Somalia
211                              Tuvalu
Name: country, dtype: string


In [1668]:
# retrivied from: https://data.worldbank.org/indicator/SE.TER.ENRR.MA
# indicator: School enrollment, tertiary, male (% gross)
enr_ter_male = pd.read_csv("./data/tertiary_enrollment_male_percentage_gross.csv")
enr_ter_male.head()

# drop column years 1999 and earlier
years = [str(x) for x in range(2000, 2021)]
years = ["Country Name"] + years
enr_ter_male = enr_ter_male[years]
enr_ter_male

# there are 27 missing values for Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-male
countries_with_missing_data = raw_data[raw_data["Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-male"] == -99]["country"]
find_and_replace_traverse_year(countries_with_missing_data, enr_ter_male, "Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-male")

27 /  40  countries with missing  Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-male  data found and corrected
Reamaing countries: 
4                               Andorra
14                              Bahamas
24     Bolivia (Plurinational State of)
59                             Dominica
90                                Haiti
108                            Kiribati
132    Micronesia (Federated States of)
133                              Monaco
141                               Nauru
156                    Papua New Guinea
186                     Solomon Islands
187                             Somalia
211                              Tuvalu
Name: country, dtype: string


# Primary Feature selection

1. The data frame has 59 columns. It is likely that many of these columns are strongly correlated.  
2. Given that we are trying to identify specific countries to give foreign aid to, it is important that  
their data is as complete as possible. While estimation is useful for use cases in which we are doing prediction  
if we estimate the value of too many features we might identify countries that need aid wrongly.  
  
Hence for these two reason, we have decided to drop any columns with more than 10 missing values.


In [1669]:
cols_to_drop = []
for col in raw_data.columns:
    temp = raw_data[raw_data[col] == -99]
    if temp.shape[0] > 10:
        print()
        print(col)
        print(temp.shape[0])
        cols_to_drop.append(col)


Employment: Agriculture (% of employed)
15

Employment: Industry (% of employed)
15

Employment: Services (% of employed)
15

Unemployment (% of labour force)
11

Balance of payments, current account (million US$)
16

Refugees and others of concern to UNHCR (in thousands)
17

Health: Physicians (per 1000 pop.)
34

Education: Government expenditure (% of GDP)
24

Net Official Development Assist. received (% of GNI)
60

Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-female
13

Education: Tertiary gross enrol. ratio (f/m per 100 pop.)-male
13

Pop. using improved sanitation facilities (urban/rural, %)-Rural
12


In [1670]:
raw_data.drop(columns=cols_to_drop, inplace=True)
print_shape(raw_data)

Data dimensions:  191 rows and 47 columns


# Filling in remaining missing values with reasonable estimates

We will fill all remaining missing values with the median of the column.

In [1671]:
raw_data.replace(to_replace=-99, value=np.nan, inplace=True)
raw_data.fillna(raw_data.median(), inplace=True)

  raw_data.fillna(raw_data.median(), inplace=True)


In [1672]:
# progress check
# locate and count remaining -99 values
count = 0
for (col, data) in raw_data.iteritems():
    if col in ["country", "Region"]:
        continue
    df = raw_data.loc[raw_data[col] == -99]
    if not df.empty:
        count += df.shape[0]
        print(df[["country", col]])
        print(df.shape)
        print("\n")
pd.isnull(raw_data).sum()


Surface area (km2)                                                       0
Population in thousands (2017)                                           0
Population density (per km2, 2017)                                       0
Sex ratio (m per 100 f, 2017)                                            0
GDP: Gross domestic product (million current US$)                        0
GDP growth rate (annual %, const. 2005 prices)                           0
GDP per capita (current US$)                                             0
Economy: Agriculture (% of GVA)                                          0
Economy: Industry (% of GVA)                                             0
Economy: Services and other activity (% of GVA)                          0
Agricultural production index (2004-2006=100)                            0
Food production index (2004-2006=100)                                    0
International trade: Exports (million US$)                               0
International trade: Impo

In [1673]:
clean_data = raw_data

%store clean_data

Stored 'clean_data' (DataFrame)
