## Selection of indicators from WorldBankIndicators dataset
*@XavierTorres*

In [1]:
import numpy as np
import pandas as pd

In [2]:
# IMPORTANT NOTE: the indicator "Ever-breastfed rates (%)" was added manually, with data only for 2018, taken from the
# latest UNICEF report => https://data.unicef.org/wp-content/uploads/2018/05/180509_Breastfeeding.pdf

# Load the dataset with all the indicators from World Bank Data (1430 indicators for 221 countries since 1960).
# The file is ';'-separated; low_memory=False parses it in a single pass so column dtypes are inferred consistently.
# NOTE(review): the path below is absolute and machine-specific -- adjust it to your local copy of the data.

df_WBI = pd.read_csv(
    'C:/Users/torre/Documents/Local Omdena WFP files/WorldBank/Part1/0_MERGED_World_Development_Indicators.csv',
    sep=';',
    low_memory=False,
)

In [3]:
# Load the table of selected indicators.
# The selection was made in a previous analysis that checked data quality (missing values) and favoured relative
# indicators such as 'per capita', '% based indicators', 'constant US$' instead of current US$, etc.
# NOTE(review): the path below is absolute and machine-specific -- adjust it to your local copy of the data.

df_selection = pd.read_csv(
    'C:/Users/torre/Documents/Local Omdena WFP files/WorldBank/Part1/Selected_Indicators.csv',
    sep=';',
)

In [4]:
# Peek at a single cell (first row, fourth column) to confirm the file was parsed as expected
df_WBI.iat[0, 3]

'EG.CFT.ACCS.ZS'

In [5]:
# Size of the selection table as (rows, columns), before any filtering
df_selection.shape

(1430, 2)

In [6]:
# Preview the first three rows to inspect the column layout of the full dataset
df_WBI.head(3)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1949,1950,1951,1952,1953,1954,...,2012,2013,2014,2015,2016,2017,2018,2019,Income_level,Indicator_CAT
0,Aruba,ABW,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,,,,,,,,,High,
1,Aruba,ABW,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,,,High,Infrastructure
2,Aruba,ABW,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,,,High,Infrastructure


In [7]:
# Each indicator is flagged with 1 if it is selected and 0 if not,
# so the mean of 'Selected' in the summary below is the fraction of indicators that are selected

df_selection.describe()

Unnamed: 0,Selected
count,1430.0
mean,0.01958
std,0.138602
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [8]:
# Keep only the indicators flagged as selected (Selected == 1) and preview the result

df_selection = df_selection.loc[df_selection['Selected'].eq(1)]
df_selection.head()

Unnamed: 0,Indicator,Selected
0,Adjusted savings: education expenditure (% of ...,1
1,"Air transport, freight (million ton-km)",1
2,Arable land (hectares per person),1
3,Cereal yield (kg per hectare),1
4,Food production index (2004-2006 = 100),1


In [9]:
# Size of the selection table after filtering: the row count is the number of selected indicators
df_selection.shape

(28, 2)

In [10]:
# Restrict the original dataset to the rows whose 'Series Name' is one of the selected indicators;
# rows for every non-selected indicator are left out of df_WBI_new

my_list = df_selection['Indicator']
df_WBI_new = df_WBI.loc[df_WBI['Series Name'].isin(my_list)]

In [11]:
# Drop redundant columns such as 'Series Code'.
# errors='ignore' keeps this cell safe to re-run: df_WBI_new is reassigned from itself, so on a second
# execution the column is already gone and a plain drop would raise KeyError.

df_WBI_new = df_WBI_new.drop(columns=['Series Code'], errors='ignore')

In [12]:
# Size of the filtered dataset as (rows, columns) after dropping 'Series Code'
df_WBI_new.shape

(6188, 76)

In [13]:
# Preview the first three rows of the filtered dataset
df_WBI_new.head(3)

Unnamed: 0,Country Name,Country Code,Series Name,1949,1950,1951,1952,1953,1954,1955,...,2012,2013,2014,2015,2016,2017,2018,2019,Income_level,Indicator_CAT
34,Aruba,ABW,Adjusted savings: education expenditure (% of ...,,,,,,,,...,6.863412,7.271645,6.411805,6.793939,6.793939,6.793939,,,High,
75,Aruba,ABW,"Air transport, freight (million ton-km)",,,,,,,,...,,,,,,,,,High,
90,Aruba,ABW,Arable land (hectares per person),,,,,,,,...,0.019501,0.019388,0.019273,0.019168,0.019071,,,,High,Agriculture


In [14]:
# Count the rows where 'Indicator_CAT' is missing (NaN).
# A non-zero result means some of the selected rows have no indicator category assigned.
df_WBI_new['Indicator_CAT'].isna().sum()

2210

In [15]:
# Write the final dataset to a ';'-separated CSV (without the index column) so it can be merged later
# with the other dataset from cyclones.
# NOTE(review): the path is relative to the working directory; the 'Data_input' folder presumably must already exist.

df_WBI_new.to_csv(
    r'Data_input/OUTPUT_WBI_Selection_Country_Year.csv',
    sep=';',
    index=False,
)