## Selection of indicators from WorldBankIndicators dataset
*@XavierTorres*

In [1]:
import numpy as np
import pandas as pd

In [2]:
# IMPORTANT NOTE: the indicator "Ever-breastfed rates (%)" was introduced manually.
# It only has information for 2018, taken from the latest UNICEF report:
# https://data.unicef.org/wp-content/uploads/2018/05/180509_Breastfeeding.pdf

# Read the dataset holding every World Bank indicator
# (1430 indicators for 217 countries since 1960).
# The file is ';'-separated and is read from an absolute path on the local machine.
df_WBI = pd.read_csv(
    'C:/Users/torre/Documents/Local Omdena WFP files/WorldBank/Part1/0_MERGED_World_Development_Indicators.csv',
    sep=';',
    low_memory=False,
)

In [3]:
# Read the table of selected indicators.
# The selection comes from a previous analysis that checked data quality
# (missing values) and preferred relative indicators such as 'per capita',
# '%'-based indicators and 'constant US$' instead of 'current US$'.
df_selection = pd.read_csv(
    'C:/Users/torre/Documents/Local Omdena WFP files/WorldBank/Part1/Selected_Indicators.csv',
    sep=';',
)

In [4]:
# Inspect the column dtypes of the full World Bank indicators frame
df_WBI.dtypes

Country Name      object
Country Code      object
Series Name       object
Series Code       object
1960             float64
                  ...   
2017             float64
2018             float64
2019             float64
Income_level      object
Indicator_CAT     object
Length: 66, dtype: object

In [5]:
# Inspect the column dtypes of the indicator-selection frame
df_selection.dtypes

Indicator     object
Selected     float64
dtype: object

In [6]:
# Preview the first three rows of the full World Bank indicators frame
df_WBI.head(3)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,Income_level,Indicator_CAT
0,Afghanistan,AFG,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,24.08,26.17,27.99,30.1,32.44,,,,Low,
1,Albania,ALB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,69.96,71.78,73.98,75.37,77.42,,,,High_Middle,
2,Algeria,DZA,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,92.47,92.83,93.1,92.7,92.62,,,,High_Middle,


In [7]:
# Each indicator has a 1 if it is selected and a 0 if not;
# describe() summarises the numeric 'Selected' flag column.

df_selection.describe()

Unnamed: 0,Selected
count,1430.0
mean,0.064336
std,0.245436
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [8]:
# Keep only the indicators flagged as selected (Selected == 1);
# rows whose flag is 0 are discarded. Then preview the result.
df_selection = df_selection.loc[df_selection['Selected'].eq(1)]
df_selection.head()

Unnamed: 0,Indicator,Selected
0,"Population, total",1.0
1,Rural population (% of total population),1.0
2,Urban population (% of total population),1.0
3,Population density (people per sq. km of land ...,1.0
4,Crop production index (2004-2006 = 100),1.0


In [9]:
# (rows, columns) of the selection frame after keeping only Selected == 1
df_selection.shape

(92, 2)

In [10]:
# Restrict the original dataset to the selected indicators: rows whose
# 'Series Name' is not among the selected 'Indicator' values are left out.
my_list = df_selection['Indicator']
df_WBI_new = df_WBI.loc[df_WBI['Series Name'].isin(my_list)]

In [11]:
# Drop redundant columns such as 'Series Code'.
# errors='ignore' keeps this cell idempotent: the result is assigned back to
# df_WBI_new, so re-running the cell would otherwise raise a KeyError because
# the column has already been removed.
df_WBI_new = df_WBI_new.drop(columns=['Series Code'], errors='ignore')

In [12]:
# (rows, columns) of the filtered frame after dropping 'Series Code'
df_WBI_new.shape

(19964, 65)

In [13]:
# Preview the first three rows of the filtered frame
df_WBI_new.head(3)

Unnamed: 0,Country Name,Country Code,Series Name,1960,1961,1962,1963,1964,1965,1966,...,2012,2013,2014,2015,2016,2017,2018,2019,Income_level,Indicator_CAT
217,Afghanistan,AFG,Access to electricity (% of population),,,,,,,,...,69.1,70.153481,89.5,71.5,97.7,97.7,,,Low,Infrastructure
218,Albania,ALB,Access to electricity (% of population),,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,,,High_Middle,Infrastructure
219,Algeria,DZA,Access to electricity (% of population),,,,,,,,...,98.76466,99.580971,99.877052,99.943069,99.992317,100.0,,,High_Middle,Infrastructure


In [14]:
# Sanity check: count the missing values in 'Indicator_CAT'
# (the expectation is 0 NaN, i.e. every remaining row has a category).
df_WBI_new['Indicator_CAT'].isnull().sum()

0

In [17]:
# Write the final dataset to CSV (';'-separated, index not written) so it can
# be merged with other datasets, e.g. the one from cyclones.
df_WBI_new.to_csv(
    r'C:/Users/torre/Documents/Local Omdena WFP files/WorldBank/Part1/OUTPUT_WBI_Selection_Country_Year.csv',
    sep=';',
    index=False,
)