# Index - Health


### 1) Importing the required packages

In [21]:
%run ./functions.ipynb

import pandas as pd
import numpy as np 
import pandas_datareader
from pandas_datareader import wb
from fancyimpute import KNN, SoftImpute, IterativeImputer, BiScaler, NuclearNormMinimization, IterativeSVD  
from sklearn import preprocessing
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt



ModuleNotFoundError: No module named 'pmdarima'

### 2) Download the data

In [17]:
# Indicator area to process. The same string is reused as the prefix of the
# exported CSV file name at the end of the notebook.
area = "Health"

# load_the_database is not defined in this notebook; presumably it comes from
# `%run ./functions.ipynb` — verify there what it downloads for a given area.
worldbank_data = load_the_database(area=area)

### 3) Area-specific data transformation

In [5]:
# Start from a fresh, flat copy of the downloaded data: reset_index() returns
# a new frame, so re-running this cell never inverts a column twice.
worldbank_data_idx = worldbank_data.reset_index()

# Indicators that are replaced by their reciprocal (presumably so that a
# larger value means a better outcome, in line with the other indicators).
# NOTE(review): a raw value of 0 turns into inf under 1/x in pandas — confirm
#   that the imputation and scaling steps cope with infinite values.
# NOTE(review): the two immunization labels read "lacking immunization";
#   confirm the downloaded series measure the uncovered share, otherwise the
#   inversion flips their meaning.
INVERTED_INDICATORS = {
    "SH.DTH.NCOM.ZS": "Cause of death, by non-communicable diseases (% of total)",
    "SH.STA.STNT.ZS": "Child malnutrition, stunting (moderate or severe) (% under age 5)",
    "SH.DYN.AIDS.ZS": "HIV prevalence, adult (% ages 15-49)",
    "SP.DYN.IMRT.IN": "Mortality rate, infant (per 1,000 live births)",
    "SH.IMM.IDPT": "Infants lacking immunization, DPT (% of one-year-olds)",
    "SH.IMM.MEAS": "Infants lacking immunization, measles (% of one-year-olds)",
    "SH.MLR.INCD.P3": "Malaria incidence (per 1,000 people at risk)",
    "SH.TBS.INCD": "Tuberculosis incidence (per 100,000 people)",
    "SH.DYN.MORT": "Mortality rate, under-five (per 1,000 live births)",
}

for indicator_code in INVERTED_INDICATORS:
    worldbank_data_idx[indicator_code] = 1 / worldbank_data_idx[indicator_code]

# Mean adult mortality rate: arithmetic mean of the SP.DYN.AMRT.FE and
# SP.DYN.AMRT.MA series (presumably female and male). Plain arithmetic is
# used, so a row with either value missing stays NaN.
# NOTE(review): unlike the mortality rates above, this one is not inverted —
#   confirm that is intended.
worldbank_data_idx["MeanMortality"] = (
    worldbank_data_idx["SP.DYN.AMRT.FE"] + worldbank_data_idx["SP.DYN.AMRT.MA"]
) / 2

# The two source series are now represented by MeanMortality; drop them.
worldbank_data_idx = worldbank_data_idx.drop(
    columns=["SP.DYN.AMRT.FE", "SP.DYN.AMRT.MA"]
)

### 4) Imputing the missing data


In [6]:
# impute_missing is not defined in this notebook (presumably it is loaded by
# `%run ./functions.ipynb`). It is called without arguments, so it presumably
# reads its input from notebook globals — verify which variable it uses.
worldbank_data_filled = impute_missing()


[IterativeImputer] Early stopping criterion not reached.



### 5) Scaling and forecasting the data

In [7]:
# scale_and_forecast is not defined in this notebook (presumably it is loaded
# by `%run ./functions.ipynb`). It is called without arguments, so it
# presumably reads its input from notebook globals — verify which variable it
# uses. The cell below takes a row-wise mean of the result and then calls
# reset_index() on it.
worldbank_data_scaled = scale_and_forecast()


invalid value encountered in greater_equal



### 6) Build the index (mean, median, pca...)

In [8]:
# Composite index: row-wise mean over every column of the scaled frame.
worldbank_data_scaled["indicator"] = worldbank_data_scaled.mean(axis="columns")

# Alternative 1: use the row-wise median instead.
# worldbank_data_scaled["indicator"] = worldbank_data_scaled.median(axis=1)

# Alternative 2: use the first PCA component (no big differences).
# from sklearn.decomposition import PCA
# pca = PCA(n_components=1)
# worldbank_data_scaled["indicator"] = pca.fit_transform(worldbank_data_scaled)

# Turn the index back into ordinary columns (this mutates the frame in place),
# then make sure `year` is an integer so it can be compared with 2019 below.
worldbank_data_scaled.reset_index(inplace=True)
worldbank_data_scaled["year"] = worldbank_data_scaled["year"].astype(int)

# `merged_data` is a second name for the same frame, not a copy.
merged_data = worldbank_data_scaled

# Cell output: the ten rows with the highest indicator value in 2019.
merged_data.loc[merged_data["year"] == 2019].nlargest(n=10, columns="indicator")

Unnamed: 0,country,year,SE.XPD.TOTL.GD.ZS,SE.PRE.ENRR,UIS.GER.12,SE.SEC.ENRR,SE.TER.ENRR,SE.ADT.LITR.ZS,UIS.EA.MEAN.1T6.AG25T99,SE.SEC.CUAT.UP.ZS,SE.PRM.DROP.ZS,SE.PRM.TCAQ.ZS,SE.PRM.ENRL.TC.ZS,UIS.SR.2.GPV.GLAST.CP.T,SE.SCH.LIFE,BAR.SCHL.25UP,LO.PISA,indicator
1530,Sweden,2019,0.874058,0.822977,0.858816,0.96497,0.880773,0.927005,0.95675,0.936702,0.96411,0.943498,0.961972,0.961456,0.967966,0.971632,0.872873,0.924371
450,Denmark,2019,0.941618,0.900849,0.887868,0.964963,0.894902,0.709462,0.962908,0.924904,0.988879,0.963881,0.982689,0.843199,0.968746,0.928957,0.863733,0.91517
1230,Norway,2019,0.909288,0.841299,0.633634,0.957124,0.912043,0.978208,0.958499,0.915902,0.959617,0.926395,0.990332,0.766111,0.97549,0.96897,0.85848,0.903426
520,Estonia,2019,0.770928,0.915038,0.80887,0.923114,0.858217,0.973012,0.980043,0.959265,0.824022,0.976589,0.895873,0.80388,0.921823,0.970284,0.937889,0.901256
540,Finland,2019,0.863203,0.619145,0.793119,0.970604,0.957562,0.758815,0.96297,0.869777,0.985781,0.99297,0.82475,0.957304,0.984175,0.900699,0.967059,0.893862
280,Switzerland,2019,0.660957,0.945838,0.872091,0.836687,0.787849,0.928508,0.978716,0.945678,0.970722,0.974623,0.959291,0.74679,0.912318,0.962256,0.92302,0.89369
1320,Poland,2019,0.661898,0.658203,0.760483,0.922524,0.863795,0.991483,0.940305,0.937747,0.896917,0.978737,0.963983,0.904183,0.914757,0.941768,0.885602,0.881492
270,Canada,2019,0.777986,0.694793,0.779789,0.939249,0.94849,0.900345,0.987092,0.939849,0.61018,0.851194,0.820653,0.998821,0.964002,0.97764,0.962386,0.876831
180,Belarus,2019,0.681521,0.93932,0.909269,0.929168,0.938117,0.989729,0.71268,0.971961,0.902857,0.716688,0.781624,0.910956,0.899872,0.999391,0.793958,0.871807
1520,Slovenia,2019,0.706772,0.811254,0.676247,0.910975,0.903312,0.987627,0.917835,0.91191,0.942118,0.934935,0.828094,0.749399,0.940939,0.978846,0.876835,0.871807


### 7) Plot the indicator for each country

In [9]:
# plot_variable is not defined in this notebook (presumably it is loaded by
# `%run ./functions.ipynb`). It is called without arguments, so it presumably
# reads the data to plot from notebook globals — verify which variable it uses.
plot_variable()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### 8) Export to a CSV file

In [10]:
# Write the final table to "<area>_index.csv" (relative path). The DataFrame
# index is written as the first column, which is the to_csv default.
merged_data.to_csv(f"{area}_index.csv")