# Index - Health


### 1) Importing the required packages

In [18]:
%run ./functions.ipynb

import pandas as pd
import numpy as np 
import pandas_datareader
from pandas_datareader import wb
from fancyimpute import KNN, SoftImpute, IterativeImputer, BiScaler, NuclearNormMinimization, IterativeSVD  
from sklearn import preprocessing
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

### 2) Download the data

In [19]:
# Indicator area to build the index for. The value is passed to
# `load_the_database` below and is reused by the export cell as the prefix of
# the CSV file name.
# NOTE(review): the transformation cell (section 3) only references education
# series codes (SE.*, LO.PISA.*) — confirm that "Health" is the intended area.
area = "Health" #set the indicator area

# `load_the_database` is not defined in this notebook; presumably it is
# provided by functions.ipynb, which the first cell executes — verify.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# outputs shown further down were presumably produced from an earlier load.
worldbank_data = load_the_database(area = area)

KeyboardInterrupt: 

### 3) Area-specific data transformation

In [3]:
# Area-specific preparation of the downloaded data.
#
# Works on a flattened copy (`reset_index` returns a new frame), so the raw
# `worldbank_data` stays untouched and this cell can be re-run safely.
#
# NOTE(review): every series code used below is an education indicator
# (SE.*, LO.PISA.*) while `area` is set to "Health" — confirm that these
# columns exist in the frame loaded for this notebook.
worldbank_data_idx = worldbank_data.reset_index()

# Turn the pupil-to-teacher ratio into a teacher-to-pupil ratio.
# A reported value of exactly 0 would become inf under 1/x; mark it as missing
# instead so it is treated like every other gap by the imputation in section 4.
worldbank_data_idx["SE.PRM.ENRL.TC.ZS"] = (
    1 / worldbank_data_idx["SE.PRM.ENRL.TC.ZS"].replace(0, np.nan)
)

# Same reciprocal transform for the primary-school dropout series, with the
# same guard against a 0 turning into inf.
worldbank_data_idx["SE.PRM.DROP.ZS"] = (
    1 / worldbank_data_idx["SE.PRM.DROP.ZS"].replace(0, np.nan)
)

# Collapse the three PISA scores (maths, reading, science) into their plain
# average because they are highly correlated. The explicit sum keeps the
# result missing whenever any of the three scores is missing.
worldbank_data_idx["LO.PISA"] = (
    worldbank_data_idx["LO.PISA.MAT"]
    + worldbank_data_idx["LO.PISA.REA"]
    + worldbank_data_idx["LO.PISA.SCI"]
) / 3

# Optional gender education variable (currently disabled):
#worldbank_data_idx["gender_ratio1"] = worldbank_data_idx["SE.SCH.LIFE.FE"]/worldbank_data_idx["SE.SCH.LIFE.MA"]
#worldbank_data_idx["gender_ratio2"] = worldbank_data_idx["SE.SEC.CUAT.UP.FE.ZS"]/worldbank_data_idx["SE.SEC.CUAT.UP.MA.ZS"]
#worldbank_data_idx["gender_ratio"] =worldbank_data_idx[["gender_ratio1","gender_ratio2"]].mean(axis = 1)

# Remove the columns that were only needed as inputs: the individual PISA
# scores and the gender-specific series used by the disabled ratio above.
worldbank_data_idx = worldbank_data_idx.drop(
    columns=[
        "SE.SCH.LIFE.FE",
        "LO.PISA.MAT",
        "LO.PISA.REA",
        "LO.PISA.SCI",
        "SE.SCH.LIFE.MA",
        "SE.SEC.CUAT.UP.MA.ZS",
        "SE.SEC.CUAT.UP.FE.ZS",
    ]
)

### 4) Imputing the missing data


In [4]:
# Fill the gaps in the prepared data. `impute_missing` is called without
# arguments, so it presumably reads its input from the notebook namespace and
# is defined in functions.ipynb — TODO confirm which globals it depends on.
worldbank_data_filled = impute_missing()


[IterativeImputer] Early stopping criterion not reached.



### 5) Scaling and forecasting the data

In [5]:
# Scale the imputed data and extend it with forecasts. `scale_and_forecast`
# is called without arguments, so it presumably picks up the imputed frame
# from the notebook namespace (defined in functions.ipynb — TODO confirm).
worldbank_data_scaled = scale_and_forecast()


invalid value encountered in greater_equal



### 6) Build the index (mean, median, pca...)

In [6]:
# Composite index = unweighted row-wise mean over all scaled columns.
# NOTE(review): this cell mutates `worldbank_data_scaled` in place, so running
# it a second time would fold the existing "indicator" column (and the columns
# created by `reset_index`) into the mean — re-run section 5 first.
row_average = worldbank_data_scaled.mean(axis=1)
worldbank_data_scaled["indicator"] = row_average

# Alternative aggregations that were tried (kept for reference):
#   median:
#     worldbank_data_scaled["indicator"] = worldbank_data_scaled.median(axis=1)
#   weight of the first PCA component (no big differences):
#     from sklearn.decomposition import PCA
#     pca = PCA(n_components=1)
#     worldbank_data_scaled["indicator"] = pca.fit_transform(worldbank_data_scaled)

# Move the index levels back into ordinary columns and store the year as int.
worldbank_data_scaled.reset_index(inplace=True)
worldbank_data_scaled["year"] = worldbank_data_scaled["year"].astype(int)

# `merged_data` is another name for the same frame (no copy is made).
merged_data = worldbank_data_scaled

# Display the ten highest-scoring rows for 2019.
is_2019 = merged_data["year"] == 2019
merged_data[is_2019].nlargest(n=10, columns="indicator")

Unnamed: 0,country,year,SE.XPD.TOTL.GD.ZS,SE.PRE.ENRR,UIS.GER.12,SE.SEC.ENRR,SE.TER.ENRR,SE.ADT.LITR.ZS,UIS.EA.MEAN.1T6.AG25T99,SE.SEC.CUAT.UP.ZS,SE.PRM.DROP.ZS,SE.PRM.TCAQ.ZS,SE.PRM.ENRL.TC.ZS,UIS.SR.2.GPV.GLAST.CP.T,SE.SCH.LIFE,BAR.SCHL.25UP,LO.PISA,indicator
1530,Sweden,2019,0.872802,0.823233,0.857713,0.969046,0.885658,0.887881,0.956352,0.93687,0.964082,0.943255,0.961976,0.961553,0.96779,0.971548,0.872797,0.92217
450,Denmark,2019,0.940936,0.902062,0.885384,0.968971,0.899861,0.667391,0.962963,0.925251,0.9889,0.96381,0.982647,0.845002,0.968301,0.928329,0.864456,0.912951
1230,Norway,2019,0.903814,0.842042,0.629597,0.95997,0.916659,0.96706,0.9583,0.915903,0.960137,0.926291,0.990369,0.767347,0.974975,0.968798,0.858291,0.902637
520,Estonia,2019,0.76934,0.915118,0.808573,0.923967,0.86478,0.979477,0.979978,0.959418,0.81721,0.974965,0.89591,0.805465,0.921696,0.969944,0.937564,0.90156
540,Finland,2019,0.860697,0.620408,0.786559,0.974769,0.961344,0.712515,0.962956,0.869897,0.98571,0.992972,0.824641,0.955818,0.983844,0.900034,0.966939,0.890607
280,Switzerland,2019,0.659601,0.945903,0.871939,0.834994,0.798133,0.877395,0.978526,0.945779,0.968577,0.973405,0.959165,0.746726,0.911753,0.961878,0.922943,0.890448
1320,Poland,2019,0.660154,0.658126,0.759759,0.923131,0.869517,0.993642,0.940277,0.937946,0.896382,0.977119,0.963682,0.903905,0.914311,0.94164,0.885461,0.88167
1520,Slovenia,2019,0.705501,0.811708,0.6719,0.911447,0.90899,0.99095,0.917871,0.911922,0.94289,0.938101,0.828172,0.750057,0.940632,0.978813,0.877656,0.872441
270,Canada,2019,0.776123,0.695103,0.780511,0.941058,0.953614,0.861574,0.987056,0.939951,0.573261,0.850274,0.820856,0.998833,0.964055,0.977564,0.961887,0.872115
180,Belarus,2019,0.679806,0.93935,0.909111,0.929898,0.943226,0.992224,0.708429,0.971279,0.904198,0.716314,0.780905,0.910896,0.899613,0.999395,0.792871,0.871834


### 7) Plot the indicator for each country

In [7]:
# Draw the indicator per country. `plot_variable` is called without arguments,
# so the frame it plots presumably comes from the notebook namespace (defined
# in functions.ipynb — TODO confirm which global it reads).
# NOTE(review): the saved output is a pandas SettingWithCopyWarning, which
# presumably originates inside the helper — worth checking there.
plot_variable()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### 8) Export to a CSV file

In [8]:
# Write the final table to "<area>_index.csv" in the current working directory.
merged_data.to_csv(f"{area}_index.csv")