In [84]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [85]:
# Load the raw indicator table (wide layout: one column per indicator and year,
# e.g. hdi_1990 ... mf_2021).
# Forward slashes keep the path portable across operating systems and avoid the
# invalid "\D" / "\L" escape sequences of the backslash-separated literal.
# NOTE(review): the directory casing ("Datasets") is kept from the original,
# while other cells read from "../datasets" - confirm the on-disk name before
# running on a case-sensitive file system.
df = pd.read_csv("../Datasets/LifeExpectancy_Meanyearsofschooling.csv")
df.head()

Unnamed: 0,iso3,country,hdicode,region,hdi_rank_2021,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,mf_2012,mf_2013,mf_2014,mf_2015,mf_2016,mf_2017,mf_2018,mf_2019,mf_2020,mf_2021
0,AFG,Afghanistan,Low,SA,180.0,0.273,0.279,0.287,0.297,0.292,...,1.86,1.88,1.66,1.62,1.66,1.41,1.32,1.38,1.38,1.38
1,AGO,Angola,Medium,SSA,148.0,,,,,,...,4.09,4.53,3.97,3.59,2.79,2.64,2.28,2.18,2.18,2.18
2,ALB,Albania,High,ECA,67.0,0.647,0.629,0.614,0.617,0.624,...,12.44,11.49,13.14,12.61,14.39,14.46,12.85,12.96,12.96,12.96
3,AND,Andorra,Very High,,40.0,,,,,,...,,,,,,,,,,
4,ARE,United Arab Emirates,Very High,AS,26.0,0.728,0.739,0.742,0.748,0.755,...,49.56,49.68,55.49,59.76,64.95,75.61,65.97,68.95,68.95,68.95


## Skeleton dataframe

In [86]:
# Seed the output table with the ISO3 key only; .copy() detaches it from the
# raw frame so columns added later do not touch df.
columns = ["iso3"]
df_new = df.loc[:, columns].copy()
df_new.shape

(206, 1)

In [87]:
# Lookup table relating country names, ISO3 codes and numeric M49 codes;
# only those three columns are kept.
df_code = pd.read_csv(
    "../datasets/AreaCode_ISO3Code_conversion.csv"
).loc[:, ["Country", "ISO3 Code", "M49 Code"]]
df_code

Unnamed: 0,Country,ISO3 Code,M49 Code
0,Afghanistan,AFG,4.0
1,Africa,X06,2.0
2,Åland Islands,ALA,248.0
3,Albania,ALB,8.0
4,Algeria,DZA,12.0
...,...,...,...
337,Yemen Ar Rp,F246,886.0
338,Yemen Dem,F247,720.0
339,Yugoslav SFR,F248,890.0
340,Zambia,ZMB,894.0


In [88]:
# Area mapping
df_new = df_new.join(df_code.set_index("ISO3 Code"), on="iso3", validate="1:1")
df_new.rename(columns={"iso3": "ISO3_Code", "Country": "Country_Name", "M49 Code": "M49_Code"}, inplace=True)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4.0
1,AGO,Angola,24.0
2,ALB,Albania,8.0
3,AND,Andorra,20.0
4,ARE,United Arab Emirates,784.0


In [89]:
# ISO3 codes with no match in df_code leave the join with a null Country_Name;
# according to the original author these rows are the legend, not countries.
# The selection below is not the last expression of the cell, so it is
# evaluated but not displayed.
df_new[df_new["Country_Name"].isna()]
# Drop every row that still contains a null, then store M49 codes as integers
# (the column is float while nulls are present).
df_new = df_new.dropna().astype({"M49_Code": int})
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4
1,AGO,Angola,24
2,ALB,Albania,8
3,AND,Andorra,20
4,ARE,United Arab Emirates,784


## Life expectancy
Life expectancy at birth (years) — source columns use the prefix `le`.

In [None]:
# Copy the 2016-2020 life-expectancy columns from the raw frame and give them
# descriptive names. The assignment aligns on the row index, so rows already
# removed from df_new are not re-introduced.
names_dict = {f"le_{year}": f"Life_Expectancy_{year}" for year in range(2016, 2021)}
columns = list(names_dict)
df_new[columns] = df[columns]
df_new = df_new.rename(columns=names_dict)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020
0,AFG,Afghanistan,4.0,63.1361,63.016,63.081,63.5645,62.5751
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893
3,AND,Andorra,20.0,82.9671,82.9803,82.9923,83.0039,79.0234
4,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457


In [None]:
# Row-wise mean over the renamed yearly life-expectancy columns
# (missing values are skipped by DataFrame.mean's default skipna=True).
columns = [*names_dict.values()]
df_new["Life_Expectancy_Avg"] = df_new.loc[:, columns].mean(axis="columns")
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg
0,AFG,Afghanistan,4.0,63.1361,63.016,63.081,63.5645,62.5751,63.07454
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262
3,AND,Andorra,20.0,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934
4,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752


## Mean years of schooling
Mean years of schooling (years) — source columns use the prefix `mys`.

In [None]:
# Copy the 2016-2020 mean-years-of-schooling columns from the raw frame and
# give them descriptive names. The assignment aligns on the row index, so rows
# already removed from df_new are not re-introduced.
names_dict = {
    f"mys_{year}": f"Mean_Years_Of_Schooling_{year}" for year in range(2016, 2021)
}
columns = list(names_dict)
df_new[columns] = df[columns]
df_new = df_new.rename(columns=names_dict)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,Mean_Years_Of_Schooling_2017,Mean_Years_Of_Schooling_2018,Mean_Years_Of_Schooling_2019,Mean_Years_Of_Schooling_2020
0,AFG,Afghanistan,4.0,63.1361,63.016,63.081,63.5645,62.5751,63.07454,2.46366,2.561425,2.659189,2.756953,2.854718
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251,5.417391,5.417391,5.417391,5.417391,5.417391
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,10.910692,11.096983,11.286455,11.286455
3,AND,Andorra,20.0,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934,10.5561,10.555773,10.555446,10.55512,10.55512
4,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.84262,12.0554,12.484,12.69403,12.69403


In [None]:
# Row-wise mean over the renamed yearly schooling columns
# (missing values are skipped by DataFrame.mean's default skipna=True).
# NOTE(review): the new column is spelled "..._of_..." (lowercase "of") while
# the yearly columns use "..._Of_..."; left unchanged because the name ends up
# in the exported CSV - confirm with consumers before harmonising.
columns = [*names_dict.values()]
df_new["Mean_Years_of_Schooling_Avg"] = df_new.loc[:, columns].mean(axis="columns")
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,Mean_Years_Of_Schooling_2017,Mean_Years_Of_Schooling_2018,Mean_Years_Of_Schooling_2019,Mean_Years_Of_Schooling_2020,Mean_Years_of_Schooling_Avg
0,AFG,Afghanistan,4.0,63.1361,63.016,63.081,63.5645,62.5751,63.07454,2.46366,2.561425,2.659189,2.756953,2.854718,2.659189
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251,5.417391,5.417391,5.417391,5.417391,5.417391,5.417391
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,10.910692,11.096983,11.286455,11.286455,11.061623
3,AND,Andorra,20.0,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934,10.5561,10.555773,10.555446,10.55512,10.55512,10.555512
4,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.84262,12.0554,12.484,12.69403,12.69403,12.154016


In [None]:
# Persist the processed table.
# The path is handed to pandas directly instead of a file object opened in
# text mode: a text-mode handle without newline="" lets the platform translate
# "\n" (so lineterminator="\n" is not honoured on Windows) and writes in the
# locale's default encoding. Letting to_csv open the file avoids the newline
# translation, and the explicit UTF-8 encoding keeps non-ASCII country names
# (e.g. "Åland Islands") readable by pd.read_csv's default settings.
df_new.to_csv(
    "../datasets/SocioFactors_Processed.csv",
    lineterminator="\n",
    index=False,
    encoding="utf-8",
)

## Stats

In [None]:
# Number of missing values per column of the processed table.
df_new.isna().sum()

ISO3_Code                       0
Country_Name                    0
M49_Code                        0
Life_Expectancy_2016            0
Life_Expectancy_2017            0
Life_Expectancy_2018            0
Life_Expectancy_2019            0
Life_Expectancy_2020            0
Life_Expectancy_Avg             0
Mean_Years_Of_Schooling_2016    5
Mean_Years_Of_Schooling_2017    5
Mean_Years_Of_Schooling_2018    4
Mean_Years_Of_Schooling_2019    4
Mean_Years_Of_Schooling_2020    4
Mean_Years_of_Schooling_Avg     4
dtype: int64

## Output: Prevalence of undernourishment

In [128]:
# Load the trimmed undernourishment table. Note that this rebinds `df`, which
# earlier cells used for the life-expectancy / schooling source table.
df = pd.read_csv(filepath_or_buffer="tmp_trimmed_df.csv")

In [126]:
# One row per distinct M49 area code, renumbered from 0.
columns = ["Area Code (M49)"]
df_out = df.loc[:, columns].drop_duplicates(ignore_index=True)
df_out.shape

(247, 1)

In [127]:
# Area mapping
df_out = df_out.join(df_code.set_index("M49 Code"), on="Area Code (M49)", validate="1:1")
df_out.rename(columns={"ISO3 Code": "ISO3_Code", "Country": "Country_Name", "Area Code (M49)": "M49_Code"}, inplace=True)
df_out

  df_out = df_out.join(df_code.set_index("M49 Code"), on="Area Code (M49)", validate="1:1")


Unnamed: 0,M49_Code,Country_Name,ISO3_Code
0,4,Afghanistan,AFG
1,248,Åland Islands,ALA
2,8,Albania,ALB
3,12,Algeria,DZA
4,16,American Samoa,ASM
...,...,...,...
242,876,Wallis and Futuna Islands,WLF
243,732,Western Sahara,ESH
244,887,Yemen,YEM
245,894,Zambia,ZMB


In [134]:
# The five right-most columns hold the yearly "Prevalence of undernourishment"
# values (the recorded outputs show the first two are the 2016 and 2017 ones).
columns = df.columns[-5:]
# "<2.5" is a censored reading rather than a number; it is replaced by 1.25
# (presumably chosen as the midpoint of the 0-2.5 range) so the columns can be
# converted to float.
df[columns] = df[columns].replace("<2.5", 1.25).astype(float)
# dropna=False keeps rows whose area code is missing as their own group.
grouped = df.groupby(by=["Area Code (M49)"], dropna=False)
# NOTE(review): the aggregated result is indexed by "Area Code (M49)", whereas
# df_out carries a fresh 0..n-1 index, so assigning it to df_out[columns]
# directly would align on the wrong labels - join on the M49 code instead.
# The string "sum" selects the built-in groupby sum; passing the NumPy callable
# np.sum to aggregate() is deprecated in recent pandas releases.
grouped[columns].aggregate("sum")[columns[0]]

Area Code (M49)
4      22.2
8       4.7
12      2.8
16      0.0
20      0.0
       ... 
862    16.4
876     0.0
882     4.7
887    46.1
894     0.0
Name: 2.1.1 Prevalence of undernourishment (%)_2016, Length: 247, dtype: float64

In [None]:

# Inspect the second of the selected prevalence columns (values and dtype).
df.loc[:, columns[1]]

0         0
1        23
2         0
3         0
4         0
       ... 
1217      0
1218    NaN
1219      0
1220      0
1221      0
Name: 2.1.1 Prevalence of undernourishment (%)_2017, Length: 1222, dtype: object