In [1]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [2]:
# Load the raw wide-format table: identifier columns (iso3, country, ...) plus
# one column per indicator and year (e.g. le_2016, mys_2016).
# Forward slashes replace the original backslash separators: inside a normal
# string literal they formed invalid escape sequences (a warning on recent
# Python versions) and the path only resolved on Windows.
df = pd.read_csv("../../Datasets/Original/LifeExpectancy_Meanyearsofschooling.csv")
df.head()

Unnamed: 0,iso3,country,hdicode,region,hdi_rank_2021,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,mf_2012,mf_2013,mf_2014,mf_2015,mf_2016,mf_2017,mf_2018,mf_2019,mf_2020,mf_2021
0,AFG,Afghanistan,Low,SA,180.0,0.273,0.279,0.287,0.297,0.292,...,1.86,1.88,1.66,1.62,1.66,1.41,1.32,1.38,1.38,1.38
1,AGO,Angola,Medium,SSA,148.0,,,,,,...,4.09,4.53,3.97,3.59,2.79,2.64,2.28,2.18,2.18,2.18
2,ALB,Albania,High,ECA,67.0,0.647,0.629,0.614,0.617,0.624,...,12.44,11.49,13.14,12.61,14.39,14.46,12.85,12.96,12.96,12.96
3,AND,Andorra,Very High,,40.0,,,,,,...,,,,,,,,,,
4,ARE,United Arab Emirates,Very High,AS,26.0,0.728,0.739,0.742,0.748,0.755,...,49.56,49.68,55.49,59.76,64.95,75.61,65.97,68.95,68.95,68.95


## Skeleton dataframe

In [56]:
# Seed the output table with the ISO3 identifier column only; every other
# column is attached to it in the cells that follow.
columns = ["iso3"]
df_new = df.loc[:, columns].copy()
df_new.shape

(206, 1)

In [57]:
# Conversion table providing the country name and M49 code for each ISO3 code.
df_code = pd.read_csv("../datasets/Original//AreaCode_ISO3Code_conversion.csv").loc[
    :, ["Country", "ISO3 Code", "M49 Code"]
]
# Left join keyed on the ISO3 code. validate="1:1" makes pandas raise if the
# key is duplicated on either side; rows without a match keep NaN values.
df_new = df_new.join(
    df_code.set_index("ISO3 Code"),
    on="iso3",
    validate="1:1",
)
df_new

Unnamed: 0,iso3,Country,M49 Code
0,AFG,Afghanistan,4.0
1,AGO,Angola,24.0
2,ALB,Albania,8.0
3,AND,Andorra,20.0
4,ARE,United Arab Emirates,784.0
...,...,...,...
201,ZZG.ECA,,
202,ZZH.LAC,,
203,ZZI.SA,,
204,ZZJ.SSA,,


In [58]:
# Switch to the naming convention used in the exported file.
df_new = df_new.rename(
    columns={"iso3": "ISO3_Code", "Country": "Country_Name", "M49 Code": "M49_Code"}
)
# After the join the M49 codes are floats (4.0, 24.0, ...); render them as
# digit-only strings. na_action="ignore" leaves missing codes as NaN instead of
# passing them to the formatter.
# Series.map is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1.
df_new["M49_Code"] = df_new["M49_Code"].map(lambda code: "%.0f" % code, na_action="ignore")
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4
1,AGO,Angola,24
2,ALB,Albania,8
3,AND,Andorra,20
4,ARE,United Arab Emirates,784


In [59]:
# Show the rows that were left without a country name after the join.
df_new.loc[df_new["Country_Name"].isna()]

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
195,ZZA.VHHD,,
196,ZZB.HHD,,
197,ZZC.MHD,,
198,ZZD.LHD,,
199,ZZE.AS,,
200,ZZF.EAP,,
201,ZZG.ECA,,
202,ZZH.LAC,,
203,ZZI.SA,,
204,ZZJ.SSA,,


In [60]:
# Remove the rows containing nulls: they are the legend entries (ISO3 codes
# such as ZZA.VHHD), not countries.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing, so a country lacking only its M49 code would be dropped too - confirm.
df_new = df_new.dropna()
df_new

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4
1,AGO,Angola,24
2,ALB,Albania,8
3,AND,Andorra,20
4,ARE,United Arab Emirates,784
...,...,...,...
190,WSM,Samoa,882
191,YEM,Yemen,887
192,ZAF,South Africa,710
193,ZMB,Zambia,894


## Life expectancy
Source indicator: Life Expectancy at Birth (years), column prefix `le`.

In [61]:
# Raw column (le_<year>) -> output column name, for the years 2016 through 2020.
names_dict = {f"le_{year}": f"Life_Expectancy_{year}" for year in range(2016, 2021)}
columns = list(names_dict)
# Column assignment aligns on the row index, so only the rows still present in
# df_new receive values from the raw table.
df_new[columns] = df[columns]
df_new = df_new.rename(columns=names_dict)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020
0,AFG,Afghanistan,4,63.1361,63.016,63.081,63.5645,62.5751
1,AGO,Angola,24,61.0923,61.6798,62.1438,62.4484,62.2612
2,ALB,Albania,8,78.8602,79.0473,79.1838,79.2825,76.9893
3,AND,Andorra,20,82.9671,82.9803,82.9923,83.0039,79.0234
4,ARE,United Arab Emirates,784,79.3347,79.5036,79.6274,79.7262,78.9457


In [62]:
# Row-wise mean over the renamed yearly columns listed in names_dict
# (pandas skips missing values by default).
columns = [*names_dict.values()]
df_new["Life_Expectancy_Avg"] = df_new.loc[:, columns].mean(axis="columns")
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg
0,AFG,Afghanistan,4,63.1361,63.016,63.081,63.5645,62.5751,63.07454
1,AGO,Angola,24,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251
2,ALB,Albania,8,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262
3,AND,Andorra,20,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934
4,ARE,United Arab Emirates,784,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752


## Mean years of schooling
Source indicator: Mean Years of Schooling (years), column prefix `mys`.

In [63]:
# Raw column (mys_<year>) -> output column name, for the years 2016 through 2020.
names_dict = {f"mys_{year}": f"Mean_Years_Of_Schooling_{year}" for year in range(2016, 2021)}
columns = list(names_dict)
# Column assignment aligns on the row index, so only the rows still present in
# df_new receive values from the raw table.
df_new[columns] = df[columns]
df_new = df_new.rename(columns=names_dict)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,Mean_Years_Of_Schooling_2017,Mean_Years_Of_Schooling_2018,Mean_Years_Of_Schooling_2019,Mean_Years_Of_Schooling_2020
0,AFG,Afghanistan,4,63.1361,63.016,63.081,63.5645,62.5751,63.07454,2.46366,2.561425,2.659189,2.756953,2.854718
1,AGO,Angola,24,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251,5.417391,5.417391,5.417391,5.417391,5.417391
2,ALB,Albania,8,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,10.910692,11.096983,11.286455,11.286455
3,AND,Andorra,20,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934,10.5561,10.555773,10.555446,10.55512,10.55512
4,ARE,United Arab Emirates,784,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.84262,12.0554,12.484,12.69403,12.69403


In [64]:
# Row-wise mean over the renamed yearly columns listed in names_dict. pandas
# skips missing values by default, so a row with only some years missing still
# gets an average.
# NOTE(review): this column spells "of" in lower case while the yearly columns
# use "Of"; kept as-is because the name ends up in the exported CSV.
columns = [*names_dict.values()]
df_new["Mean_Years_of_Schooling_Avg"] = df_new.loc[:, columns].mean(axis="columns")
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,Mean_Years_Of_Schooling_2017,Mean_Years_Of_Schooling_2018,Mean_Years_Of_Schooling_2019,Mean_Years_Of_Schooling_2020,Mean_Years_of_Schooling_Avg
0,AFG,Afghanistan,4,63.1361,63.016,63.081,63.5645,62.5751,63.07454,2.46366,2.561425,2.659189,2.756953,2.854718,2.659189
1,AGO,Angola,24,61.0923,61.6798,62.1438,62.4484,62.2612,61.9251,5.417391,5.417391,5.417391,5.417391,5.417391,5.417391
2,ALB,Albania,8,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,10.910692,11.096983,11.286455,11.286455,11.061623
3,AND,Andorra,20,82.9671,82.9803,82.9923,83.0039,79.0234,82.1934,10.5561,10.555773,10.555446,10.55512,10.55512,10.555512
4,ARE,United Arab Emirates,784,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.84262,12.0554,12.484,12.69403,12.69403,12.154016


In [65]:
# Export the processed table without the index column.
# newline="" disables text-mode newline translation on the file object, so the
# "\n" requested via lineterminator is what is actually written (without it,
# Windows would turn every "\n" into "\r\n"). pandas asks for newline="" when a
# file object is passed to to_csv.
with open("../datasets/SocioFactors_Processed.csv", "w", encoding="utf8", newline="") as f:
    df_new.to_csv(f, lineterminator="\n", index=False)

## Stats

In [66]:
# Number of missing values in each column of the processed table.
df_new.isna().sum()

ISO3_Code                       0
Country_Name                    0
M49_Code                        0
Life_Expectancy_2016            0
Life_Expectancy_2017            0
Life_Expectancy_2018            0
Life_Expectancy_2019            0
Life_Expectancy_2020            0
Life_Expectancy_Avg             0
Mean_Years_Of_Schooling_2016    5
Mean_Years_Of_Schooling_2017    5
Mean_Years_Of_Schooling_2018    4
Mean_Years_Of_Schooling_2019    4
Mean_Years_Of_Schooling_2020    4
Mean_Years_of_Schooling_Avg     4
dtype: int64