In [10]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [11]:
# Load the raw access-to-electricity table: one row per country/aggregate,
# one value column per year (2016-2020).
# Forward slashes are used so the relative path resolves on every OS and
# contains no backslash escape sequences; this also matches the other
# dataset paths used in this notebook.
df = pd.read_csv("../datasets/Original/Access_To_Electricity.csv")
df

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Access to electricity (% of population),EG.ELC.ACCS.ZS,Afghanistan,AFG,97.69999695,97.69999695,96.61613464,97.69999695,97.69999695
1,Access to electricity (% of population),EG.ELC.ACCS.ZS,Albania,ALB,99.88999939,99.88999939,100,100,100
2,Access to electricity (% of population),EG.ELC.ACCS.ZS,Algeria,DZA,99.35025024,99.63549042,99.69783783,99.5,99.80413055
3,Access to electricity (% of population),EG.ELC.ACCS.ZS,American Samoa,ASM,..,..,..,..,..
4,Access to electricity (% of population),EG.ELC.ACCS.ZS,Andorra,AND,100,100,100,100,100
...,...,...,...,...,...,...,...,...,...
261,Access to electricity (% of population),EG.ELC.ACCS.ZS,Sub-Saharan Africa,SSF,43.75128699,43.73162988,46.37067224,47.1047724,48.35107879
262,Access to electricity (% of population),EG.ELC.ACCS.ZS,Sub-Saharan Africa (excluding high income),SSA,43.74607839,43.72649408,46.36585943,47.10010705,48.34660195
263,Access to electricity (% of population),EG.ELC.ACCS.ZS,Sub-Saharan Africa (IDA & IBRD countries),TSS,43.75128699,43.73162988,46.37067224,47.1047724,48.35107879
264,Access to electricity (% of population),EG.ELC.ACCS.ZS,Upper middle income,UMC,99.18525079,99.26673501,99.23690771,99.30921926,99.37388382


## Skeleton dataframe

In [12]:
# Skeleton frame: one row per distinct country code, stored under the key
# name "ISO3_Code" that the later join/merge cells rely on.
columns = ["Country Code"]

# Rename on the raw frame as well, because a later cell selects df["ISO3_Code"].
# `rename` silently skips labels that are absent, so re-running this cell
# after the column has already been renamed is harmless.
df = df.rename(columns={columns[0]: "ISO3_Code"})

# drop_duplicates returns a new frame, so its result has to be kept; the
# index is reset so the skeleton is numbered 0..n-1.
df_new = df[["ISO3_Code"]].drop_duplicates(keep="first").reset_index(drop=True)
df_new.shape

(266, 1)

In [13]:
# Read the ISO3 -> country name / M49 code conversion table and keep only
# the three fields used below.
df_code = pd.read_csv("../datasets/Original/AreaCode_ISO3Code_conversion.csv")
df_code = df_code.loc[:, ["Country", "ISO3 Code", "M49 Code"]]

# Left-join the lookup onto the skeleton, matching ISO3_Code against the
# lookup's "ISO3 Code" index; validate="1:1" makes pandas raise if either
# side has a repeated key.
iso3_lookup = df_code.set_index("ISO3 Code")
df_new = df_new.join(iso3_lookup, on="ISO3_Code", validate="1:1")
df_new

Unnamed: 0,ISO3_Code,Country,M49 Code
0,AFG,Afghanistan,4.0
1,ALB,Albania,8.0
2,DZA,Algeria,12.0
3,ASM,American Samoa,16.0
4,AND,Andorra,20.0
...,...,...,...
261,SSF,,
262,SSA,,
263,TSS,,
264,UMC,,


In [14]:
# Give the joined lookup columns underscore-style names. Labels that are not
# present in the frame are ignored by `rename`.
column_renames = {
    "ISO3 Code": "ISO3_Code",
    "Country": "Country_Name",
    "M49 Code": "M49_Code",
}
df_new = df_new.rename(columns=column_renames)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4.0
1,ALB,Albania,8.0
2,DZA,Algeria,12.0
3,ASM,American Samoa,16.0
4,AND,Andorra,20.0


In [15]:
# List the codes for which the conversion table supplied no country name.
unmatched = df_new["Country_Name"].isna()
df_new.loc[unmatched]

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
39,CHI,,
105,XKX,,
217,AFE,,
218,AFW,,
219,ARB,,
220,CSS,,
221,CEB,,
222,EAR,,
223,EAS,,
224,EAP,,


In [16]:
# Drop every code that found no match in the conversion table. Most of these
# are regional / income-group aggregates (e.g. SSF, SSA, TSS, UMC) rather
# than countries.
# NOTE(review): CHI and XKX are also unmatched and therefore dropped here;
# they do not look like aggregates -- confirm that excluding them is intended.
#
# Only the identifying columns are checked, so re-running this cell after the
# yearly indicator columns have been added cannot discard a country merely
# because its indicator values are missing.
key_columns = ["ISO3_Code", "Country_Name", "M49_Code"]
df_new = df_new.dropna(subset=key_columns)
df_new

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,AFG,Afghanistan,4.0
1,ALB,Albania,8.0
2,DZA,Algeria,12.0
3,ASM,American Samoa,16.0
4,AND,Andorra,20.0
...,...,...,...
212,VIR,United States Virgin Islands,850.0
213,PSE,Palestine,275.0
214,YEM,Yemen,887.0
215,ZMB,Zambia,894.0


## Access To Electricity
Access To Electricity (% of population)

In [17]:
# Bring the five yearly value columns (the last five columns of the raw
# table) over to the skeleton frame, matched on ISO3_Code.
columns = df.columns[-5:].values
df_tmp = pd.concat([df[["ISO3_Code"]], df.iloc[:, -5:]], axis=1)

# Raw headers look like "2016 [YR2016]"; only the leading four-character
# year is kept in the new column name.
names_dict = {column: "Access_To_Electricity_%s" % column[:4] for column in columns}

# Inner merge: codes removed from the skeleton are not re-introduced.
df_new = df_new.merge(df_tmp, how="inner", on="ISO3_Code").rename(columns=names_dict)
df_new

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Access_To_Electricity_2016,Access_To_Electricity_2017,Access_To_Electricity_2018,Access_To_Electricity_2019,Access_To_Electricity_2020
0,AFG,Afghanistan,4.0,97.69999695,97.69999695,96.61613464,97.69999695,97.69999695
1,ALB,Albania,8.0,99.88999939,99.88999939,100,100,100
2,DZA,Algeria,12.0,99.35025024,99.63549042,99.69783783,99.5,99.80413055
3,ASM,American Samoa,16.0,..,..,..,..,..
4,AND,Andorra,20.0,100,100,100,100,100
...,...,...,...,...,...,...,...,...
210,VIR,United States Virgin Islands,850.0,100,100,100,100,100
211,PSE,Palestine,275.0,100,99.69999695,100,100,100
212,YEM,Yemen,887.0,68.8595047,79.19999695,62,72.75107574,73.75792694
213,ZMB,Zambia,894.0,35.42545319,40.29999924,40.31789017,43,44.5244751


In [18]:
# Convert the yearly indicator columns to floats and add their row-wise mean.
#
# The year columns are selected by name (Access_To_Electricity_<4 digits>)
# rather than by position, so re-running this cell after the average column
# exists does not mistake the average for a year or lose the first year.
columns = df_new.filter(regex=r"^Access_To_Electricity_\d{4}$").columns

# pd.to_numeric replaces the element-wise `applymap` (deprecated in recent
# pandas). errors="coerce" turns the ".." / empty placeholders -- and any
# other non-numeric text -- into NaN; astype(float) keeps every year column
# float64 even when all of its values are whole numbers.
df_cols = df_new[columns].apply(pd.to_numeric, errors="coerce").astype(float)

# Overwrite the year columns in place (column order is preserved).
df_new = df_new.assign(**{column: df_cols[column] for column in columns})

# NaN years are skipped by mean(); a row with no data at all stays NaN.
df_new["Access_To_Electricity_Avg"] = df_cols.mean(axis=1)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Access_To_Electricity_2016,Access_To_Electricity_2017,Access_To_Electricity_2018,Access_To_Electricity_2019,Access_To_Electricity_2020,Access_To_Electricity_Avg
0,AFG,Afghanistan,4.0,97.699997,97.699997,96.616135,97.699997,97.699997,97.483224
1,ALB,Albania,8.0,99.889999,99.889999,100.0,100.0,100.0,99.956
2,DZA,Algeria,12.0,99.35025,99.63549,99.697838,99.5,99.804131,99.597542
3,ASM,American Samoa,16.0,,,,,,
4,AND,Andorra,20.0,100.0,100.0,100.0,100.0,100.0,100.0


In [19]:
# Persist the processed table.
# The path is handed straight to pandas instead of a handle from open(..., "w"):
# a text-mode handle opened without newline="" translates "\n" to the platform
# line ending, which would defeat lineterminator="\n" on Windows.
# Floats are written at full precision; pass float_format="%.6f" here if
# rounded output is wanted.
df_new.to_csv(
    "../datasets/Processed/Access_To_Electricity_Processed.csv",
    encoding="utf8",
    lineterminator="\n",
    index=False,
)

## Stats

In [20]:
# Number of missing values in each column of the processed table.
df_new.isna().sum(axis=0)

ISO3_Code                     0
Country_Name                  0
M49_Code                      0
Access_To_Electricity_2016    1
Access_To_Electricity_2017    1
Access_To_Electricity_2018    1
Access_To_Electricity_2019    1
Access_To_Electricity_2020    1
Access_To_Electricity_Avg     1
dtype: int64