In [1]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [2]:
# Name of the indicator processed by this notebook; it is reused for the input
# file name, the output file name and the generated column names.
FEATURE = "Tax_Revenue"
# Forward slashes work on every platform (Windows included) and avoid the
# invalid escape sequences that backslashes produce in a normal string literal.
df = pd.read_csv("../../datasets/Original/%s.csv" % FEATURE)
df

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Argentina,ARG,Tax revenue (% of GDP),GC.TAX.TOTL.GD.ZS,12.0972836411962,10.939623860491,9.99239008384335,10.4759215719336,10.758498148152
1,Australia,AUS,Tax revenue (% of GDP),GC.TAX.TOTL.GD.ZS,22.2743077338841,22.0857675207496,23.1935173358308,23.4177085110275,22.5960261824879
2,Brazil,BRA,Tax revenue (% of GDP),GC.TAX.TOTL.GD.ZS,13.7104309816062,13.605228515206,13.9401079935106,13.7376948362641,12.9612195690296
3,China,CHN,Tax revenue (% of GDP),GC.TAX.TOTL.GD.ZS,9.12291556057604,9.41931765253305,9.05268573156305,8.49286376989869,8.09062623388489
4,France,FRA,Tax revenue (% of GDP),GC.TAX.TOTL.GD.ZS,23.0648275010082,23.6293346543377,24.1708437248498,24.5146627776513,24.7758439505658
...,...,...,...,...,...,...,...,...,...
266,,,,,,,,,
267,,,,,,,,,
268,,,,,,,,,
269,Data from database: World Development Indicators,,,,,,,,


## Skeleton dataframe

In [3]:
# Skeleton frame: one row per distinct value of the country-code column.
# Series.unique() keeps a single NaN entry for rows that carry no code.
# (The previous df.drop_duplicates(...) call discarded its result, so it had
# no effect and has been removed.)
columns = ["Country Code"]
df_new = pd.DataFrame(df[columns[0]].unique(), columns=columns)

# Use the same key name on the raw frame and on the skeleton so they can be
# joined/merged on "ISO3_Code" in the following cells.
df = df.rename(columns={"Country Code": "ISO3_Code"})
df_new = df_new.rename(columns={"Country Code": "ISO3_Code"})
df_new.shape

(267, 1)

In [4]:
# Load the code-conversion table and keep only the three columns needed here.
df_code = pd.read_csv("../../datasets/Original/AreaCode_ISO3Code_conversion.csv")
df_code = df_code.loc[:, ["Country", "ISO3 Code", "M49 Code"]]

# Look up each ISO3 code in the conversion table; validate="1:1" makes pandas
# raise if the key is duplicated on either side of the join.
iso3_lookup = df_code.set_index("ISO3 Code")
df_new = df_new.join(iso3_lookup, on="ISO3_Code", validate="1:1")
df_new

Unnamed: 0,ISO3_Code,Country,M49 Code
0,ARG,Argentina,32.0
1,AUS,Australia,36.0
2,BRA,Brazil,76.0
3,CHN,China,159.0
4,FRA,France,250.0
...,...,...,...
262,SSA,,
263,TSS,,
264,UMC,,
265,WLD,,


In [5]:
# Switch the column labels to underscore-separated names.
renamed_labels = {
    "ISO3 Code": "ISO3_Code",
    "Country": "Country_Name",
    "M49 Code": "M49_Code",
}
df_new = df_new.rename(columns=renamed_labels)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,ARG,Argentina,32.0
1,AUS,Australia,36.0
2,BRA,Brazil,76.0
3,CHN,China,159.0
4,FRA,France,250.0


In [6]:
# Rows that have no country name after the join with the conversion table.
has_no_name = df_new["Country_Name"].isna()
df_new.loc[has_no_name]

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
56,CHI,,
114,XKX,,
217,AFE,,
218,AFW,,
219,ARB,,
220,CSS,,
221,CEB,,
222,EAR,,
223,EAS,,
224,EAP,,


In [7]:
# Discard every row that still contains a null value, e.g. codes such as WLD
# that have neither a country name nor an M49 code.
df_new = df_new.dropna()
df_new

Unnamed: 0,ISO3_Code,Country_Name,M49_Code
0,ARG,Argentina,32.0
1,AUS,Australia,36.0
2,BRA,Brazil,76.0
3,CHN,China,159.0
4,FRA,France,250.0
...,...,...,...
212,VIR,United States Virgin Islands,850.0
213,PSE,Palestine,275.0
214,YEM,Yemen,887.0
215,ZMB,Zambia,894.0


## Tax revenue
Tax revenue (% of GDP)

In [8]:
# The five yearly value columns are the last five columns of the raw frame.
columns = df.columns[-5:].values

# Map each raw label to "<FEATURE>_<year>", taking the year from the first four
# characters of the label (e.g. "2016 [YR2016]" -> "Tax_Revenue_2016").
names_dict = {column: "%s_%s" % (FEATURE, column[:4]) for column in columns}

# Pair the join key with the yearly values, then attach them to the skeleton.
df_tmp = pd.concat([df[["ISO3_Code"]], df.iloc[:, -5:]], axis=1)
df_new = df_new.merge(df_tmp, on="ISO3_Code", how="inner").rename(columns=names_dict)
df_new

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Tax_Revenue_2016,Tax_Revenue_2017,Tax_Revenue_2018,Tax_Revenue_2019,Tax_Revenue_2020
0,ARG,Argentina,32.0,12.0972836411962,10.939623860491,9.99239008384335,10.4759215719336,10.758498148152
1,AUS,Australia,36.0,22.2743077338841,22.0857675207496,23.1935173358308,23.4177085110275,22.5960261824879
2,BRA,Brazil,76.0,13.7104309816062,13.605228515206,13.9401079935106,13.7376948362641,12.9612195690296
3,CHN,China,159.0,9.12291556057604,9.41931765253305,9.05268573156305,8.49286376989869,8.09062623388489
4,FRA,France,250.0,23.0648275010082,23.6293346543377,24.1708437248498,24.5146627776513,24.7758439505658
...,...,...,...,...,...,...,...,...
210,VIR,United States Virgin Islands,850.0,..,..,..,..,..
211,PSE,Palestine,275.0,5.3138898775539,5.74072201287886,18.7818419132978,..,..
212,YEM,Yemen,887.0,..,..,..,..,..
213,ZMB,Zambia,894.0,13.3531814269452,15.1848569608409,16.5870654729348,16.6776226628035,16.442596894579


In [9]:
# add a column for average over the years
columns = df_new.columns[-5:]
raw_years = df_new[columns]

# The raw export marks missing values with ".." (or an empty string). Blank
# those markers out, then convert column-wise with pd.to_numeric instead of the
# deprecated element-wise DataFrame.applymap. Any other non-numeric text still
# raises, as it did before. astype(float) keeps the dtype float even for a
# column that happens to contain only whole numbers.
df_cols = (
    raw_years
    .mask(raw_years.isin(["", ".."]))
    .apply(pd.to_numeric)
    .astype(float)
)

# Rebuild the frame from the three identifier columns plus the numeric years.
df_new = pd.concat([df_new.iloc[:, :3], df_cols], axis=1)

# mean(axis=1) skips NaN, so the average is taken over the years that have a
# value; it is NaN only when all five years are missing.
df_new["%s_Avg" % FEATURE] = df_new[columns].mean(axis=1)
df_new.head()

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Tax_Revenue_2016,Tax_Revenue_2017,Tax_Revenue_2018,Tax_Revenue_2019,Tax_Revenue_2020,Tax_Revenue_Avg
0,ARG,Argentina,32.0,12.097284,10.939624,9.99239,10.475922,10.758498,10.852743
1,AUS,Australia,36.0,22.274308,22.085768,23.193517,23.417709,22.596026,22.713465
2,BRA,Brazil,76.0,13.710431,13.605229,13.940108,13.737695,12.96122,13.590936
3,CHN,China,159.0,9.122916,9.419318,9.052686,8.492864,8.090626,8.835682
4,FRA,France,250.0,23.064828,23.629335,24.170844,24.514663,24.775844,24.031103


In [10]:
# Write the processed table. The path is handed to pandas directly instead of
# an already-open text handle: a handle opened without newline="" lets Windows
# translate "\n" into "\r\n", which defeats lineterminator="\n".
# (Pass float_format="%.6f" here if fixed-precision output is ever wanted.)
df_new.to_csv(
    "../../datasets/Processed/%s_Processed.csv" % FEATURE,
    lineterminator="\n",
    index=False,
    encoding="utf8",
)

## Stats

In [11]:
# Number of missing values in each column of the processed table.
df_new.isna().sum()

ISO3_Code            0
Country_Name         0
M49_Code             0
Tax_Revenue_2016    77
Tax_Revenue_2017    76
Tax_Revenue_2018    83
Tax_Revenue_2019    90
Tax_Revenue_2020    98
Tax_Revenue_Avg     72
dtype: int64