In [2]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [3]:
df = pd.read_csv("..\datasets\ConsumerPriceIndices_2016_2020_AllMths_ValueUS$_byCountry.csv")
df[df["Months"] == "December"]

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Year Code,Year,Item Code,Item,Months Code,Months,Unit,Value,Flag,Flag Description,Note
11,CP,Consumer Price Indices,4,Afghanistan,2016,2016,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,108.916832,X,Figure from international organizations,base year is 2015
23,CP,Consumer Price Indices,4,Afghanistan,2017,2017,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,113.832502,X,Figure from international organizations,base year is 2015
35,CP,Consumer Price Indices,4,Afghanistan,2018,2018,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,113.490296,X,Figure from international organizations,base year is 2015
47,CP,Consumer Price Indices,4,Afghanistan,2019,2019,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,119.067918,X,Figure from international organizations,base year is 2015
59,CP,Consumer Price Indices,4,Afghanistan,2020,2020,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,128.002089,A,Official figure,base year is 2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12131,CP,Consumer Price Indices,716,Zimbabwe,2016,2016,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,96.878032,X,Figure from international organizations,base year is 2015
12143,CP,Consumer Price Indices,716,Zimbabwe,2017,2017,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,103.383454,X,Figure from international organizations,base year is 2015
12155,CP,Consumer Price Indices,716,Zimbabwe,2018,2018,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,158.767447,X,Figure from international organizations,base year is 2015
12167,CP,Consumer Price Indices,716,Zimbabwe,2019,2019,23013,"Consumer Prices, Food Indices (2015 = 100)",7012,December,,1300.732638,X,Figure from international organizations,base year is 2015


## Skeleton dataframe

In [4]:
columns = ["Area Code (M49)"]
df_new = df[columns[0]].unique()
df_new = pd.DataFrame(df_new, columns=columns)
df_new.shape

(203, 1)

In [5]:
df_code = pd.read_csv("../datasets/AreaCode_ISO3Code_conversion.csv")
df_code = df_code[["Country", "ISO3 Code", "M49 Code"]]
df_new = df_new.join(df_code.set_index("M49 Code"), on="Area Code (M49)", validate="1:1")
df_new

  df_new = df_new.join(df_code.set_index("M49 Code"), on="Area Code (M49)", validate="1:1")


Unnamed: 0,Area Code (M49),Country,ISO3 Code
0,4,Afghanistan,AFG
1,248,Åland Islands,ALA
2,8,Albania,ALB
3,12,Algeria,DZA
4,20,Andorra,AND
...,...,...,...
198,862,Venezuela (Bolivarian Republic of),VEN
199,704,Viet Nam,VNM
200,887,Yemen,YEM
201,894,Zambia,ZMB


In [6]:
df_new.rename(columns={"iso3": "ISO3_Code", "Country": "Country_Name", "Area Code (M49)": "M49_Code"}, inplace=True)
df_new.head()

Unnamed: 0,M49_Code,Country_Name,ISO3 Code
0,4,Afghanistan,AFG
1,248,Åland Islands,ALA
2,8,Albania,ALB
3,12,Algeria,DZA
4,20,Andorra,AND


In [7]:
df_new[df_new["Country_Name"].isnull()]

Unnamed: 0,M49_Code,Country_Name,ISO3 Code


In [8]:
# drop the null rows cause it's the legend
df_new.dropna(inplace=True)
df_new

Unnamed: 0,M49_Code,Country_Name,ISO3 Code
0,4,Afghanistan,AFG
1,248,Åland Islands,ALA
2,8,Albania,ALB
3,12,Algeria,DZA
4,20,Andorra,AND
...,...,...,...
198,862,Venezuela (Bolivarian Republic of),VEN
199,704,Viet Nam,VNM
200,887,Yemen,YEM
201,894,Zambia,ZMB


## Consumer Price Food Indices
Consumer Prices, Food Indices (2015 = 100), monthly

In [9]:
# filter out december only
df = df[df["Months"] == "December"]
df = df[["Area Code (M49)", "Year", "Value"]]
print(df.shape)

# copy over cpi values
def copy_over(year, df_tmp):
    df_col = df[df["Year"] == year].copy()
    df_col = df_col[["Area Code (M49)", "Value"]]
    assert df_col.shape[0] == df_tmp.shape[0]
    df_tmp = df_tmp.join(df_col.set_index("Area Code (M49)"), on="M49_Code", validate="1:1")
    df_tmp["Value"] = df_tmp["Value"].apply(lambda x: float(x))
    df_tmp = df_tmp.rename(columns={"Value": "CPI_Food_%d" % year})
    return df_tmp
df_new = copy_over(2016, df_new)
df_new = copy_over(2017, df_new)
df_new = copy_over(2018, df_new)
df_new = copy_over(2019, df_new)
df_new = copy_over(2020, df_new)
df_new

(1015, 3)


Unnamed: 0,M49_Code,Country_Name,ISO3 Code,CPI_Food_2016,CPI_Food_2017,CPI_Food_2018,CPI_Food_2019,CPI_Food_2020
0,4,Afghanistan,AFG,108.916832,113.832502,1.134903e+02,1.190679e+02,1.280021e+02
1,248,Åland Islands,ALA,99.614148,100.584200,1.013542e+02,1.045143e+02,1.033243e+02
2,8,Albania,ALB,104.588667,107.497139,1.106383e+02,1.134304e+02,1.163389e+02
3,12,Algeria,DZA,105.435418,111.015653,1.108684e+02,1.106159e+02,1.133403e+02
4,20,Andorra,AND,101.932302,104.825577,1.068286e+02,1.084978e+02,1.096106e+02
...,...,...,...,...,...,...,...,...
198,862,Venezuela (Bolivarian Republic of),VEN,953.574367,11086.187138,1.594700e+07,1.288889e+09,3.875371e+10
199,704,Viet Nam,VNM,103.139639,101.286235,1.064430e+02,1.161996e+02,1.193164e+02
200,887,Yemen,YEM,121.903092,128.044919,1.349019e+02,1.439082e+02,1.529995e+02
201,894,Zambia,ZMB,125.120587,131.148498,1.417520e+02,1.633637e+02,1.963545e+02


In [10]:
# add a column for average over the years
columns = df_new.columns[-5:]
df_new["CPI_Food_Avg"] = df_new[columns].mean(axis=1)
df_new.head()

Unnamed: 0,M49_Code,Country_Name,ISO3 Code,CPI_Food_2016,CPI_Food_2017,CPI_Food_2018,CPI_Food_2019,CPI_Food_2020,CPI_Food_Avg
0,4,Afghanistan,AFG,108.916832,113.832502,113.490296,119.067918,128.002089,116.661927
1,248,Åland Islands,ALA,99.614148,100.5842,101.354238,104.514337,103.324285,101.878242
2,8,Albania,ALB,104.588667,107.497139,110.638289,113.430423,116.338895,110.498683
3,12,Algeria,DZA,105.435418,111.015653,110.86839,110.615938,113.340313,110.255142
4,20,Andorra,AND,101.932302,104.825577,106.828613,108.49781,109.610608,106.338982


In [11]:
# prettify the float
#df_new = df_new[df_new.columns[-6:]].applymap(na_action='ignore', func=lambda x: ".6f" % x)
with open("../datasets/CPI_Food_Processed.csv", "w", encoding="utf8") as f:
    df_new.to_csv(f, lineterminator="\n", index=False)

## Stats

In [243]:
df_new.isnull().sum()

M49_Code         0
Country_Name     0
ISO3 Code        0
CPI_Food_2016    0
CPI_Food_2017    0
CPI_Food_2018    0
CPI_Food_2019    0
CPI_Food_2020    0
CPI_Food_Avg     0
dtype: int64