In [2]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [3]:
# Load the raw FAO "prevalence of undernourishment" export.
# Forward slashes keep the relative path valid on every OS and avoid the
# invalid "\D", "\O" and "\P" escape sequences that the backslash-separated
# literal contained (a SyntaxWarning on recent Python versions).
df = pd.read_csv("../Datasets/Original/PrevalenceofUndernourishment_2016_2020_Percent_byCountry.csv")
df.columns

Index(['Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code (SDG)', 'Item', 'Year Code', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description', 'Note'],
      dtype='object')

In [4]:
"""
Mapping:
    Dependent var: Use AreaCode(M49)
    Independent var:
        Consumer price indice: ISO3

Columns:
    ISO3
    Area Code (M49)
    Country Name
    var_name_year (order dependent var first then independent vars) 
    var_name_avg (after every var)
    var_name_unit (after every var)

'Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code (SDG)', 'Item', 'Year Code', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description', 'Note'
"""

"\nMapping:\n    Dependent var: Use AreaCode(M49)\n    Independent var:\n        Consumer price indice: ISO3\n\nColumns:\n    ISO3\n    Area Code (M49)\n    Country Name\n    var_name_year (order dependent var first then independent vars) \n    var_name_avg (after every var)\n    var_name_unit (after every var)\n\n'Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',\n       'Element', 'Item Code (SDG)', 'Item', 'Year Code', 'Year', 'Unit',\n       'Value', 'Flag', 'Flag Description', 'Note'\n"

## Data Pre-Processing


In [5]:
# Keep just the columns the analysis needs, then give the two identifier
# columns shorter, code-friendly names.
cols = ["Area","Area Code (M49)", "Year", "Unit", "Value"]
df_trim = df.loc[:, cols].rename(
    columns={"Area Code (M49)": "M49_Code", "Area": "Country_Name"}
)

df_trim.columns

Index(['Country_Name', 'M49_Code', 'Year', 'Unit', 'Value'], dtype='object')

In [6]:
## Dealing with NA/Empty Cells

# Method 1 => Removing all empty rows
def remove_NaN_rows(df_in: pd.DataFrame, val_col_name: str) -> pd.DataFrame:
    """Return only the rows of ``df_in`` that have a value in ``val_col_name``.

    Args:
        df_in: Frame to filter; it is not modified.
        val_col_name: Name of the column whose missing entries (NaN/None)
            cause a row to be discarded.

    Returns:
        The surviving rows with all columns and their original index labels.
    """
    has_value = df_in[val_col_name].notna()
    return df_in[has_value]

# Method 2 => Replacing NaN with mean value
def replace_NaN_with_mean(df_in: pd.DataFrame, val_col_name: str):
    """Placeholder for a mean-imputation alternative to ``remove_NaN_rows``.

    Not implemented: the body is only ``...``, so calling this function does
    nothing and returns ``None``.

    TODO(review): presumably this should fill the missing entries of
    ``val_col_name`` with that column's mean -- confirm the intended
    behaviour (and how non-numeric entries should be handled) before
    implementing.
    """
    ...

# Apply Method 1: discard every row that has no entry in "Value".
df_trim = remove_NaN_rows(df_trim, "Value")
df_trim

Unnamed: 0,Country_Name,M49_Code,Year,Unit,Value
0,Afghanistan,4,2016,%,22.2
1,Afghanistan,4,2017,%,23
2,Afghanistan,4,2018,%,24
3,Afghanistan,4,2019,%,26.9
4,Afghanistan,4,2020,%,29.8
...,...,...,...,...,...
1207,Yemen,887,2016,%,46.1
1208,Yemen,887,2017,%,46.6
1209,Yemen,887,2018,%,44.7
1210,Yemen,887,2019,%,42.8


In [7]:
# Reporting years covered by the dataset (2016 through 2020, inclusive).
year_l = list(range(2016, 2021))

In [8]:
## Transposing Value Column Data

def transpose_data(df1: pd.DataFrame, years=None) -> pd.DataFrame:
    """Spread the long "Year"/"Value" pair into one column per year.

    For every year a column ``Prevalence_of_undernourishment_<year>`` is
    added. A row keeps its "Value" in the column of its own "Year" and gets
    the placeholder ``0`` in every other year column, so the result is a
    sparse frame with the same rows as the input. The "Year" and "Value"
    columns are dropped from the result.

    Args:
        df1: Long-format frame with at least "Year" and "Value" columns.
            It is left untouched; the work happens on a copy.
        years: Iterable of years to create columns for. Defaults to the
            notebook-level ``year_l`` list when omitted.

    Returns:
        A new frame with the remaining input columns followed by the
        per-year columns, in the order given by ``years``.
    """
    if years is None:
        # Fall back to the notebook-wide list of reporting years.
        years = year_l

    # Copy first so the caller's frame does not silently gain the year
    # columns (this also avoids SettingWithCopyWarning on filtered inputs).
    df_out = df1.copy()

    for year in years:
        col_name = f"Prevalence_of_undernourishment_{year}"
        # Keep the value on rows belonging to this year; 0 marks "no entry
        # for this row". Building the column in one step avoids writing
        # string values into a pre-created integer column.
        df_out[col_name] = df1["Value"].where(df1["Year"] == year, 0)

    return df_out.drop(columns=["Year", "Value"])

# Spread the yearly values of the cleaned frame into one column per year.
df_transformed = transpose_data(df_trim)
df_transformed

Unnamed: 0,Country_Name,M49_Code,Unit,Prevalence_of_undernourishment_2016,Prevalence_of_undernourishment_2017,Prevalence_of_undernourishment_2018,Prevalence_of_undernourishment_2019,Prevalence_of_undernourishment_2020
0,Afghanistan,4,%,22.2,0,0,0,0
1,Afghanistan,4,%,0,23,0,0,0
2,Afghanistan,4,%,0,0,24,0,0
3,Afghanistan,4,%,0,0,0,26.9,0
4,Afghanistan,4,%,0,0,0,0,29.8
...,...,...,...,...,...,...,...,...
1207,Yemen,887,%,46.1,0,0,0,0
1208,Yemen,887,%,0,46.6,0,0,0
1209,Yemen,887,%,0,0,44.7,0,0
1210,Yemen,887,%,0,0,0,42.8,0


In [9]:
## Condensing sparse matrix into a condensed form

def condense_df(df: pd.DataFrame, years=None) -> pd.DataFrame:
    """Collapse the sparse per-year frame into one row per country.

    The input is expected to hold "M49_Code", "Country_Name" and one
    ``Prevalence_of_undernourishment_<year>`` column per year, where ``0``
    marks "no entry for this row". For each year the non-zero entries are
    looked up by "M49_Code" and attached to the country table.

    Note: because ``0`` is the empty marker, a genuine value equal to 0 is
    treated as missing and ends up as NaN in the result.

    Args:
        df: Sparse frame as described above; it is not modified.
        years: Iterable of years whose columns should be condensed.
            Defaults to the notebook-level ``year_l`` list when omitted.

    Returns:
        A frame with columns "M49_Code", "Country_Name" and one column per
        year, with countries in order of first appearance in ``df``.
    """
    if years is None:
        # Fall back to the notebook-wide list of reporting years.
        years = year_l

    # De-duplicate code/name *pairs* so each code stays attached to its own
    # name; de-duplicating the two columns separately misaligns them as soon
    # as the numbers of unique codes and unique names differ.
    df_out = (
        df[["M49_Code", "Country_Name"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    for year in years:
        col_name = f"Prevalence_of_undernourishment_{year}"
        # Rows that actually carry a value for this year (0 == empty marker).
        year_values = df.loc[df[col_name] != 0, ["M49_Code", col_name]]
        # Every code in year_values comes from df, so a left join returns the
        # same rows as an outer join while keeping the first-appearance order
        # regardless of how the installed pandas sorts outer-join keys.
        df_out = df_out.merge(year_values, on="M49_Code", how="left")

    return df_out

# Collapse the sparse per-year rows into a single row per country.
df_transformed = condense_df(df=df_transformed)
df_transformed

Unnamed: 0,M49_Code,Country_Name,Prevalence_of_undernourishment_2016,Prevalence_of_undernourishment_2017,Prevalence_of_undernourishment_2018,Prevalence_of_undernourishment_2019,Prevalence_of_undernourishment_2020
0,4,Afghanistan,22.2,23,24,26.9,29.8
1,8,Albania,4.7,4.7,4.6,4.3,3.9
2,12,Algeria,2.8,2.7,<2.5,<2.5,<2.5
3,24,Angola,15.4,15.4,15.7,17.9,20.8
4,32,Argentina,2.6,3.1,3.4,3.5,3.7
...,...,...,...,...,...,...,...
156,860,Uzbekistan,<2.5,<2.5,<2.5,<2.5,<2.5
157,548,Vanuatu,11.2,12.3,12.6,12.4,11.9
158,862,Venezuela (Bolivarian Republic of),16.4,22.2,22.7,24.9,22.9
159,704,Viet Nam,7.8,7.2,6.8,6.2,5.7


In [12]:
# Substitute every "<2.5" entry with the stand-in "1.5".
# The replacement is itself a string; no numeric conversion happens here.

df_transformed = df_transformed.replace(to_replace="<2.5", value="1.5")
df_transformed

Unnamed: 0,M49_Code,Country_Name,Prevalence_of_undernourishment_2016,Prevalence_of_undernourishment_2017,Prevalence_of_undernourishment_2018,Prevalence_of_undernourishment_2019,Prevalence_of_undernourishment_2020
0,4,Afghanistan,22.2,23,24,26.9,29.8
1,8,Albania,4.7,4.7,4.6,4.3,3.9
2,12,Algeria,2.8,2.7,1.5,1.5,1.5
3,24,Angola,15.4,15.4,15.7,17.9,20.8
4,32,Argentina,2.6,3.1,3.4,3.5,3.7
...,...,...,...,...,...,...,...
156,860,Uzbekistan,1.5,1.5,1.5,1.5,1.5
157,548,Vanuatu,11.2,12.3,12.6,12.4,11.9
158,862,Venezuela (Bolivarian Republic of),16.4,22.2,22.7,24.9,22.9
159,704,Viet Nam,7.8,7.2,6.8,6.2,5.7


In [11]:
# df_transformed.to_csv("../Datasets/Processed/Prevalence_of_Undernourishment_Processed.csv")