In [118]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os

## DataFrame Loading and Pre-Processing

In [119]:
# Forward slashes work on every OS and avoid backslash escapes inside normal
# string literals (a "\N" sequence is a syntax error; "\P", "\D", "\S", "\C"
# are invalid escapes that newer Python versions warn about).
PROCESSED_DIR = "../../Datasets/Processed"

df_undernourishment = pd.read_csv(f"{PROCESSED_DIR}/Prevalence_of_Undernourishment_Processed.csv")
df_net_fdi = pd.read_csv(f"{PROCESSED_DIR}/Net_FDI_Processed.csv")
# These two files are read with an explicit ISO-8859-1 encoding.
df_cpi = pd.read_csv(f"{PROCESSED_DIR}/CPI_Food_Processed.csv", encoding="ISO-8859-1")
df_socio_factors = pd.read_csv(f"{PROCESSED_DIR}/SocioFactors_Processed.csv", encoding="ISO-8859-1")


In [120]:
# Shorten the yearly prevalence column names to "<year>_(%)".
prevalence_renames = {
    f"Prevalence_of_undernourishment_{year}": f"{year}_(%)"
    for year in range(2016, 2021)
}
df_undernourishment = df_undernourishment.rename(columns=prevalence_renames)
# Cells holding the string "<2.5" are replaced by "1.5".
# NOTE: the replacement value is itself a string, not a float.
df_undernourishment = df_undernourishment.replace("<2.5", "1.5")

df_cpi = df_cpi.rename(columns={"ISO3 Code": "ISO3_Code"})  # remove after CPI df col naming resolved

# errors="ignore" keeps this cell re-runnable: on a second run the
# "Unnamed: 0" column is already gone and a plain drop would raise KeyError.
df_undernourishment = df_undernourishment.drop(columns="Unnamed: 0", errors="ignore")
df_net_fdi = df_net_fdi.drop(columns="Unnamed: 0", errors="ignore")

print(df_undernourishment.columns)
print(df_net_fdi.columns)
print(df_cpi.columns)
print(df_socio_factors.columns)

Index(['M49_Code', 'Country_Name', '2016_(%)', '2017_(%)', '2018_(%)',
       '2019_(%)', '2020_(%)'],
      dtype='object')
Index(['M49_Code', 'Country_Name', 'Total_FDI_Inflows_2016',
       'Total_FDI_Inflows_2017', 'Total_FDI_Inflows_2018',
       'Total_FDI_Inflows_2019', 'Total_FDI_Inflows_2020',
       'Total_FDI_Outflows_2016', 'Total_FDI_Outflows_2017',
       'Total_FDI_Outflows_2018', 'Total_FDI_Outflows_2019',
       'Total_FDI_Outflows_2020', 'Net_FDI_2016', 'Net_FDI_2017',
       'Net_FDI_2018', 'Net_FDI_2019', 'Net_FDI_2020'],
      dtype='object')
Index(['M49_Code', 'Country_Name', 'ISO3_Code', 'CPI_Food_2016',
       'CPI_Food_2017', 'CPI_Food_2018', 'CPI_Food_2019', 'CPI_Food_2020',
       'CPI_Food_Avg'],
      dtype='object')
Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Life_Expectancy_2016',
       'Life_Expectancy_2017', 'Life_Expectancy_2018', 'Life_Expectancy_2019',
       'Life_Expectancy_2020', 'Life_Expectancy_Avg',
       'Mean_Years_Of_Schooling_2016', 

In [121]:
# Print the column dtypes of the CPI and socio-factor frames
# (df_undernourishment and df_net_fdi are intentionally not printed here).
for frame in (df_cpi, df_socio_factors):
    print(frame.dtypes)

M49_Code           int64
Country_Name      object
ISO3_Code         object
CPI_Food_2016    float64
CPI_Food_2017    float64
CPI_Food_2018    float64
CPI_Food_2019    float64
CPI_Food_2020    float64
CPI_Food_Avg     float64
dtype: object
ISO3_Code                        object
Country_Name                     object
M49_Code                        float64
Life_Expectancy_2016            float64
Life_Expectancy_2017            float64
Life_Expectancy_2018            float64
Life_Expectancy_2019            float64
Life_Expectancy_2020            float64
Life_Expectancy_Avg             float64
Mean_Years_Of_Schooling_2016    float64
Mean_Years_Of_Schooling_2017    float64
Mean_Years_Of_Schooling_2018    float64
Mean_Years_Of_Schooling_2019    float64
Mean_Years_Of_Schooling_2020    float64
Mean_Years_of_Schooling_Avg     float64
dtype: object


### Relevant Plotting Functions


In [122]:
# Lookup table: short plot-type label -> seaborn plotting function.
PLOTS_TO_FUNC = dict(
    [
        ("Hist", sns.histplot),
        ("KDE", sns.kdeplot),
        ("Ecdf", sns.ecdfplot),
        ("Bar", sns.barplot),
        ("Scatter", sns.scatterplot),
        ("Violin", sns.violinplot),
        ("Line", sns.lineplot),
    ]
)


In [123]:
def custom_settings(
    markersize: float = 2,
    linewidth: float = 0.5,
    labelsize: float = 6.5,
    xlim: tuple = (0, 55)
):
    """
    Reset matplotlib's rcParams to their defaults, then apply the plot styling
    used in this notebook and set the x-axis limits of the current axes.

    markersize => value for rcParams["lines.markersize"]
    linewidth  => value for rcParams["lines.linewidth"]
    labelsize  => value for rcParams["xtick.labelsize"]
    xlim       => (left, right) x-axis limits for the current axes; this was
                  previously hard-coded to (0, 55), which remains the default
    """
    mpl.rcdefaults()
    mpl.rcParams.update(
        {
            "lines.markersize": markersize,
            "lines.linewidth": linewidth,
            "xtick.labelsize": labelsize,
        }
    )
    # The limits are applied to whatever pyplot considers the current axes.
    plt.gca().set_xlim(xlim)

# custom_settings()

In [124]:
def split_cols_exceeding_thresh(
    df: pd.DataFrame,
    label_name: str,
    thresh: int = 5
) -> dict:
    """
    For the cols that exceed the categorical threshold
        => This function breaks the sorted *unique* vals of the col into n groups
            where n = ceil(n_unique values of the col / categorical threshold)
    Missing values (NaN) are ignored, matching how nunique() counts.
    Returns dictionary where:
        keys => arbitrary names (0,1,2,3,...)
        values => list of unique values of the label col, at most `thresh` per group
    Returns an empty dictionary when the col has no non-missing values.
    Raises ValueError when thresh is smaller than 1.
    """
    if thresh < 1:
        raise ValueError("thresh must be a positive integer")

    # Split the unique values, not the raw column: the number of groups is
    # derived from the unique count, so duplicates would otherwise overfill
    # the groups and spread one value across several of them.
    unique_vals = df[label_name].dropna().drop_duplicates().sort_values()
    if unique_vals.empty:
        # np.array_split cannot split into 0 sections
        return {}

    n_cuts = int(np.ceil(len(unique_vals) / thresh))
    # Object dtype keeps the elements as plain Python values in the output lists.
    chunks = np.array_split(unique_vals.to_numpy(dtype=object), n_cuts)
    return {idx: chunk.tolist() for idx, chunk in enumerate(chunks)}


In [125]:
def save_fig(
    fig,
    dest_path,
    file_name
):
    """
    Right-align the x tick labels of a plot, save it to dest_path/file_name
    (dpi=300, tight bounding box) and close its figure.

    fig       => object returned by the plotting call. An Axes (what the seaborn
                 calls in this notebook return), a Figure, or any object exposing
                 a `.figure` attribute is accepted. If None, the current pyplot
                 figure is used.
    dest_path => directory to save into (created if it does not exist)
    file_name => name of the output file inside dest_path
    """
    # Resolve the figure from the argument instead of relying on pyplot's
    # "current figure", which may be a different plot.
    figure = getattr(fig, "figure", None)
    if figure is None:
        figure = fig if hasattr(fig, "savefig") else plt.gcf()

    # If an Axes was passed, only its labels are touched; otherwise every axes
    # of the resolved figure is handled.
    target_axes = [fig] if hasattr(fig, "get_xticklabels") else figure.axes
    for ax in target_axes:
        plt.setp(ax.get_xticklabels(), horizontalalignment="right")

    if dest_path:
        os.makedirs(dest_path, exist_ok=True)
    figure.savefig(os.path.join(dest_path, file_name), dpi=300, bbox_inches="tight")
    # Close exactly the figure that was saved so later plots start clean.
    plt.close(figure)

## Univariate Analysis

In [147]:
## Output directories for the saved plots (created if they do not exist yet)

cwd = os.getcwd()
UNIVARIATE = os.path.join(cwd,"Graphs/Graphs_EDA/Univariate_Analysis")
BIVARIATE = os.path.join(cwd,"Graphs/Graphs_EDA/Bivariate_Analysis")
for plot_dir in (UNIVARIATE, BIVARIATE):
    os.makedirs(plot_dir, exist_ok=True)

plot_types_uni = ["Hist", "KDE"]

In [139]:
# Group label -> yearly (2016-2020) column names to plot for that group.
cols_dict = {
    "Prevalence": [f"{year}_(%)" for year in range(2016, 2021)],
    "Life_Expectancy": [f"Life_Expectancy_{year}" for year in range(2016, 2021)] + ["Life_Expectancy_Avg"],
    # "CPI": ['CPI_Food_2016', 'CPI_Food_2017', 'CPI_Food_2018', 'CPI_Food_2019', 'CPI_Food_2020','CPI_Food_Avg'],
    "Net_FDI": [f"Net_FDI_{year}" for year in range(2016, 2021)],
}



# for k,v in cols_dict.items():
#     df = pd.DataFrame
#     if k == "Prevalence":
#         df = df_undernourishment
#     elif k == "Life_Expectancy":
#         df = df_socio_factors
#     elif k == "CPI":
#         df = df_cpi
#     elif k == "Net_FDI":
#         df = df_net_fdi
#     print(k)
#     for col in cols_dict[k]:
#         for plot_type in plot_types_uni:
#             fig = PLOTS_TO_FUNC[plot_type](data=df, x=col)
#             file_name = f"{plot_type}_plot_{col}.png"
#             save_fig(fig, UNIVARIATE, file_name)



Prevalence
Life_Expectancy
Net_FDI


### Creating Categories from Numerical Columns


In [128]:
# Forward slashes keep the relative path portable across operating systems and
# avoid invalid backslash escapes ("\A", "\D", "\P") in a normal string literal.
df_all_processed = pd.read_csv("../../Datasets/Processed/All_Processed.csv")
# Rebind instead of sorting in place so re-running the cell is side-effect free.
df_all_processed = df_all_processed.sort_values(by="Country_Name")

In [129]:
# Display the column labels of the combined frame.
df_all_processed.columns

Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Year', 'Life_Expectancy',
       'Mean_Years_Of_Schooling', 'CPI_Food', 'Net_FDI',
       'Prevalence_of_undernourishment'],
      dtype='object')

In [137]:
## Life Expectancy
# include_lowest=True closes the first bin on the left, so a value of exactly 51
# is placed in "51-60" instead of silently becoming NaN.
df_all_processed["Life_Expectancy_Cat"] = pd.cut(
    df_all_processed["Life_Expectancy"],
    bins=[51, 60, 70, 80, 90],
    labels=["51-60", "61-70", "71-80", "81-90"],
    include_lowest=True,
)
# df_all_processed["Life_Expectancy_Cat"] = pd.cut(df_all_processed["Life_Expectancy"], bins=[51,55,60,65,70,75,80,85,90], labels=["51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90"])

## Net FDI
# The outer labels ("<-200000", ">400000") are open-ended, so the outer bin edges
# are infinite; with finite edges, values beyond them were silently turned into NaN.
df_all_processed["Net_FDI_Cat"] = pd.cut(
    df_all_processed["Net_FDI"],
    bins=[-np.inf, -200000, 0, 200000, 400000, np.inf],
    labels=["<-200000", "(-200000)-0", "0-200000", "200000-400000", ">400000"],
)

## CPI
print(df_all_processed["CPI_Food"].max())
print(df_all_processed["CPI_Food"].min())
# CPI has very large outlier data points, so fixed bins might not be ideal for it
df_all_processed.sort_values(by="CPI_Food", ascending=False)

38753710727.95829
86.673816


Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Year,Life_Expectancy,Mean_Years_Of_Schooling,CPI_Food,Net_FDI,Prevalence_of_undernourishment,Life_Expectancy_Cat,Net_FDI_Cat
754,VEN,Venezuela (Bolivarian Republic of),862.0,2020,71.0949,11.107277,3.875371e+10,1037.333334,22.9,71-80,0-200000
602,VEN,Venezuela (Bolivarian Republic of),862.0,2019,72.1614,11.107277,1.288889e+09,-142.000000,24.9,71-80,(-200000)-0
450,VEN,Venezuela (Bolivarian Republic of),862.0,2018,71.9788,10.835699,1.594700e+07,225.000000,22.7,71-80,0-200000
298,VEN,Venezuela (Bolivarian Republic of),862.0,2017,71.9430,10.570760,1.108619e+04,-2302.000000,22.2,71-80,(-200000)-0
728,SDN,Sudan,729.0,2020,65.6136,3.820000,1.621521e+03,1284.407750,12.8,61-70,0-200000
...,...,...,...,...,...,...,...,...,...,...,...
741,TCD,Chad,148.0,2020,52.7774,2.573774,9.113692e+01,1284.407750,32.7,51-60,0-200000
285,TCD,Chad,148.0,2017,52.3076,2.388869,9.091219e+01,204.867505,29.1,51-60,0-200000
133,TCD,Chad,148.0,2016,52.0834,2.301459,8.902543e+01,3457.305457,28.5,51-60,0-200000
437,TCD,Chad,148.0,2018,52.8253,2.479598,8.678160e+01,3746.793298,30.3,51-60,0-200000


In [140]:
# Histogram of undernourishment prevalence, coloured by life-expectancy band.
fig = sns.histplot(
    data=df_all_processed,
    x="Prevalence_of_undernourishment",
    hue="Life_Expectancy_Cat",
)
save_fig(fig, UNIVARIATE, "Hist_plot_Prevalence_hue_life_expectancy.png")


In [145]:
# Histogram of undernourishment prevalence, coloured by Net FDI band.
fig = sns.histplot(
    data=df_all_processed,
    x="Prevalence_of_undernourishment",
    hue="Net_FDI_Cat",
)
save_fig(fig, UNIVARIATE, "Hist_plot_Prevalence_hue_NET_FDI.png")

## Bivariate Analysis



In [149]:
# Plot kinds drawn for the life-expectancy band vs. prevalence comparison.
plot_types_bi = ["Hist", "Bar", "Scatter", "Violin"]

for plot_type in plot_types_bi:
    # Look up the seaborn function registered for this plot kind.
    plot_func = PLOTS_TO_FUNC[plot_type]
    fig = plot_func(
        data=df_all_processed,
        x="Life_Expectancy_Cat",
        y="Prevalence_of_undernourishment",
    )
    file_name = f"{plot_type}_plot_Life_Expectancy_Prevalence.png"
    save_fig(fig, BIVARIATE, file_name)


In [155]:
## Heatmap

# Correlate the numeric columns only. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns on its own and raises on the string columns
# (ISO3_Code, Country_Name) and the categorical *_Cat columns of this frame.
corr = df_all_processed.select_dtypes(include="number").corr()
print(corr)
fig = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap="BrBG")
save_fig(fig, BIVARIATE, "Heat_Map_Correlation.png")


                                    M49_Code          Year  Life_Expectancy  \
M49_Code                        1.000000e+00  5.455710e-13        -0.033021   
Year                            5.455710e-13  1.000000e+00         0.005299   
Life_Expectancy                -3.302071e-02  5.298630e-03         1.000000   
Mean_Years_Of_Schooling        -9.138736e-03  3.802575e-02         0.773977   
CPI_Food                        6.565036e-02  5.215979e-02        -0.007300   
Net_FDI                         6.906865e-02 -1.435168e-02        -0.024566   
Prevalence_of_undernourishment  7.108758e-02  2.033980e-02        -0.680891   

                                Mean_Years_Of_Schooling  CPI_Food   Net_FDI  \
M49_Code                                      -0.009139  0.065650  0.069069   
Year                                           0.038026  0.052160 -0.014352   
Life_Expectancy                                0.773977 -0.007300 -0.024566   
Mean_Years_Of_Schooling                        1.00