In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os

## DF Parsing + Pre-Processing

In [3]:
# Load the four pre-processed datasets.
# Paths are assembled with os.path.join rather than backslash literals: sequences
# such as "\D" or "\P" inside a normal string are invalid escapes, and a
# hard-coded "\" separator only resolves on Windows.
PROCESSED_DIR = os.path.join(".", "Datasets", "Processed")

df_undernourishment = pd.read_csv(
    os.path.join(PROCESSED_DIR, "Prevalence_of_Undernourishment_Processed.csv")
)
df_net_fdi = pd.read_csv(os.path.join(PROCESSED_DIR, "Net_FDI_Processed.csv"))
# These two files are decoded as ISO-8859-1 instead of the pandas default (UTF-8).
df_cpi = pd.read_csv(
    os.path.join(PROCESSED_DIR, "CPI_Food_Processed.csv"), encoding="ISO-8859-1"
)
df_socio_factors = pd.read_csv(
    os.path.join(PROCESSED_DIR, "SocioFactors_Processed.csv"), encoding="ISO-8859-1"
)


In [5]:
# Shorten the yearly prevalence columns to "<year>_(%)".
prevalence_renames = {
    f"Prevalence_of_undernourishment_{year}": f"{year}_(%)"
    for year in range(2016, 2021)
}
df_undernourishment = df_undernourishment.rename(columns=prevalence_renames)

# Entries reported as "<2.5" are substituted with 1.5.
df_undernourishment = df_undernourishment.replace("<2.5", "1.5")

# The substitution above still leaves the year columns holding strings, so they
# are converted to numbers here; otherwise numeric operations and plots treat
# them as categories. Anything that cannot be parsed becomes NaN.
prevalence_cols = [
    col for col in prevalence_renames.values() if col in df_undernourishment.columns
]
df_undernourishment[prevalence_cols] = df_undernourishment[prevalence_cols].apply(
    pd.to_numeric, errors="coerce"
)

df_cpi = df_cpi.rename(columns={"ISO3 Code": "ISO3_Code"})  # remove after CPI df col naming resolved

# Drop the "Unnamed: 0" column when present. errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError.
df_undernourishment = df_undernourishment.drop(columns="Unnamed: 0", errors="ignore")
df_net_fdi = df_net_fdi.drop(columns="Unnamed: 0", errors="ignore")

print(df_undernourishment.columns)
print(df_net_fdi.columns)
print(df_cpi.columns)
print(df_socio_factors.columns)

Index(['M49_Code', 'Country_Name', '2016_(%)', '2017_(%)', '2018_(%)',
       '2019_(%)', '2020_(%)'],
      dtype='object')
Index(['M49_Code', 'Country_Name', 'Total_FDI_Inflows_2016',
       'Total_FDI_Inflows_2017', 'Total_FDI_Inflows_2018',
       'Total_FDI_Inflows_2019', 'Total_FDI_Inflows_2020',
       'Total_FDI_Outflows_2016', 'Total_FDI_Outflows_2017',
       'Total_FDI_Outflows_2018', 'Total_FDI_Outflows_2019',
       'Total_FDI_Outflows_2020', 'Net_FDI_2016', 'Net_FDI_2017',
       'Net_FDI_2018', 'Net_FDI_2019', 'Net_FDI_2020'],
      dtype='object')
Index(['M49_Code', 'Country_Name', 'ISO3_Code', 'CPI_Food_2016',
       'CPI_Food_2017', 'CPI_Food_2018', 'CPI_Food_2019', 'CPI_Food_2020',
       'CPI_Food_Avg'],
      dtype='object')
Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Life_Expectancy_2016',
       'Life_Expectancy_2017', 'Life_Expectancy_2018', 'Life_Expectancy_2019',
       'Life_Expectancy_2020', 'Life_Expectancy_Avg',
       'Mean_Years_Of_Schooling_2016', 

In [None]:
# Print the per-column dtypes of the CPI and socio-factor frames.
# df_undernourishment and df_net_fdi are not inspected here.
for frame in (df_cpi, df_socio_factors):
    print(frame.dtypes)

### Relevant Plotting Functions


In [12]:
# Lookup table from a short plot-type label to the seaborn function that draws it.
PLOTS_TO_FUNC = dict(
    Hist=sns.histplot,
    KDE=sns.kdeplot,
    Ecdf=sns.ecdfplot,
    Bar=sns.barplot,
    Scatter=sns.scatterplot,
    Violin=sns.violinplot,
    Line=sns.lineplot,
)


In [7]:
def custom_settings(
    markersize: float = 2,
    linewidth: float = 0.5,
    labelsize: float = 6.5,
    xlim: "tuple | None" = (0, 55),
):
    """
    Reset matplotlib to its defaults and apply the notebook's plot styling.

    Parameters:
        markersize => value written to rcParams["lines.markersize"]
        linewidth  => value written to rcParams["lines.linewidth"]
        labelsize  => value written to rcParams["xtick.labelsize"]
        xlim       => (low, high) x-axis limits applied to the current axes;
                      pass None to leave the axes untouched
    Returns nothing; the effect is on matplotlib's global state.
    """
    # Start from a clean slate so repeated calls do not stack earlier overrides.
    mpl.rcdefaults()
    mpl.rcParams.update({
        "lines.markersize": markersize,
        "lines.linewidth": linewidth,
        "xtick.labelsize": labelsize,
    })
    # The limits were previously fixed at (0, 55); that is still the default.
    # Skipping this step with xlim=None also avoids touching (or implicitly
    # creating) the current axes when only the rcParams are wanted.
    if xlim is not None:
        plt.gca().set_xlim(xlim)

# custom_settings()

In [8]:
def split_cols_exceeding_thresh(
    df: pd.DataFrame,
    label_name: str,
    thresh: int = 5
) -> dict:
    """
    Split the unique values of a column into groups of at most `thresh` values.

    The non-null unique values of df[label_name] are sorted and divided into
        n = ceil(n_unique values of the col / thresh)
    consecutive groups whose sizes differ by at most one.

    Parameters:
        df         => frame holding the column
        label_name => name of the column whose values are grouped
        thresh     => maximum number of unique values per group (must be > 0)
    Returns dictionary where:
        keys => arbitrary names (0,1,2,3,...)
        values => pd.Series with the unique values belonging to that group
    An empty dictionary is returned when the column has no non-null values.
    Raises:
        ValueError => if thresh is not positive
    """
    if thresh <= 0:
        raise ValueError("thresh must be a positive integer")

    # Work on unique values only: splitting the raw column would let a repeated
    # value straddle two groups and push group sizes past the threshold.
    unique_sorted = df[label_name].dropna().drop_duplicates().sort_values()
    n_unique = len(unique_sorted)
    if n_unique == 0:
        # np.array_split rejects zero sections, so there is nothing to split.
        return {}

    n_cuts = int(np.ceil(n_unique / thresh))
    # Split integer positions rather than the Series itself, then select by
    # position; this keeps every group a Series with its original index labels.
    position_groups = np.array_split(np.arange(n_unique), n_cuts)
    return {
        group_id: unique_sorted.iloc[positions]
        for group_id, positions in enumerate(position_groups)
    }


In [9]:
def save_fig(
    fig,
    dest_path,
    file_name
):
    """
    Save a plot as a 300-dpi image in dest_path and close its figure.

    Parameters:
        fig       => the matplotlib Figure to save, or an Axes belonging to it;
                     any other value (including None) falls back to the
                     current pyplot figure
        dest_path => directory to write into; created if it does not exist
        file_name => name of the image file, including its extension
    """
    # Resolve the figure that actually owns the plot instead of relying only on
    # pyplot's notion of the "current" figure.
    if isinstance(fig, mpl.figure.Figure):
        figure = fig
    elif isinstance(fig, mpl.axes.Axes):
        figure = fig.figure
    else:
        figure = plt.gcf()

    # Right-align the x tick labels of every axes in the figure.
    for ax in figure.axes:
        for tick_label in ax.get_xticklabels():
            tick_label.set_horizontalalignment("right")

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(dest_path, exist_ok=True)
    figure.savefig(os.path.join(dest_path, file_name), dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not pile up in memory when called in a loop.
    plt.close(figure)

## Univariate Analysis

In [10]:
# Output folders for the EDA graphs, rooted at the current working directory.
cwd = os.getcwd()
UNIVARIATE, BIVARIATE = (
    os.path.join(cwd, relative_dir)
    for relative_dir in (
        "Graphs/Graphs_EDA/Univariate_Analysis",
        "Graphs/Graphs_EDA/Bivariate_Analysis",
    )
)

In [11]:
# Create the univariate output folder (and any missing parents); an already
# existing folder is left as it is.
os.makedirs(UNIVARIATE, exist_ok=True)

In [13]:
# Columns to plot, grouped by indicator.
cols_dict = {
    "Prevalence" : ["2016_(%)", "2017_(%)", "2018_(%)","2019_(%)","2020_(%)"],
    "Life_Expectancy" : ['Life_Expectancy_2016','Life_Expectancy_2017', 'Life_Expectancy_2018', 'Life_Expectancy_2019','Life_Expectancy_2020', 'Life_Expectancy_Avg'],
    "CPI": ['CPI_Food_2016', 'CPI_Food_2017', 'CPI_Food_2018', 'CPI_Food_2019', 'CPI_Food_2020','CPI_Food_Avg'],
    "Net_FDI": ['Net_FDI_2016', 'Net_FDI_2017','Net_FDI_2018', 'Net_FDI_2019', 'Net_FDI_2020']
}

plot_types = ["Hist", "KDE"]

# DataFrame that holds the columns of each group in cols_dict. An explicit
# mapping makes a group without a frame fail immediately with a KeyError,
# instead of handing a placeholder object to the plotting function.
frames_by_group = {
    "Prevalence": df_undernourishment,
    "Life_Expectancy": df_socio_factors,
    "CPI": df_cpi,
    "Net_FDI": df_net_fdi,
}

# One image per (column, plot type) pair, written to the univariate folder.
for group, cols in cols_dict.items():
    frame = frames_by_group[group]
    for col in cols:
        for plot_type in plot_types:
            # A dedicated figure per plot keeps successive plots from being
            # drawn on top of each other.
            fig, ax = plt.subplots()
            PLOTS_TO_FUNC[plot_type](data=frame, x=col, ax=ax)
            ax.set_title(f"{plot_type}: {col}")
            file_name = f"{plot_type}_plot_{group}_{col}.png"
            save_fig(fig, UNIVARIATE, file_name)