In [1]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [2]:
# File names of the processed indicator tables, listed in the order they are
# loaded. Every file follows the "<indicator>_Processed.csv" naming pattern.
DATASETS = [
    "%s_Processed.csv" % stem
    for stem in (
        "SocioFactors",
        "CPI_Food",
        "Net_FDI",
        "Infant_Mortality_Rate",
        "Unemployment",
        "Access_To_Electricity",
        "Prevalence_of_Undernourishment",
    )
]


In [3]:
# Load every processed dataset, strip the identifier columns that would be
# duplicated by the join, and inner-join everything on the M49 country code.

# Forward slashes work on Windows as well as POSIX and avoid the invalid
# escape sequences ("\.", "\d", "\P") that a backslash path literal contains.
files = ["../../datasets/Processed/%s" % name for name in DATASETS]


def _read_csv_with_fallback(path):
    """Read ``path`` as UTF-8, retrying as ISO-8859-1 only on a decode error.

    Any other failure (missing file, malformed CSV, interrupt) propagates
    instead of being silently retried with a different encoding.
    """
    try:
        return pd.read_csv(path, encoding="utf8")
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="ISO-8859-1")


# The fallback is decided per file, so one legacy-encoded file no longer forces
# valid UTF-8 files to be re-read (and garbled) as ISO-8859-1.
dfs = [_read_csv_with_fallback(file) for file in files]

# Drop columns
# Identifier columns are kept only in the first frame (dfs[0]); every other
# frame contributes just its M49 key and its indicator values.
columns = ["Unnamed: 0", "Country_Name", "ISO3 Code", "ISO3_Code"]
for i in range(1, len(dfs)):
    frame = dfs[i].drop(columns=columns, errors="ignore")
    # Some sources label the join key "Area Code (M49)"; rename() ignores the
    # mapping when that column is absent.
    frame = frame.rename(columns={"Area Code (M49)": "M49_Code"})
    if "Net_FDI" in DATASETS[i]:
        # Only the Net_FDI_* columns are wanted from the FDI file.
        total_fdi_columns = frame.columns[frame.columns.str.contains("Total_FDI")]
        frame = frame.drop(columns=total_fdi_columns)
    dfs[i] = frame

# Show the surviving columns of each frame as a sanity check before merging.
for frame in dfs:
    print(frame.columns)

# Merge
# Inner join: a country is kept only if it appears in every dataset.
df = dfs[0]
for frame in dfs[1:]:
    df = df.merge(frame, on="M49_Code", how="inner")
df

Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Life_Expectancy_2016',
       'Life_Expectancy_2017', 'Life_Expectancy_2018', 'Life_Expectancy_2019',
       'Life_Expectancy_2020', 'Life_Expectancy_Avg',
       'Mean_Years_Of_Schooling_2016', 'Mean_Years_Of_Schooling_2017',
       'Mean_Years_Of_Schooling_2018', 'Mean_Years_Of_Schooling_2019',
       'Mean_Years_Of_Schooling_2020', 'Mean_Years_of_Schooling_Avg'],
      dtype='object')
Index(['M49_Code', 'CPI_Food_2016', 'CPI_Food_2017', 'CPI_Food_2018',
       'CPI_Food_2019', 'CPI_Food_2020', 'CPI_Food_Avg'],
      dtype='object')
Index(['M49_Code', 'Net_FDI_2016', 'Net_FDI_2017', 'Net_FDI_2018',
       'Net_FDI_2019', 'Net_FDI_2020'],
      dtype='object')
Index(['M49_Code', 'Infant_Mortality_Rate_2016', 'Infant_Mortality_Rate_2017',
       'Infant_Mortality_Rate_2018', 'Infant_Mortality_Rate_2019',
       'Infant_Mortality_Rate_2020', 'Infant_Mortality_Rate_Avg'],
      dtype='object')
Index(['M49_Code', 'Unemployment_2016', 'Unemp

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,...,Access_To_Electricity_2017,Access_To_Electricity_2018,Access_To_Electricity_2019,Access_To_Electricity_2020,Access_To_Electricity_Avg,Prevalence_of_undernourishment_2016,Prevalence_of_undernourishment_2017,Prevalence_of_undernourishment_2018,Prevalence_of_undernourishment_2019,Prevalence_of_undernourishment_2020
0,AFG,Afghanistan,4.0,63.1361,63.0160,63.0810,63.5645,62.5751,63.07454,2.463660,...,97.699997,96.616135,97.699997,97.699997,97.483224,22.2,23.0,24.0,26.9,29.8
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612,61.92510,5.417391,...,43.013260,45.290001,45.642799,46.890610,44.529960,15.4,15.4,15.7,17.9,20.8
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,...,99.889999,100.000000,100.000000,100.000000,99.956000,4.7,4.7,4.6,4.3,3.9
3,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.842620,...,100.000000,100.000000,100.000000,100.000000,100.000000,6.3,6.4,6.2,6.0,5.6
4,ARG,Argentina,32.0,76.3077,76.8330,76.9994,77.2845,75.8921,76.66334,10.928190,...,100.000000,99.989578,100.000000,100.000000,99.967831,2.6,3.1,3.4,3.5,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,VNM,Viet Nam,704.0,73.9382,73.9632,73.9757,74.0929,75.3779,74.26958,8.122675,...,100.000000,100.000000,99.400002,100.000000,99.720000,7.8,7.2,6.8,6.2,5.7
148,VUT,Vanuatu,548.0,69.6496,69.7095,69.7948,69.8769,70.2995,69.86606,6.680000,...,62.799999,61.754513,64.590187,67.333267,62.859593,11.2,12.3,12.6,12.4,11.9
149,WSM,Samoa,882.0,72.5397,72.5900,72.6358,72.1572,72.7677,72.53808,11.526498,...,96.800003,99.994476,99.199997,100.000000,99.123553,4.7,4.6,4.5,4.4,4.4
150,YEM,Yemen,887.0,66.0641,65.9573,64.5751,65.0917,64.6501,65.26766,3.000000,...,79.199997,62.000000,72.751076,73.757927,71.313701,46.1,46.6,44.7,42.8,41.4


## Preprocessing

In [7]:
# Drop the "Avg" summary columns; only the per-year values are kept.
df = df.drop(columns=df.columns[df.columns.str.contains("Avg")])

# Replace missing values with the mean of their column.
# numeric_only=True keeps the string columns (ISO3_Code, Country_Name) out of
# the reduction: without it older pandas emits a FutureWarning about dropping
# nuisance columns, and pandas >= 2.0 raises TypeError.
# NOTE(review): M49_Code is numeric too, so a missing code would also be
# mean-filled - confirm the key column never contains nulls.
df = df.fillna(df.mean(numeric_only=True))

# Remaining null count per column.
df.isnull().sum()

  df.fillna(df.mean(), inplace=True)


ISO3_Code                              0
Country_Name                           0
M49_Code                               0
Life_Expectancy_2016                   0
Life_Expectancy_2017                   0
Life_Expectancy_2018                   0
Life_Expectancy_2019                   0
Life_Expectancy_2020                   0
Mean_Years_Of_Schooling_2016           0
Mean_Years_Of_Schooling_2017           0
Mean_Years_Of_Schooling_2018           0
Mean_Years_Of_Schooling_2019           0
Mean_Years_Of_Schooling_2020           0
CPI_Food_2016                          0
CPI_Food_2017                          0
CPI_Food_2018                          0
CPI_Food_2019                          0
CPI_Food_2020                          0
Net_FDI_2016                           0
Net_FDI_2017                           0
Net_FDI_2018                           0
Net_FDI_2019                           0
Net_FDI_2020                           0
Infant_Mortality_Rate_2016             0
Infant_Mortality

## Convert to CSV

In [8]:
import csv
# Write the merged frame to disk without the index column.
# The path is handed straight to pandas rather than to a text-mode open():
# a handle opened without newline="" lets Windows translate the requested
# "\n" terminator into "\r\n". Forward slashes keep the path portable and
# avoid invalid backslash escape sequences in the literal.
df.to_csv(
    "../../datasets/Processed/All_DF_Processed.csv",
    index=False,
    encoding="utf8",
    lineterminator="\n",
)