In [12]:
import pandas as pd
import numpy as np 
import matplotlib as mpl
import seaborn as sns

In [13]:
DATASETS = [
    "SocioFactors_Processed.csv",
    # Add another file/delete a file if there's any change in dataset HERE
    "CPI_Food_Processed.csv",
    "Net_FDI_Processed.csv",
    "Unemployment_Processed.csv",
    "Infant_Mortality_Rate_Processed.csv",
    "Access_To_Electricity_Processed.csv",
    "Prevalence_of_Undernourishment_Processed.csv"
]

In [14]:
# Merge all csv files
files = ["..\..\datasets\Processed\%s" % x for x in DATASETS]
try:    
    dfs = [pd.read_csv(file, encoding='utf8') for file in files]
except:
    dfs = [pd.read_csv(file, encoding='ISO-8859-1') for file in files]

# Drop columns
for i in range(1, len(dfs)):
    columns = ["Unnamed: 0", "Country_Name", "ISO3 Code", "ISO3_Code"]
    for col in columns:
        if col in dfs[i].columns:
            dfs[i].drop(columns=[col], inplace=True)
    if "Area Code (M49)" in dfs[i].columns:
        dfs[i].rename(columns={"Area Code (M49)": "M49_Code"}, inplace=True)
    if "Net_FDI" in DATASETS[i]:
        dfs[i].drop(columns=dfs[i].columns[dfs[i].columns.str.contains("Total_FDI")], inplace=True)
for i in range(len(dfs)):
    print(dfs[i].columns)

# Merge
df = dfs[0]
for i in range(1, len(dfs)):
    df = df.merge(dfs[i], on="M49_Code", how="inner")
df

Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Life_Expectancy_2016',
       'Life_Expectancy_2017', 'Life_Expectancy_2018', 'Life_Expectancy_2019',
       'Life_Expectancy_2020', 'Life_Expectancy_Avg',
       'Mean_Years_Of_Schooling_2016', 'Mean_Years_Of_Schooling_2017',
       'Mean_Years_Of_Schooling_2018', 'Mean_Years_Of_Schooling_2019',
       'Mean_Years_Of_Schooling_2020', 'Mean_Years_of_Schooling_Avg'],
      dtype='object')
Index(['M49_Code', 'CPI_Food_2016', 'CPI_Food_2017', 'CPI_Food_2018',
       'CPI_Food_2019', 'CPI_Food_2020', 'CPI_Food_Avg'],
      dtype='object')
Index(['M49_Code', 'Net_FDI_2016', 'Net_FDI_2017', 'Net_FDI_2018',
       'Net_FDI_2019', 'Net_FDI_2020'],
      dtype='object')
Index(['M49_Code', 'Unemployment_2016', 'Unemployment_2017',
       'Unemployment_2018', 'Unemployment_2019', 'Unemployment_2020',
       'Unemployment_Avg'],
      dtype='object')
Index(['M49_Code', 'Infant_Mortality_Rate_2016', 'Infant_Mortality_Rate_2017',
       'Infant_Mortal

Unnamed: 0,ISO3_Code,Country_Name,M49_Code,Life_Expectancy_2016,Life_Expectancy_2017,Life_Expectancy_2018,Life_Expectancy_2019,Life_Expectancy_2020,Life_Expectancy_Avg,Mean_Years_Of_Schooling_2016,...,Access_To_Electricity_2017,Access_To_Electricity_2018,Access_To_Electricity_2019,Access_To_Electricity_2020,Access_To_Electricity_Avg,Prevalence_of_undernourishment_2016,Prevalence_of_undernourishment_2017,Prevalence_of_undernourishment_2018,Prevalence_of_undernourishment_2019,Prevalence_of_undernourishment_2020
0,AFG,Afghanistan,4.0,63.1361,63.0160,63.0810,63.5645,62.5751,63.07454,2.463660,...,97.699997,96.616135,97.699997,97.699997,97.483224,22.2,23.0,24.0,26.9,29.8
1,AGO,Angola,24.0,61.0923,61.6798,62.1438,62.4484,62.2612,61.92510,5.417391,...,43.013260,45.290001,45.642799,46.890610,44.529960,15.4,15.4,15.7,17.9,20.8
2,ALB,Albania,8.0,78.8602,79.0473,79.1838,79.2825,76.9893,78.67262,10.727528,...,99.889999,100.000000,100.000000,100.000000,99.956000,4.7,4.7,4.6,4.3,3.9
3,ARE,United Arab Emirates,784.0,79.3347,79.5036,79.6274,79.7262,78.9457,79.42752,10.842620,...,100.000000,100.000000,100.000000,100.000000,100.000000,6.3,6.4,6.2,6.0,5.6
4,ARG,Argentina,32.0,76.3077,76.8330,76.9994,77.2845,75.8921,76.66334,10.928190,...,100.000000,99.989578,100.000000,100.000000,99.967831,2.6,3.1,3.4,3.5,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,VNM,Viet Nam,704.0,73.9382,73.9632,73.9757,74.0929,75.3779,74.26958,8.122675,...,100.000000,100.000000,99.400002,100.000000,99.720000,7.8,7.2,6.8,6.2,5.7
148,VUT,Vanuatu,548.0,69.6496,69.7095,69.7948,69.8769,70.2995,69.86606,6.680000,...,62.799999,61.754513,64.590187,67.333267,62.859593,11.2,12.3,12.6,12.4,11.9
149,WSM,Samoa,882.0,72.5397,72.5900,72.6358,72.1572,72.7677,72.53808,11.526498,...,96.800003,99.994476,99.199997,100.000000,99.123553,4.7,4.6,4.5,4.4,4.4
150,YEM,Yemen,887.0,66.0641,65.9573,64.5751,65.0917,64.6501,65.26766,3.000000,...,79.199997,62.000000,72.751076,73.757927,71.313701,46.1,46.6,44.7,42.8,41.4


## Preprocessing

In [15]:
# Replace all null values by mean
df.fillna(df.mean(), inplace=True)
df.isnull().sum()

  df.fillna(df.mean(), inplace=True)


ISO3_Code                              0
Country_Name                           0
M49_Code                               0
Life_Expectancy_2016                   0
Life_Expectancy_2017                   0
Life_Expectancy_2018                   0
Life_Expectancy_2019                   0
Life_Expectancy_2020                   0
Life_Expectancy_Avg                    0
Mean_Years_Of_Schooling_2016           0
Mean_Years_Of_Schooling_2017           0
Mean_Years_Of_Schooling_2018           0
Mean_Years_Of_Schooling_2019           0
Mean_Years_Of_Schooling_2020           0
Mean_Years_of_Schooling_Avg            0
CPI_Food_2016                          0
CPI_Food_2017                          0
CPI_Food_2018                          0
CPI_Food_2019                          0
CPI_Food_2020                          0
CPI_Food_Avg                           0
Net_FDI_2016                           0
Net_FDI_2017                           0
Net_FDI_2018                           0
Net_FDI_2019    

## Convert to excel format

In [16]:
# Remove outliers i.e. 3std away from mean
# Apply this function during trials
# This ain't working!!!
# def remove_outliers(df_tmp):
#     df_features_tmp = df_tmp.iloc[:, :-1].copy()
#     mean, std = df_features_tmp.mean(), df_features_tmp.std()
#     lower, upper = mean - 3 * std, mean + 3 * std
#     lower, upper = np.expand_dims(lower, axis=0), np.expand_dims(upper, axis=0)
#     lower_indexes = df_features_tmp[df_features_tmp < lower].index
#     upper_indexes = df_features_tmp[df_features_tmp > upper].index
#     original_m = df_tmp.shape[0]
#     print("Outliers:      ", lower_indexes.size + upper_indexes.size)
#     return df_tmp.iloc[(~lower_indexes) & (~upper_indexes), :]

## Native Functions for Data Processing

In [17]:
def normalize_z(df):
    return (df - df.mean(axis=0))/df.std(axis=0)

def get_features_targets(df, feature_names, target_names):
    df_feature = df[feature_names]
    df_target = df[target_names]
    return df_feature, df_target

def prepare_feature(df_feature):
    const = np.full(shape=(df_feature.shape[0], 1), fill_value=1)
    conv = np.array(df_feature)
    return np.concatenate((const, conv), axis=1)

def prepare_target(df_target):
    return np.array(df_target)

def predict(df_feature, beta):
    arr_feature_norm = prepare_feature(normalize_z(df_feature))
    return calc_linear(arr_feature_norm, beta)

def calc_linear(X, beta):
    return np.matmul(X, beta)

def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    if random_state != None:
        np.random.seed(random_state)
    indexes = df_feature.index
    k = int(len(indexes) * test_size)
    test_index = set(np.random.choice(a=indexes, size=k, replace=False))
    train_index = set(indexes) - test_index

    df_feature_train = df_feature.loc[train_index, :]
    df_feature_test = df_feature.loc[test_index, :]
    df_target_train = df_target.loc[train_index, :]
    df_target_test = df_target.loc[test_index, :]
    return df_feature_train, df_feature_test, df_target_train, df_target_test
  
def r2_score(y, ypred):
    y_bar = (1/len(y))*(np.sum(y, axis=0))
    SS_tot = np.sum((y - y_bar)**2, axis=0)
    SS_res = np.sum((y - ypred)**2, axis=0)
    return (1 - (SS_res/SS_tot))

def mean_squared_error(target, pred):
    return ( (1/len(target)) * (np.sum((target - pred)**2, axis=0)))

In [18]:
# Preparing the df_features & df_target



### Trial 1
Input: 16 variables (format: Var_Year) <br>
Output: 1 variable (undernourishment in 2020)

In [19]:
# Leave output column as prevalence of undernourishment in 2020
df_proc = pd.concat([df.iloc[:, :-5], df.iloc[:, -1]], axis=1)
print(df_proc.shape)
# Remove all avg columns
df_proc = df_proc.loc[:, ~df_proc.columns.str.contains('Avg')]
print(df_proc.shape)

(152, 45)
(152, 39)


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove first 3 columns i.e. M49 Code etc
df_features, df_label = df_proc.iloc[:, 3:-1], df_proc.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("r2_score", r2_score(y_test, y_pred))

MSE:  337.47731723887665
r2_score -2.403062858721497


### Trial 2
Input: 4 variables (format: Var_Avg) <br>
Output: 1 variable (undernourishment in 2020)

In [21]:
# Remove all columns but avg columns
df_proc = df.loc[:, ['ISO3_Code', 'Country_Name', 'M49_Code', "Life_Expectancy_Avg", "Mean_Years_of_Schooling_Avg", "CPI_Food_Avg", 'Prevalence_of_undernourishment_2020']]
print(df_proc.columns)

Index(['ISO3_Code', 'Country_Name', 'M49_Code', 'Life_Expectancy_Avg',
       'Mean_Years_of_Schooling_Avg', 'CPI_Food_Avg',
       'Prevalence_of_undernourishment_2020'],
      dtype='object')


In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove first 3 columns i.e. M49 Code etc
df_features, df_label = df_proc.iloc[:, 3:-1], df_proc.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:           ", mean_squared_error(y_test, y_pred))
print("MAE (%):       ", mean_absolute_percentage_error(y_test, y_pred))
print("r2_score:      ", r2_score(y_test, y_pred))
r2, n, p = r2_score(y_test, y_pred), y_test.shape[0], X_test.shape[1]
adj = 1 - (1 - r2) * (n-1) / (n-p-1)
print("Adj r2_score:  ", adj)

MSE:            51.008948818955815
MAE (%):        1.0758083312283153
r2_score:       0.4856345883971088
Adj r2_score:   0.4284828759967876


### Trial 3
Input: 20 variables (format: Var_Year & Var_Avg) <br>
Output: 1 variable (undernourishment in 2020)

In [23]:
# Leave output column as prevalence of undernourishment in 2020
df_proc = pd.concat([df.iloc[:, :-5], df.iloc[:, -1]], axis=1)
print(df_proc.shape)

(152, 45)


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove first 3 columns i.e. M49 Code etc
df_features, df_label = df_proc.iloc[:, 3:-1], df_proc.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:           ", mean_squared_error(y_test, y_pred))
print("MAE (%):       ", mean_absolute_percentage_error(y_test, y_pred))
print("r2_score:      ", r2_score(y_test, y_pred))
r2, n, p = r2_score(y_test, y_pred), y_test.shape[0], X_test.shape[1]
adj = 1 - (1 - r2) * (n-1) / (n-p-1)
print("Adj r2_score:  ", adj)

MSE:            333.39365805365674
MAE (%):        0.9648724238336495
r2_score:       -2.3618839462701384
Adj r2_score:   10.16877439891856


### Trial 4
Input: 4 variables (format: Var_20xx) <br>
Output: 1 variable (undernourishment in 20xx)

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove all columns but 20xx
years = [2016, 2017, 2018, 2019, 2020]
for i, year in enumerate(years):
    df_proc = df.loc[:, df.columns.str.contains(str(year))]
    print("Year:          ", year)

    df_features, df_label = df_proc.iloc[:, :-1], df_proc.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
    model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("MSE:           ", mean_squared_error(y_test, y_pred))
    print("MAE (%):       ", mean_absolute_percentage_error(y_test, y_pred))
    print("r2_score:      ", r2_score(y_test, y_pred))
    r2, n, p = r2_score(y_test, y_pred), y_test.shape[0], X_test.shape[1]
    adj = 1 - (1 - r2) * (n-1) / (n-p-1)
    print("Adj r2_score:  ", adj)

Year:           2016
MSE:            49.162474803173716
MAE (%):        0.7621796147752439
r2_score:       0.38511059275386683
Adj r2_score:   0.19797033837460898
Year:           2017
MSE:            50.41335089860596
MAE (%):        0.7168582345908503
r2_score:       0.36907169951509733
Adj r2_score:   0.1770500428457792
Year:           2018
MSE:            52.285760825174364
MAE (%):        0.7316064014780843
r2_score:       0.35101159690582096
Adj r2_score:   0.15349338726846218
Year:           2019
MSE:            53.11864685611048
MAE (%):        0.6612872960271488
r2_score:       0.39259776024187365
Adj r2_score:   0.20773620901113954
Year:           2020
MSE:            51.304896188000086
MAE (%):        0.6596025723516865
r2_score:       0.48265030634825623
Adj r2_score:   0.325196051758595


### Trial 5
Input: 3 variables (format: Var_2020) <br>
Output: 1 variable (undernourishment in 2020)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove all columns but 2020
df_proc = df.loc[:, df.columns.str.contains("2020")]
columns = df_proc.columns[:-1]
for i, column in enumerate(columns):
    df_tmp = df_proc.loc[:, ~df_proc.columns.str.contains(column)]
    print("Dropped col:   ", column)

    # Remove outliers
    # NOt working yet
    # df_tmp = remove_outliers(df_tmp)

    df_features, df_label = df_tmp.iloc[:, :-1], df_tmp.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
    model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("MSE:           ", mean_squared_error(y_test, y_pred), "      MAE:           ", mean_absolute_error(y_test, y_pred))
    print("MAE (%):       ", mean_absolute_percentage_error(y_test, y_pred), "      r2_score:      ", r2_score(y_test, y_pred))
    r2, n, p = r2_score(y_test, y_pred), y_test.shape[0], X_test.shape[1]
    adj = 1 - (1 - r2) * (n-1) / (n-p-1)
    print("Adj r2_score:  ", adj)

Dropped col:    Life_Expectancy_2020
MSE:            51.415684863556905       MAE:            4.991210966699863
MAE (%):        0.6021373561690883       r2_score:       0.4815331325185037
Adj r2_score:   0.3519164156481297
Dropped col:    Mean_Years_Of_Schooling_2020
MSE:            55.24064992276049       MAE:            5.36979363135765
MAE (%):        0.7420048762976054       r2_score:       0.44296284686084675
Adj r2_score:   0.30370355857605846
Dropped col:    CPI_Food_2020
MSE:            53.220502574849405       MAE:            5.247942045461805
MAE (%):        0.7194883078760532       r2_score:       0.4633336630836
Adj r2_score:   0.32916707885450014
Dropped col:    Net_FDI_2020
MSE:            51.22362870172242       MAE:            5.057319121941448
MAE (%):        0.6419040150219391       r2_score:       0.4834697936146476
Adj r2_score:   0.3543372420183095
Dropped col:    Unemployment_2020
MSE:            54.12191985993046       MAE:            5.291247206840111
MAE (%):  

### Trial 6
Input: Combinations of variables (format: Var_2020) <br>
Output: 1 variable (undernourishment in 2020)

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from itertools import combinations
from pprint import pprint

# Remove all columns but 2020
df_proc = df.loc[:, df.columns.str.contains("2020")]
columns = df_proc.columns[:-1].values
combis = []
res = {}
fullres = {}
for i in range(2, len(columns)):
    combis += list(combinations(columns, i))
for i, combi in enumerate(combis):
    df_tmp = pd.concat([df_proc.loc[:, combi], df_proc.iloc[:, -1]], axis=1)

    # Remove outliers
    # NOt working yet
    # df_tmp = remove_outliers(df_tmp)

    df_features, df_label = df_tmp.iloc[:, :-1], df_tmp.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(df_features, df_label, test_size = 0.2, random_state=100)
    model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #print("MSE:           ", mean_squared_error(y_test, y_pred), "      MAE:           ", mean_absolute_error(y_test, y_pred))
    #print("MAE (%):       ", mean_absolute_percentage_error(y_test, y_pred), "      r2_score:      ", r2_score(y_test, y_pred))
    r2, n, p = r2_score(y_test, y_pred), y_test.shape[0], X_test.shape[1]
    adj = 1 - (1 - r2) * (n-1) / (n-p-1)
    #print("Adj r2_score:  ", adj)
    fullres[combi] = {"MAE": mean_absolute_error(y_test, y_pred), "ADJ_R2": adj}
    res[combi] = adj
res = {k: v for k, v in sorted(res.items(), key=lambda item: item[1], reverse=True)}
for key in list(res.keys())[:5]:
    print(key)
    print(fullres[key])
    print()

('Mean_Years_Of_Schooling_2020', 'Unemployment_2020')
{'MAE': 4.550100289033643, 'ADJ_R2': 0.5945073478416296}

('Mean_Years_Of_Schooling_2020', 'CPI_Food_2020', 'Unemployment_2020')
{'MAE': 4.497087496935168, 'ADJ_R2': 0.5833501305886548}

('Mean_Years_Of_Schooling_2020', 'Net_FDI_2020', 'Unemployment_2020')
{'MAE': 4.506582231092428, 'ADJ_R2': 0.5819942072283368}

('Mean_Years_Of_Schooling_2020', 'Unemployment_2020', 'Access_To_Electricity_2020')
{'MAE': 4.500001813584692, 'ADJ_R2': 0.5699071820397457}

('Mean_Years_Of_Schooling_2020', 'CPI_Food_2020', 'Net_FDI_2020', 'Unemployment_2020')
{'MAE': 4.436365677918919, 'ADJ_R2': 0.5698615018806678}

