In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from xgboost import XGBRegressor
import matplotlib.pyplot as plt

### Importing data

In [2]:
# Read the four cleaned tables (fat, kcal, kg, protein) in one pass;
# the paths are listed in the same order as the names they are bound to.
fat_df, kcal_df, kg_df, protein_df = map(
    pd.read_csv,
    (
        'data/data_clean/fat_data_clean.csv',
        'data/data_clean/kcal_data_clean.csv',
        'data/data_clean/kg_data_clean.csv',
        'data/data_clean/protein_data_clean.csv',
    ),
)

## Identifying Multicollinearity in Data

In [3]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [4]:
# fat_df is used only as a preliminary dataset for the multicollinearity check.
# errors='ignore' keeps this cell re-runnable: fat_df is rebound here, so a
# second execution would otherwise raise KeyError because 'Country' is already gone.
fat_df = fat_df.drop(columns=['Country'], errors='ignore')

# Features: every column except the last six.
# NOTE(review): presumably the last six are the non-food columns
# (Undernourished ... Population, as in the kcal table) - confirm for fat_df.
X = fat_df.iloc[:, :-6]
X_col = X.copy()  # unscaled copy, kept for its column names
X = MinMaxScaler().fit_transform(X)  # X is rebound to the scaler's output from here on
y = fat_df['Deaths']


# Result table: one row per feature; the "VIF" column is filled in by the next cell.
vif_data = pd.DataFrame()
vif_data["feature"] = X_col.columns

In [5]:
# Training-test split
# NOTE(review): the four split variables are not used by the VIF calculation below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 5)

# Variance inflation factor of each feature column.
# X is passed exactly as prepared in the previous cell (MinMax-scaled);
# no intercept column is added before calling variance_inflation_factor.
n_features = len(X_col.columns)
vif_values = []
for col_idx in range(n_features):
    vif_values.append(variance_inflation_factor(X, col_idx))
vif_data["VIF"] = vif_values

In [6]:
# Observations recorded from the VIF table:
# - Vegetable Oils is highly correlated with the Vegetal Products attribute
# - Milk and Animal Fats may be correlated with, and fit within, Animal Products
vif_data.sort_values(by='VIF')

Unnamed: 0,feature,VIF
18,Sugar & Sweeteners_f,1.894525
3,"Aquatic Products, Other_f",3.423765
0,Alcoholic Beverages_f,3.743534
23,Obesity,8.263887
17,Sugar Crops_f,10.79844
9,Miscellaneous_f,132.1336
11,Offals_f,1421.459
22,Vegetables_f,2050.995
15,Starchy Roots_f,3085.332
13,Pulses_f,3408.131


## Using XGBoost for Feature Importance

### Using Food Supply in Kcal dataset

In [7]:
# Drop 'Country' (categorical, provides little information) together with 'Obesity'.
# NOTE(review): the notebook states no reason for dropping 'Obesity' - confirm it is intended.
# errors='ignore' keeps the cell re-runnable: kcal_df is rebound here, so a second
# execution would otherwise raise KeyError on the already-removed columns.
kcal_df = kcal_df.drop(columns=['Country', 'Obesity'], errors='ignore')
kcal_df.describe()

Unnamed: 0,Alcoholic Beverages_kcal,Animal Products_kcal,Animal fats_kcal,"Aquatic Products, Other_kcal",Cereals - Excluding Beer_kcal,Eggs_kcal,"Fish, Seafood_kcal",Fruits - Excluding Wine_kcal,Meat_kcal,Milk - Excluding Butter_kcal,...,Treenuts_kcal,Vegetal Products_kcal,Vegetable Oils_kcal,Vegetables_kcal,Undernourished,Confirmed,Deaths,Recovered,Active,Population
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,...,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,1.341392,9.25968,1.276999,0.002888,20.319704,0.42854,0.607802,2.030773,3.842554,2.956138,...,0.26436,40.741763,4.898168,1.082252,10.902927,2.021972,0.03937,1.452356,0.528685,45620760.0
std,1.068834,4.766646,1.299003,0.031438,6.490782,0.308497,0.549763,1.437449,2.198052,2.024847,...,0.28935,4.766396,2.182019,0.653259,11.527729,2.360454,0.048718,1.927617,1.32236,159116500.0
min,0.0,1.6237,0.0,0.0,8.9565,0.0188,0.0,0.1471,0.298,0.1169,...,0.0,27.7089,0.9325,0.0957,2.5,0.000312,0.0,0.0,0.0,54000.0
25%,0.3889,5.03195,0.34975,0.0,14.7346,0.13935,0.2375,1.210025,2.06485,1.11205,...,0.044125,36.841775,3.1079,0.602,2.5,0.140976,0.002013,0.099107,0.010451,2926500.0
50%,1.2446,8.93325,0.8775,0.0,19.61995,0.39915,0.46675,1.7461,3.6688,2.76645,...,0.17605,41.0627,4.6828,1.0013,6.25,1.01157,0.011998,0.475402,0.074515,10316000.0
75%,2.04225,13.160125,1.7769,0.0,24.82855,0.634925,0.86195,2.386525,5.161625,4.358275,...,0.3942,44.9856,6.538,1.34985,13.675,3.487069,0.069503,2.622874,0.34788,32791750.0
max,5.1574,22.2911,7.8007,0.4007,37.5265,1.4461,4.4183,8.854,10.5674,9.9441,...,1.421,48.3864,10.3839,3.3524,59.599998,10.408199,0.185428,9.039871,8.019819,1402385000.0


##### Feature Importance for Confirmed Attribute

In [8]:
# Target of interest: the 'Confirmed' column of the kcal table.
y = kcal_df['Confirmed']

# Features: everything except the trailing six columns (per the describe() output:
# Undernourished, Confirmed, Deaths, Recovered, Active, Population). Left unscaled.
X = kcal_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Confirmed (kcal dataset)")
plt.show()

##### Feature Importance for Deaths Attribute

In [None]:
# Target: 'Deaths', multiplied by 100.
# Author's stated reason: the raw values are very small and XGBoost had trouble
# learning from them. Acceptable here because the model is only used to rank
# features (the author would not do this for a prediction model).
y = kcal_df['Deaths']*100

# Features: everything except the trailing six columns of kcal_df.
X = kcal_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Deaths (kcal dataset)")
plt.show()

##### Feature Importance for Recovered Attribute

In [None]:
# Features: everything except the trailing six columns of kcal_df (left unscaled).
X = kcal_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

# Target of interest: the 'Recovered' column; its summary statistics are the cell output.
y = kcal_df['Recovered']
y.describe()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost.
# Only gamma is searched for this target; max_depth and min_child_weight
# (searched for the other targets) are intentionally left out here.
params = {'gamma': [0.5, 1, 1.5, 2, 5]}

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Recovered (kcal dataset)")
plt.show()

### Using Food Supply in Kg dataset

In [None]:
# Drop 'Country' (categorical, provides little information) together with 'Obesity'.
# NOTE(review): the notebook states no reason for dropping 'Obesity' - confirm it is intended.
# errors='ignore' keeps the cell re-runnable: kg_df is rebound here, so a second
# execution would otherwise raise KeyError on the already-removed columns.
kg_df = kg_df.drop(columns=['Country', 'Obesity'], errors='ignore')
kg_df

##### Feature Importance for Confirmed Attribute

In [None]:
# Target of interest: the 'Confirmed' column of the kg table.
y = kg_df['Confirmed']

# Features: everything except the trailing six columns of kg_df.
# NOTE(review): presumably the same six non-food columns as in the kcal table - confirm.
X = kg_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Confirmed (kg dataset)")
plt.show()

##### Feature Importance for Deaths Attribute

In [None]:
# Target: the 'Deaths' column of the kg table, multiplied by 100 before fitting.
y = kg_df['Deaths']*100

# Features: everything except the trailing six columns of kg_df.
X = kg_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Deaths (kg dataset)")
plt.show()

##### Feature Importance for Recovered Attribute

In [None]:
# Target of interest: the 'Recovered' column of the kg table.
y = kg_df['Recovered']

# Features: everything except the trailing six columns of kg_df.
X = kg_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost.
# Only gamma is searched for this target; max_depth and min_child_weight
# (searched for the other targets) are intentionally left out here.
params = {'gamma': [0.5, 1, 1.5, 2, 5]}

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Recovered (kg dataset)")
plt.show()

### Using Protein Supply dataset

In [None]:
# Drop 'Country' (categorical, provides little information) together with 'Obesity'.
# NOTE(review): the notebook states no reason for dropping 'Obesity' - confirm it is intended.
# errors='ignore' keeps the cell re-runnable: protein_df is rebound here, so a second
# execution would otherwise raise KeyError on the already-removed columns.
protein_df = protein_df.drop(columns=['Country', 'Obesity'], errors='ignore')
protein_df

##### Feature Importance for Confirmed Attribute

In [None]:
# Target of interest: the 'Confirmed' column of the protein table.
y = protein_df['Confirmed']

# Features: everything except the trailing six columns of protein_df.
# NOTE(review): presumably the same six non-food columns as in the kcal table - confirm.
X = protein_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Confirmed (protein dataset)")
plt.show()

##### Feature Importance for Deaths Attribute

In [None]:
# Target: the 'Deaths' column of the protein table, multiplied by 100 before fitting.
y = protein_df['Deaths']*100

# Features: everything except the trailing six columns of protein_df.
X = protein_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost
params = {'gamma': [0.5, 1, 1.5, 2, 5],
          'max_depth': [3, 4, 5],
          'min_child_weight': [1, 2, 5, 10]
         }

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Deaths (protein dataset)")
plt.show()

##### Feature Importance for Recovered Attribute

In [None]:
# Target of interest: the 'Recovered' column of the protein table.
y = protein_df['Recovered']

# Features: everything except the trailing six columns of protein_df.
X = protein_df.iloc[:, :-6]
# Copy retained so later cells can read the feature names from X_col.columns.
X_col = X.copy()

In [None]:
# Training-test split
# NOTE(review): the four split variables are not used by any cell visible in this
# notebook; the grid search and the cross-validation below both run on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Outer folds for cross-validation (10 folds, default settings, so no shuffling)
kfold = KFold(n_splits=10)

# Parameter grid for XGBoost.
# Only gamma is searched for this target; max_depth and min_child_weight
# (searched for the other targets) are intentionally left out here.
params = {'gamma': [0.5, 1, 1.5, 2, 5]}

# Create XGBoost model instance
xgb = XGBRegressor()

# Grid search, fitted on all data so best_estimator_ is available for the
# feature-importance cell that follows.
xgb_gs = GridSearchCV(xgb, param_grid=params)
xgb_gs.fit(X, y)

# Nested cross-validation of the whole grid search.
# One cross_validate call scores every metric from a single fit per outer fold,
# instead of re-running the complete grid search once per metric.
scoring = ['neg_mean_squared_error', 'neg_root_mean_squared_error',
           'neg_mean_absolute_error', 'r2']
cv_scores = cross_validate(xgb_gs, X, y, scoring=scoring, cv=kfold)
mse = cv_scores['test_neg_mean_squared_error']
rmse = cv_scores['test_neg_root_mean_squared_error']
mae = cv_scores['test_neg_mean_absolute_error']
r2 = cv_scores['test_r2']

# The error scorers are negated ("neg_*"), so the sign is flipped for reporting.
print(f"MSE: {-mse.mean()} ({mse.std()})")
print(f"RMSE: {-rmse.mean()} ({rmse.std()})")
print(f"MAE: {-mae.mean()} ({mae.std()})")
print(f"R2: {r2.mean()} ({r2.std()})")

In [None]:
# Feature importances of the best model found by the grid search
importance = xgb_gs.best_estimator_.feature_importances_

print("\nFeature Importances:")
for feat_name, feat_score in zip(X_col.columns, importance):
    print(f"{feat_score:10.4f} - {feat_name}")

# (feature, importance) pairs, most important first
feat_importance = sorted(zip(X_col.columns, importance), key=lambda pair: pair[1], reverse=True)

# Labelled bar chart so the figure can be identified on its own
bar_names, bar_heights = zip(*feat_importance)
plt.bar(bar_names, bar_heights)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("XGBoost feature importance: Recovered (protein dataset)")
plt.show()