In [None]:
# import the libraries used throughout the notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import statsmodels.api as sm
import xgboost as xgb
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Notebook-wide constants.
# NOTE(review): CONSTRAINT is not referenced by any code visible in this part
# of the notebook — confirm it is still needed.
CONSTRAINT: float = 0.3
# Exclusive upper bound on the user numbers visited by the per-user SHAP loop.
MAX_USERS: int = 50

# read the dataframe (path is relative to the notebook's working directory)
data = pd.read_csv('data.csv')

In [None]:
# discard every row that contains at least one missing value
data = data.dropna()

In [None]:
# separate the predictor columns from the regression target
# NOTE(review): the positional slice assumes the last two columns of `data`
# are not predictors — confirm against the CSV layout.
X = data.iloc[:, :-2]
y = data.loc[:, "symptom_value"]

In [None]:
# hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# wrap the training features and target in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [None]:
# booster hyper-parameters: depth-3 trees, learning rate 0.1, squared-error regression
params = dict(max_depth=3, eta=0.1, objective='reg:squarederror')

# fit the booster; num_boost_round is spelled out at xgboost's default value
# so the number of boosting rounds is visible to the reader
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=10)

In [None]:
# score the held-out rows with the trained booster
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)

# report the coefficient of determination on the test set
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared score:", r2)

In [None]:
# create an explainer object for SHAP
# The explainer has to wrap the trained booster: `y_pred` is only an array of
# predictions, not a model or callable, so SHAP cannot attribute anything to
# it. `X` is passed as the background data, as in the original call.
explainer = shap.Explainer(model, X)

In [None]:
# calculate SHAP values for each feature for each instance
# The result's `.values` array is what the following cells slice by column
# and sum over axis 0.
shap_values = explainer(X)

In [None]:
# column-wise total of the SHAP values over all instances, leaving out the
# last two feature columns
# NOTE(review): X was already built without the last two columns of `data`;
# confirm that this second `:-2` trim is intended.
food_shap_values = shap_values.values[:, :-2]
food_shap_sum = food_shap_values.sum(axis=0)

In [None]:
# keep track of the total SHAP value for each food category across all users
# Zero-initialised accumulator with the same shape and dtype as food_shap_sum.
# `np` must be bound to numpy by the notebook's import cell.
food_shap_total = np.zeros_like(food_shap_sum)

In [None]:

# Accumulate per-user SHAP totals for user numbers 0 .. MAX_USERS - 1.
# NOTE(review): assumes user numbers are 0-based integers; users numbered
# >= MAX_USERS are not included — confirm against the data.

# reset the accumulator so that re-running this cell does not double count
food_shap_total.fill(0)

for i in range(MAX_USERS):
    df = data[data['user_number'] == i]

    # a user number with no rows contributes nothing; skip it rather than
    # handing the explainer an empty frame
    if df.empty:
        continue

    X_user = df.iloc[:,:-2]

    # calculate SHAP values for the user's data
    shap_values_user = explainer(X_user)

    # accumulate the SHAP values for each food category across all users
    food_shap_total += shap_values_user.values[:,:-2].sum(axis=0)

In [None]:
# bar chart of the accumulated SHAP value per food category
food_labels = X.columns[:-2]
fig, ax = plt.subplots()
ax.bar(food_labels, food_shap_total)
ax.set_xlabel('Food categories')
ax.set_ylabel('Total SHAP value')
plt.show()
