In order to run the Counterfactual Explanations we need a classification problem. Therefore we load the original data and create a classification problem by binning the variables. We change the data according to this scheme: each variable at each time step is 1 if it is strictly above the mean of the same variable across all patients at the same time step; otherwise it is 0. Since we are dealing with time series and our true causal graph only has effects from the previous time step, we have to create those lagged values for each sample.

In [2]:
import numpy as np
import pandas as pd
import pickle, os 

In [3]:
# Read the pickled array of patient time series.
# NOTE(review): the indexing below assumes a 3-D layout of
# (patients, timesteps, variables) with four variables -- confirm against
# whatever produced the pickle.
data_dir = "tmp_data"
with open(os.path.join(data_dir, "tsce_causalhans.pkl"), "rb") as f:
    extended_data = pickle.load(f)

# Mean over axis 0, i.e. one mean per (timestep, variable) pair.
mean_values = np.mean(extended_data, axis=0)

# Binarise: 1 where a value is strictly greater than its mean, 0 otherwise
# (a value exactly equal to the mean therefore maps to 0).
binary_extended_data = np.where(extended_data > mean_values, 1, 0)

# Lagged copy: position t along axis 1 holds the binary values of t - 1.
# Index 0 has no predecessor and stays NaN so that row can be dropped later.
lagged_data = np.full(binary_extended_data.shape, np.nan)
lagged_data[:, 1:, :] = binary_extended_data[:, :-1, :]

# Put lagged and current values side by side along the last axis.
combined_data = np.concatenate([lagged_data, binary_extended_data], axis=2)

# Column order mirrors the concatenation: lagged variables first, then current.
columns = [
    "Lagged_Age",
    "Lagged_Nutrition",
    "Lagged_Health",
    "Lagged_Mobility",
    "Age",
    "Nutrition",
    "Health",
    "Mobility",
]

# One frame per entry along axis 0; dropna() removes the NaN row that the
# lag leaves at the first timestep.
final_data = [
    pd.DataFrame(patient, columns=columns).dropna()
    for patient in combined_data
]

# Stack all per-patient frames into one table with a fresh running index.
all_patients_df = pd.concat(final_data, ignore_index=True)

# Persist the result both as a pickle and as a CSV.
all_patients_df.to_pickle(os.path.join(data_dir, "counterfactual_tssce_tst.pkl"))
all_patients_df.to_csv(os.path.join(data_dir, "shapley_tssce_classification.csv"))