In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import dice_ml
from dice_ml.utils import helpers # helper functions
from dice_ml import Data,Model,Dice
import numpy as np
from xgboost import XGBClassifier





In [4]:
# Load the heart-disease table, discard rows with missing values, and
# discard rows whose cholesterol reading is exactly 0.
dataframe_heart_disease = (
    pd.read_csv("heart_statlog_cleveland_hungary_final.csv")
    .dropna()
    .loc[lambda frame: frame['chol'] != 0]
)

In [5]:
# Separate the feature columns (X) from the 'target' label column (y).
X = dataframe_heart_disease.drop(columns=['target'])
y = dataframe_heart_disease['target']

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Columns treated as continuous; they are standard-scaled downstream.
numerical = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Every remaining feature column is treated as categorical.
categorical = X_train.columns.difference(numerical)

In [8]:
# Preprocessing: standard-scale the numeric columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time).
numeric_transformer = Pipeline([('scaler', StandardScaler())])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
transformations = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
])


In [9]:
# Full prediction pipeline: preprocessing followed by an XGBoost classifier
# with default hyperparameters.
clf = Pipeline([
    ('preprocessor', transformations),
    ('classifier', XGBClassifier()),
])

# Baseline fit on the training split; xgb_model is what fit() returns.
xgb_model = clf.fit(X_train, y_train)


In [10]:
# Hyperparameter grid for the 'classifier' pipeline step
# (the 'classifier__' prefix routes each key to that step).
param_grid = dict(
    classifier__max_depth=[3, 4, 5],
    classifier__learning_rate=[0.01, 0.1, 0.5],
    classifier__n_estimators=[100, 200, 300],
    classifier__gamma=[0, 0.1, 0.5],
)


In [11]:
# Exhaustive search over param_grid using GridSearchCV's default
# cross-validation and scoring settings.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid)
grid_search.fit(X_train, y_train)

# Report the winning combination and its score on the held-out test split.
best_params = grid_search.best_params_
test_score = grid_search.score(X_test, y_test)
print("Best parameters: ", best_params)
print("Test set score: ", test_score)

Best parameters:  {'classifier__gamma': 0, 'classifier__learning_rate': 0.5, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
Test set score:  0.9166666666666666


In [12]:
# Final pipeline: same preprocessing, XGBoost with fixed hyperparameters.
# NOTE(review): these values match the "Best parameters" printed by the grid
# search cell; they are hard-coded here, so re-check them if the grid or the
# data changes.
pipeline = Pipeline([
    ('preprocessor', transformations),
    ('classifier', XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.5,
        gamma=0,
    )),
])


In [13]:
# Fit the tuned pipeline on the full training split.
xgb_pipeline = pipeline.fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred = xgb_pipeline.predict(X_test)


In [14]:
# True positives: test rows whose label is 1 and that the model also predicts
# as 1. The index is renumbered from 0 and the old index is discarded.
X_high_risk_tp = X_test.loc[(y_test == 1) & (y_pred == 1)].reset_index(drop=True)

In [15]:
# Training features and label side by side, as DiCE expects a single frame.
train_data = pd.concat([X_train, y_train], axis=1)

# DiCE data object: columns ordered as in the original table, with the
# continuous features and the outcome column declared explicitly.
d = Data(
    dataframe=pd.DataFrame(train_data, columns=dataframe_heart_disease.columns),
    continuous_features=['age', 'trestbps', 'chol', 'thalach', 'oldpeak'],
    outcome_name='target',
)

# DiCE model object wrapping the fitted scikit-learn pipeline.
m = Model(model=xgb_pipeline, backend="sklearn")

Ideal Counterfactuals

In [17]:
import threading
import queue

class TimeoutException(Exception):
    """Exception type reserved for counterfactual-search timeouts.

    NOTE(review): nothing in the visible cells raises or catches this class;
    timeouts are currently reported by printing and returning None.
    """

def generate_cf(test_instance, timeout=120):
    """Search for one "ideal" counterfactual for a single query row.

    Runs DiCE's genetic method in a background thread so the caller can give
    up after ``timeout`` seconds. Only ``trestbps``, ``chol`` and ``thalach``
    may vary, within fixed ranges: trestbps 80-120, chol 150-200, and thalach
    from 120 up to ``220 - age`` of the query row (presumably the
    age-predicted maximum heart rate heuristic).

    Parameters
    ----------
    test_instance : one-row DataFrame
        The query instance; its ``age`` value is read to bound ``thalach``.
    timeout : float, default 120
        Seconds to wait for the search before giving up.

    Returns
    -------
    The object returned by ``generate_counterfactuals``, or ``None`` when the
    search raised an exception or did not finish within ``timeout``.
    """
    q = queue.Queue()

    def target():
        try:
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance,
                total_CFs=1,
                desired_class="opposite",
                features_to_vary=["trestbps", "chol", "thalach"],
                diversity_weight=5, proximity_weight=2, sparsity_weight=5,
                permitted_range={
                    "trestbps": [80, 120],
                    "chol": [150, 200],
                    "thalach": [120, 220 - test_instance['age'].values[0]],
                },
            )
            q.put(result)
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Surface the real cause so configuration or data errors are not
            # mistaken for "the search found nothing".
            print(f"  reason: {type(e).__name__}: {e}")
            q.put(None)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the process alive once the caller has moved on.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()

    # Wait for the search to finish, up to `timeout` seconds.
    thread.join(timeout)

    if thread.is_alive():
        # Still running: report the timeout and signal "no result" with None.
        print("No counterfactuals found for test instance-timed out:", test_instance)
        return None

    # The thread has finished; a non-blocking get avoids hanging forever if it
    # died without enqueuing anything.
    try:
        return q.get_nowait()
    except queue.Empty:
        return None

In [18]:
import time
import threading
from joblib import Parallel, delayed
exps_ideal = []
num_cores = -1
# Empty placeholder with the feature columns; a later cell reassigns it.
df_no_counterfactuals = pd.DataFrame(columns=X_high_risk_tp.columns)
# For each true-positive row, run generate_cf through joblib and store the
# one-element result list (its entry is None when no counterfactual came back).
for i in range(len(X_high_risk_tp)):
    print(i)
    test_instance = X_high_risk_tp.iloc[[i]]
    runner = Parallel(n_jobs=num_cores)
    exp = runner([delayed(generate_cf)(test_instance)])
    exps_ideal.append(exp)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96


In [19]:
# Inspect the per-instance results: one single-element list per row, holding
# either a CounterfactualExplanations object or None.
exps_ideal

[[<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278ab1da90>],
 [None],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278bf93f90>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278afaa0d0>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278af96310>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278afa9fd0>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278af96010>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278af75b10>],
 [None],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278bf90050>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278bfa2390>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278af6d810>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278bfdb250>],
 [None],
 [None],
 [<dice_ml.counter

In [None]:
# `exp` is the one-element list returned by Parallel for the most recently
# processed instance, so the explanation object is exp[0]. That entry is None
# when the search failed or timed out, in which case there is no frame to show.
exp_dfs = None if exp[0] is None else exp[0].cf_examples_list[0].final_cfs_df

In [20]:
# Keep the true-positive rows whose ideal-counterfactual search returned None.
df_no_counterfactuals = X_high_risk_tp.loc[
    [result[0] is None for result in exps_ideal]
]

In [21]:
# Display the instances for which no ideal counterfactual was returned.
df_no_counterfactuals

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope
1,50,1,3,140,233,0,0,163,0,0.6,2
8,70,1,4,145,174,0,0,125,1,2.6,3
13,59,1,4,170,326,0,2,140,1,3.4,3
14,43,1,4,150,247,0,0,130,1,2.0,2
18,62,1,4,139,170,0,1,120,1,3.0,2
20,54,1,4,125,216,0,0,140,0,0.0,2
27,57,1,4,130,131,0,0,115,1,1.2,2
30,51,1,3,135,160,0,0,150,0,2.0,2
31,57,1,4,140,214,0,1,144,1,2.0,2
45,60,1,4,130,186,1,1,140,1,0.5,2


In [22]:
# Alias (not a copy): the rows without an ideal counterfactual become the
# input of the "feasible" search below.
X_test_no_ideal_cf = df_no_counterfactuals

Feasible Counterfactuals - for the instances that did not have ideal counterfactuals


In [31]:
import threading
import queue

class TimeoutException(Exception):
    """Exception type reserved for counterfactual-search timeouts.

    NOTE(review): nothing in the visible cells raises or catches this class;
    timeouts are currently reported by printing and returning None.
    """

def generate_cf_feasible(test_instance, timeout=300):
    """Search for up to five "feasible" counterfactuals for one query row.

    Runs DiCE's genetic method in a background thread so the caller can give
    up after ``timeout`` seconds. Only ``trestbps``, ``chol`` and ``thalach``
    may vary: trestbps within 80-149, chol from 100 up to 90% of the query
    row's own chol, and thalach from 120 up to ``220 - age`` of the query row
    (presumably the age-predicted maximum heart rate heuristic).

    Parameters
    ----------
    test_instance : one-row DataFrame
        The query instance; its ``chol`` and ``age`` values are read to build
        the permitted ranges.
    timeout : float, default 300
        Seconds to wait for the search before giving up.

    Returns
    -------
    The object returned by ``generate_counterfactuals``, or ``None`` when the
    search raised an exception or did not finish within ``timeout``.
    """
    q = queue.Queue()

    def target():
        try:
            chol_now = test_instance['chol'].values[0]
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance,
                total_CFs=5,
                desired_class="opposite",
                features_to_vary=["trestbps", "chol", "thalach"],
                diversity_weight=5, proximity_weight=2, sparsity_weight=5,
                permitted_range={
                    "trestbps": [80, 149],
                    # Upper bound: a 10% reduction from the current reading.
                    "chol": [100, chol_now - 0.1 * chol_now],
                    "thalach": [120, 220 - test_instance['age'].values[0]],
                },
            )
            q.put(result)
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Surface the real cause so configuration or data errors are not
            # mistaken for "the search found nothing".
            print(f"  reason: {type(e).__name__}: {e}")
            q.put(None)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the process alive once the caller has moved on.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()

    # Wait for the search to finish, up to `timeout` seconds.
    thread.join(timeout)

    if thread.is_alive():
        # Still running: report the timeout and signal "no result" with None.
        print("No counterfactuals found for test instance-timed out:", test_instance)
        return None

    # The thread has finished; a non-blocking get avoids hanging forever if it
    # died without enqueuing anything.
    try:
        return q.get_nowait()
    except queue.Empty:
        return None

In [32]:
exps_feasible = []
num_cores = -1
# Empty placeholder with the feature columns; a later cell reassigns it.
df_no_counterfactuals_feasible = pd.DataFrame(columns=X_test_no_ideal_cf.columns)
# For each row that had no ideal counterfactual, run generate_cf_feasible
# through joblib and store the one-element result list.
for i in range(len(X_test_no_ideal_cf)):
    test_instance = X_test_no_ideal_cf.iloc[[i]]
    print(i)
    runner = Parallel(n_jobs=num_cores)
    exp = runner([delayed(generate_cf_feasible)(test_instance)])
    exps_feasible.append(exp)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [33]:
# Inspect the feasible-search results: one single-element list per row,
# holding either a CounterfactualExplanations object or None.
exps_feasible

[[<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c361a90>],
 [None],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c32ed50>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278af4cc50>],
 [None],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c234a50>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c2c50d0>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c394e10>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c3ad910>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c742b90>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c736250>],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c302990>],
 [None],
 [<dice_ml.counterfactual_explanations.CounterfactualExplanations at 0x2278c27f0d0>],
 [<dice_ml.counterfactual_e

In [109]:
# Show the query row and its counterfactual set for the first feasible-search
# result. NOTE(review): this would fail if that entry were None.
exps_feasible[0][0].visualize_as_dataframe()

Query instance (original outcome : 0)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
0,50.0,1,3,140.0,233.0,0,0,163.0,0,0.6,2,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
0,50.0,1,3,140.0,142.0,0,0,163.0,0,0.6,2,1
0,50.0,1,3,140.0,151.0,0,0,164.0,0,0.6,2,1
0,50.0,1,3,140.0,105.0,0,0,164.0,0,0.6,2,1
0,50.0,1,3,140.0,105.0,0,0,164.0,0,0.6,2,1


In [34]:
# Keep the rows whose feasible-counterfactual search also returned None.
df_no_counterfactuals_feasible = X_test_no_ideal_cf.loc[
    [result[0] is None for result in exps_feasible]
]

In [35]:
# Display the instances that have neither an ideal nor a feasible counterfactual.
df_no_counterfactuals_feasible

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope
8,70,1,4,145,174,0,0,125,1,2.6,3
18,62,1,4,139,170,0,1,120,1,3.0,2
50,53,1,4,120,246,0,0,116,1,0.0,2
78,43,1,4,120,175,0,0,120,1,1.0,2


General Counterfactuals (CFs) - for the instances that had neither ideal nor feasible counterfactuals

In [36]:
# Alias (not a copy): the rows still lacking a counterfactual become the
# input of the unconstrained "general" search.
X_test_general = df_no_counterfactuals_feasible

In [37]:
import threading
import queue

class TimeoutException(Exception):
    """Exception type reserved for counterfactual-search timeouts.

    NOTE(review): nothing in the visible cells raises or catches this class;
    timeouts are currently reported by printing and returning None.
    """

def generate_cf_general(test_instance, timeout=300):
    """Search for up to twenty counterfactuals with no range restrictions.

    Runs DiCE's genetic method in a background thread so the caller can give
    up after ``timeout`` seconds. Only ``trestbps``, ``chol`` and ``thalach``
    may vary; no ``permitted_range`` is passed, so the library's own bounds
    for those features apply.

    Parameters
    ----------
    test_instance : one-row DataFrame
        The query instance.
    timeout : float, default 300
        Seconds to wait for the search before giving up.

    Returns
    -------
    The object returned by ``generate_counterfactuals``, or ``None`` when the
    search raised an exception or did not finish within ``timeout``.
    """
    q = queue.Queue()

    def target():
        try:
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance,
                total_CFs=20,
                desired_class="opposite",
                features_to_vary=["trestbps", "chol", "thalach"],
                diversity_weight=5, proximity_weight=2, sparsity_weight=5,
            )
            q.put(result)
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Surface the real cause so configuration or data errors are not
            # mistaken for "the search found nothing".
            print(f"  reason: {type(e).__name__}: {e}")
            q.put(None)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the process alive once the caller has moved on.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()

    # Wait for the search to finish, up to `timeout` seconds.
    thread.join(timeout)

    if thread.is_alive():
        # Still running: report the timeout and signal "no result" with None.
        print("No counterfactuals found for test instance-timed out:", test_instance)
        return None

    # The thread has finished; a non-blocking get avoids hanging forever if it
    # died without enqueuing anything.
    try:
        return q.get_nowait()
    except queue.Empty:
        return None

In [38]:
exps_general = []
num_cores = -1
# Empty placeholder with the feature columns of the no-ideal-CF frame.
df_no_counterfactuals_general = pd.DataFrame(columns=X_test_no_ideal_cf.columns)
# For each row still lacking a counterfactual, run generate_cf_general
# through joblib and store the one-element result list.
for i in range(len(X_test_general)):
    test_instance = X_test_general.iloc[[i]]
    print(i)
    runner = Parallel(n_jobs=num_cores)
    exp = runner([delayed(generate_cf_general)(test_instance)])
    exps_general.append(exp)

0
1
2
3


In [39]:
# Post-filter the unconstrained counterfactuals with the "feasible" criteria
# and print the positions of the instances that have at least one match.
# Starts as an empty frame so the name exists even if every instance is skipped.
feasible_counterfactuals = pd.DataFrame()
for i in range(len(X_test_general)):
    test_instance = X_test_general.iloc[[i]]
    exp = exps_general[i]
    thalach_test = test_instance['thalach'].values[0]
    chol_test = test_instance['chol'].values[0]

    # The search yields None on failure or timeout; nothing to filter then.
    if exp[0] is None:
        continue
    exp_dfs = exp[0].cf_examples_list[0].final_cfs_df
    # Guard against a result object that carries no counterfactual rows
    # (assumes final_cfs_df may be None in that case - TODO confirm).
    if exp_dfs is None or len(exp_dfs) == 0:
        continue

    # Feasible means: thalach rises but stays below 220 - age, chol drops by
    # at least 10% of the query value, and trestbps is at most 149.
    feasible_counterfactuals = exp_dfs[
        ((exp_dfs['thalach'] > thalach_test) & (exp_dfs['thalach'] < 220 - exp_dfs['age']))
        & (exp_dfs['chol'] <= chol_test - 0.1 * chol_test)
        & (exp_dfs['trestbps'] <= 149)
    ]

    if len(feasible_counterfactuals) > 0:
        print(i)

1


In [41]:
# Display the filtered frame left over from the last loop iteration of the
# preceding cell (not from the instance index that was printed).
feasible_counterfactuals

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target


In [43]:
# Re-filter the counterfactuals of instance 1 against the "ideal" thresholds
# (chol <= 200, trestbps <= 120).
exp_dfs = exps_general[1][0].cf_examples_list[0].final_cfs_df

# Take the query's thalach from instance 1 itself rather than relying on
# whatever value an earlier cell's loop happened to leave in thalach_test.
thalach_test = X_test_general['thalach'].iloc[1]

feasible_counterfactuals = exp_dfs[
    ((exp_dfs['thalach'] > thalach_test) & (exp_dfs['thalach'] < 220 - exp_dfs['age']))
    & (exp_dfs['chol'] <= 200)
    & (exp_dfs['trestbps'] <= 120)
]

In [44]:
# Display the counterfactuals of instance 1 that pass the ideal thresholds.
feasible_counterfactuals

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
0,62.0,1,4,105.0,106.0,0,1,151.0,1,3.0,2,0
0,62.0,1,4,101.0,188.0,0,1,151.0,1,3.0,2,0
0,62.0,1,4,104.0,85.0,0,1,151.0,1,3.0,2,0
0,62.0,1,4,100.0,178.0,0,1,153.0,1,3.0,2,0


In [40]:
# Show the query row and the unconstrained counterfactual set for instance 1.
# NOTE(review): this would fail if that entry were None.
exps_general[1][0].visualize_as_dataframe()

Query instance (original outcome : 1)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
0,62.0,1,4,139.0,170.0,0,1,120.0,1,3.0,2,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
0,62.0,1,4,110.0,208.0,0,1,142.0,1,3.0,2,0
0,62.0,1,4,109.0,210.0,0,1,142.0,1,3.0,2,0
0,62.0,1,4,110.0,211.0,0,1,144.0,1,3.0,2,0
0,62.0,1,4,106.0,210.0,0,1,143.0,1,3.0,2,0
0,62.0,1,4,110.0,224.0,0,1,152.0,1,3.0,2,0
0,62.0,1,4,109.0,224.0,0,1,152.0,1,3.0,2,0
0,62.0,1,4,104.0,225.0,0,1,145.0,1,3.0,2,0
0,62.0,1,4,110.0,208.0,0,1,160.0,1,3.0,2,0
0,62.0,1,4,104.0,208.0,0,1,148.0,1,3.0,2,0
0,62.0,1,4,110.0,211.0,0,1,161.0,1,3.0,2,0
