DICE

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import dice_ml
from dice_ml.utils import helpers # helper functions
from dice_ml import Data,Model,Dice
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
import time
import threading
from joblib import Parallel, delayed
#from langchain_experimental.agents import create_pandas_dataframe_agent
import os
import json
import openai
from openai import AzureOpenAI
#from langchain.llms import AzureOpenAI
#from langchain_openai import AzureChatOpenAI
import pandas as pd 

# Lift pandas' row/column display limits and set XGBoost's global verbosity to 2.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
xgb.set_config(verbosity=2)



In [2]:
# Read the heart-disease table from a CSV file located in the working directory.
dataframe_heart_disease = pd.read_csv("heart_statlog_cleveland_hungary_final.csv")

In [3]:
# Remove incomplete and duplicated records, then keep only the rows whose
# 'chol' and 'trestbps' entries are both non-zero.
dataframe_heart_disease = dataframe_heart_disease.dropna().drop_duplicates()
nonzero_measurements = (
    (dataframe_heart_disease['chol'] != 0)
    & (dataframe_heart_disease['trestbps'] != 0)
)
dataframe_heart_disease = dataframe_heart_disease[nonzero_measurements]

In [4]:
# Separate the predictor columns from the 'target' outcome column.
X = dataframe_heart_disease.drop(columns=['target'])
y = dataframe_heart_disease['target']

In [5]:
# Hold out 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Columns handled as continuous; every other feature column is handled as categorical.
numerical = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
categorical = X_train.columns.difference(numerical)

In [7]:
# Preprocessing: standard-scale the numerical columns and one-hot encode the
# categorical ones (categories unseen during fitting are ignored at transform time).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessing_steps = [
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
]
transformations = ColumnTransformer(transformers=preprocessing_steps)


In [8]:
# Baseline model: the shared preprocessing followed by an XGBClassifier built
# with no explicit hyper-parameters, fitted on the training split.
baseline_steps = [
    ('preprocessor', transformations),
    ('classifier', XGBClassifier()),
]
clf = Pipeline(steps=baseline_steps)
xgb_model = clf.fit(X_train, y_train)


In [9]:
# Second pipeline: same preprocessing object, XGBClassifier with explicit settings.
configured_classifier = XGBClassifier(
    max_depth=5,
    learning_rate=0.5,
    n_estimators=200,
    gamma=0,
)
pipeline = Pipeline(
    steps=[('preprocessor', transformations), ('classifier', configured_classifier)]
)


In [10]:
# Fit the configured pipeline on the training split, then label the held-out rows.
xgb_pipeline = pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)


In [11]:
# True positives: held-out rows whose predicted and actual label are both 1,
# renumbered from 0 with the old index discarded.
is_true_positive = (y_pred == 1) & (y_test == 1)
X_high_risk_tp = X_test[is_true_positive].reset_index(drop=True)

In [12]:
# Training features and labels side by side in a single frame.
train_data = pd.concat([X_train, y_train], axis=1)

# DiCE data object: columns arranged as in dataframe_heart_disease, with the
# continuous features and the outcome column named explicitly.
d = Data(
    dataframe=pd.DataFrame(train_data, columns=dataframe_heart_disease.columns),
    continuous_features=['age', 'trestbps', 'chol', 'thalach', 'oldpeak'],
    outcome_name='target',
)

# DiCE model object wrapping the fitted pipeline through the "sklearn" backend.
m = Model(model=xgb_pipeline, backend="sklearn")

Ideal Constraints

In [13]:
import threading
import queue

# Instances for which no counterfactual was obtained (error or timeout).
df_no_counterfactuals = pd.DataFrame(columns=X_high_risk_tp.columns)


def generate_cf(test_instance, timeout=10):
    """Run the "ideal constraints" DiCE search for one instance with a time limit.

    A genetic DiCE explainer is asked for 20 counterfactuals of the opposite
    class, varying only trestbps, chol and fbs, with chol limited to
    [100, 200] and trestbps to [100, 120].

    Args:
        test_instance: Query instance handed to DiCE; callers in this notebook
            pass a single-row DataFrame (``X_high_risk_tp.iloc[[i]]``).
        timeout: Seconds to wait for the search thread before giving up.

    Returns:
        The object returned by ``generate_counterfactuals``, or ``None`` when
        the search raised an exception or did not finish within ``timeout``.

    Side effects:
        Appends ``test_instance`` to the module-level ``df_no_counterfactuals``
        whenever ``None`` is returned.
        NOTE(review): if joblib executes this function in a separate worker
        process (believed to be its default backend - confirm), the update
        lands in that process's copy of the frame, not in the notebook kernel.
    """
    global df_no_counterfactuals
    q = queue.Queue()

    def target():
        try:
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance, total_CFs=20, desired_class="opposite",
                features_to_vary=["trestbps", "chol", "fbs"],
                diversity_weight=5, proximity_weight=2, sparsity_weight=5,
                permitted_range={"chol": [100, 200], "trestbps": [100, 120]},
            )
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Report the cause instead of silently discarding it.
            print("Reason:", repr(e))
            result = None
        # Exactly one item is queued on every path, so q.get() below cannot block.
        q.put(result)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the interpreter alive.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print("No counterfactuals found for test instance-timed out:", test_instance)
        result = None
    else:
        result = q.get()

    if result is None:
        # DataFrame.append returned a new frame (previously thrown away) and was
        # removed in pandas 2.0; rebind the global with pd.concat instead.
        frames = (
            [test_instance]
            if df_no_counterfactuals.empty
            else [df_no_counterfactuals, test_instance]
        )
        df_no_counterfactuals = pd.concat(frames)
    return result

In [14]:

# One entry per true-positive instance; each entry is the one-element list
# returned by Parallel (the generate_cf result, or None).
exps_ideal = []
num_cores = -1

for i in range(len(X_high_risk_tp)):
    print(i)
    test_instance = X_high_risk_tp.iloc[[i]]
    # A single task is submitted per Parallel call.
    jobs = (delayed(generate_cf)(test_instance) for _ in range(1))
    exp = Parallel(n_jobs=num_cores)(jobs)
    exps_ideal.append(exp)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65


In [15]:
# Re-score every generated counterfactual frame with the fitted pipeline;
# instances without a result are skipped.
validity_list = []
for exps in exps_ideal:
    explanation = exps[0]
    if explanation is None:
        continue
    exp_df = explanation.cf_examples_list[0].final_cfs_df
    validity_list.append(xgb_pipeline.predict(exp_df))

In [16]:

# Merge the per-instance prediction arrays into one flat list.
flattened_array = []
for sublist in validity_list:
    flattened_array.extend(sublist)

# Count how many predictions equal 1.
sum_of_ones = sum(item == 1 for item in flattened_array)

print("Flattened Array:", flattened_array)
print("Sum of 1s:", sum_of_ones)

Flattened Array: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0

Feasible Constraints

In [None]:
import threading
import queue

# Instances for which no counterfactual was obtained (error or timeout).
df_no_counterfactuals = pd.DataFrame(columns=X_high_risk_tp.columns)


def generate_cf_feasible(test_instance, timeout=30):
    """Run the "feasible constraints" DiCE search for one instance with a time limit.

    A genetic DiCE explainer is asked for 20 counterfactuals of the opposite
    class, varying only trestbps, chol and fbs. The permitted ranges are
    relative to the instance: trestbps in [80, current - 10] and chol in
    [100, 90% of current].

    Args:
        test_instance: Single-row DataFrame with at least 'trestbps' and 'chol'
            columns (their first values are read to build the ranges).
        timeout: Seconds to wait for the search thread before giving up.

    Returns:
        The object returned by ``generate_counterfactuals``, or ``None`` when
        the search raised an exception or did not finish within ``timeout``.

    Side effects:
        Appends ``test_instance`` to the module-level ``df_no_counterfactuals``
        whenever ``None`` is returned.
        NOTE(review): if joblib executes this function in a separate worker
        process (believed to be its default backend - confirm), the update
        lands in that process's copy of the frame, not in the notebook kernel.
    """
    global df_no_counterfactuals
    q = queue.Queue()

    def target():
        try:
            current_trestbps = test_instance['trestbps'].values[0]
            current_chol = test_instance['chol'].values[0]
            # NOTE(review): for low readings the upper bound can fall below the
            # fixed lower bound (80 / 100); how DiCE treats such a range is not
            # visible here - confirm.
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance, total_CFs=20, desired_class="opposite",
                features_to_vary=["trestbps", "chol", "fbs"],
                permitted_range={
                    "trestbps": [80, current_trestbps - 10],
                    "chol": [100, current_chol - 0.1 * current_chol],
                },
            )
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Report the cause instead of silently discarding it.
            print("Reason:", repr(e))
            result = None
        # Exactly one item is queued on every path, so q.get() below cannot block.
        q.put(result)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the interpreter alive.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print("No counterfactuals found for test instance-timed out:", test_instance)
        result = None
    else:
        result = q.get()

    if result is None:
        # DataFrame.append returned a new frame (previously thrown away) and was
        # removed in pandas 2.0; rebind the global with pd.concat instead.
        frames = (
            [test_instance]
            if df_no_counterfactuals.empty
            else [df_no_counterfactuals, test_instance]
        )
        df_no_counterfactuals = pd.concat(frames)
    return result

In [None]:

# One entry per true-positive instance; each entry is the one-element list
# returned by Parallel (the generate_cf_feasible result, or None).
exps_feasible = []
num_cores = -1

for i in range(len(X_high_risk_tp)):
    print(i)
    test_instance = X_high_risk_tp.iloc[[i]]
    # A single task is submitted per Parallel call.
    jobs = (delayed(generate_cf_feasible)(test_instance) for _ in range(1))
    exp = Parallel(n_jobs=num_cores)(jobs)
    exps_feasible.append(exp)

General Counterfactuals

In [13]:
import threading
import queue

# Instances for which no counterfactual was obtained (error or timeout).
df_no_counterfactuals = pd.DataFrame(columns=X_high_risk_tp.columns)


def generate_cf_feasible_(test_instance, timeout=30):
    """Run the unconstrained ("general") DiCE search for one instance with a time limit.

    A genetic DiCE explainer is asked for 20 counterfactuals of the opposite
    class, varying only trestbps, chol and thalach. No ``permitted_range`` is
    passed.

    Args:
        test_instance: Query instance handed to DiCE; callers in this notebook
            pass a single-row DataFrame (``X_high_risk_tp.iloc[[i]]``).
        timeout: Seconds to wait for the search thread before giving up.

    Returns:
        The object returned by ``generate_counterfactuals``, or ``None`` when
        the search raised an exception or did not finish within ``timeout``.

    Side effects:
        Appends ``test_instance`` to the module-level ``df_no_counterfactuals``
        whenever ``None`` is returned.
        NOTE(review): if joblib executes this function in a separate worker
        process (believed to be its default backend - confirm), the update
        lands in that process's copy of the frame, not in the notebook kernel.
    """
    global df_no_counterfactuals
    q = queue.Queue()

    def target():
        try:
            result = Dice(d, m, method='genetic').generate_counterfactuals(
                test_instance, total_CFs=20, desired_class="opposite",
                features_to_vary=["trestbps", "chol", "thalach"],
                # Disabled instance-relative ranges, kept for reference:
                #permitted_range={"trestbps": [80, test_instance['trestbps'].values[0]-1],
                #                 "chol": [100, test_instance['chol'].values[0]-1],
                #                 "thalach": [test_instance['thalach'].values[0]+1, 220 - test_instance['age'].values[0]]}
            )
        except Exception as e:
            print("No counterfactuals found for test instance:", test_instance)
            # Report the cause instead of silently discarding it.
            print("Reason:", repr(e))
            result = None
        # Exactly one item is queued on every path, so q.get() below cannot block.
        q.put(result)

    # Daemon thread: a search that outlives the timeout cannot be cancelled,
    # but it must not keep the interpreter alive.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print("No counterfactuals found for test instance-timed out:", test_instance)
        result = None
    else:
        result = q.get()

    if result is None:
        # DataFrame.append returned a new frame (previously thrown away) and was
        # removed in pandas 2.0; rebind the global with pd.concat instead.
        frames = (
            [test_instance]
            if df_no_counterfactuals.empty
            else [df_no_counterfactuals, test_instance]
        )
        df_no_counterfactuals = pd.concat(frames)
    return result

In [None]:

# One entry per true-positive instance; each entry is the one-element list
# returned by Parallel (the generate_cf_feasible_ result, or None).
exps_feasible_ = []
num_cores = -1

for i in range(len(X_high_risk_tp)):
    print(i)
    test_instance = X_high_risk_tp.iloc[[i]]
    # A single task is submitted per Parallel call.
    jobs = (delayed(generate_cf_feasible_)(test_instance) for _ in range(1))
    exp = Parallel(n_jobs=num_cores)(jobs)
    exps_feasible_.append(exp)

Structural Causal Model

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

# Edges of the assumed causal graph, grouped by role; the groups are joined in
# this order so the edge (and node) insertion order is fixed.
risk_factor_to_disease = [
    ('age', 'target'),
    ('sex', 'target'),
    ('chol', 'target'),
    ('fbs', 'target'),
    ('trestbps', 'target'),
]
disease_to_symptom = [
    ('target', 'cp'),
    ('target', 'restecg'),
    ('target', 'thalach'),
    ('target', 'exang'),
    ('target', 'slope'),
    ('target', 'oldpeak'),
]
# Remaining direct links among the risk factors and among the symptoms.
other_direct_links = [
    ('age', 'chol'),
    ('age', 'trestbps'),
    ('sex', 'trestbps'),
    ('sex', 'chol'),
    ('chol', 'trestbps'),
    ('thalach', 'exang'),
    ('exang', 'cp'),
]
edges = risk_factor_to_disease + disease_to_symptom + other_direct_links

# Build the directed graph from the edge list.
G = nx.DiGraph()
G.add_edges_from(edges)

# Lay the nodes out with Graphviz 'dot' and draw them on a 14x10 figure.
plt.figure(figsize=(14, 10))
pos = graphviz_layout(G, prog='dot')
draw_options = dict(
    with_labels=True,
    node_size=3000,
    node_color='lightcoral',
    font_size=10,
    font_weight='bold',
    arrowsize=20,
)
nx.draw(G, pos, **draw_options)

plt.title('Causal DAG for Cardiovascular Disease (CVD)')
plt.show()

In [None]:
# Disabled cell: an alternative DAG definition kept inside a string literal, so
# none of it executes and G is not modified by this cell.
'''import networkx as nx
import matplotlib.pyplot as plt

# Define the structure of the DAG
edges = [
    ('age', 'chol'),
    ('age', 'trestbps'),
    ('sex', 'chol'),
    ('sex', 'trestbps'),
    ('sex', 'target'),
    ('cp', 'target'),
    ('trestbps', 'target'),
    ('chol', 'target'),
    ('chol', 'trestbps'),
    ('fbs', 'target'),
    ('restecg', 'target'),
    ('thalach', 'target'),
    ('thalach', 'exang'),
    ('exang', 'target'),
    ('oldpeak', 'target'),
    ('oldpeak', 'exang'),
    ('slope', 'target'),
    ('slope', 'exang')
]

# Create a directed graph using NetworkX
G = nx.DiGraph()
G.add_edges_from(edges)

# Plot the DAG
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_size=3000, node_color='lightblue', font_size=10, font_weight='bold', arrowsize=20)
plt.title('DAG for Causal Discovery in Cardiology')
plt.show()
'''

In [None]:
# Read the same CSV again into a separate frame used by the causal-model cells.
df = pd.read_csv("heart_statlog_cleveland_hungary_final.csv")

In [None]:
# Drop incomplete rows, then keep only strictly positive 'chol' and 'trestbps' values.
df = df.dropna()
positive_measurements = (df['chol'] > 0) & (df['trestbps'] > 0)
df = df[positive_measurements]

In [27]:
# Convert the discrete columns to pandas' categorical dtype, one column at a time.
for discrete_column in ['target', 'exang', 'fbs', 'cp', 'restecg', 'slope']:
    df[discrete_column] = df[discrete_column].astype('category')

In [None]:
from dowhy import gcm
# Build a DoWhy GCM InvertibleStructuralCausalModel on top of the DAG G.
causal_model = gcm.InvertibleStructuralCausalModel(G)

In [None]:
# Let DoWhy's auto-assignment choose the causal mechanisms for causal_model, using df.
gcm.auto.assign_causal_mechanisms(causal_model, df)

In [None]:
# Fit causal_model to df.
gcm.fit(causal_model, df)


In [None]:
# Print the evaluation summary DoWhy produces for causal_model against df.
print(gcm.evaluate_causal_model(causal_model, df))

In [22]:
# Labelled copy of the true-positive rows (target fixed to 1) for the causal-model cells.
# X_high_risk_tp itself is left feature-only: the DiCE cells use it as the query frame
# and as the column template for df_no_counterfactuals, so rebinding it with an extra
# 'target' column made those cells behave differently when re-run after this one.
df_high_risk_tp = X_high_risk_tp.assign(target=1)

In [23]:
# Save the labelled true-positive rows to CSV without writing the index.
df_high_risk_tp.to_csv("df_high_risk_tp.csv", index=False)

In [None]:
df_high_risk_tp[3:4]

In [None]:
# For every true-positive instance, replay each "ideal" DiCE counterfactual through
# the fitted causal model as an intervention and report the sampled rows whose
# target comes out as 0.
for i in range(len(df_high_risk_tp)):
    df_risk = df_high_risk_tp.iloc[[i]]
    # Scalar baseline: only samples with lower cholesterol than the instance are kept.
    chol_risk = df_risk['chol'].iloc[0]
    print("\nOriginal DataFrame:")
    print(df_risk)

    exp = exps_ideal[i]
    if exp[0] is None:
        # No DiCE result for this instance (error or timeout).
        continue
    df_cf = exp[0].cf_examples_list[0].final_cfs_df

    for j in range(len(df_cf)):
        cf_chol = df_cf['chol'].iloc[j]
        cf_trestbps = df_cf['trestbps'].iloc[j] if 'trestbps' in df_cf.columns else None

        # Prefer intervening on chol; fall back to trestbps only when chol is missing.
        if pd.notna(cf_chol) and df_risk['target'].notna().all():
            intervention_dict = {'chol': lambda chol: cf_chol}
        elif pd.isna(cf_chol) and pd.notna(cf_trestbps) and df_risk['target'].notna().all():
            intervention_dict = {'trestbps': lambda trestbps: cf_trestbps}
        else:
            print("chol or target is None")
            continue

        cf_samples = gcm.interventional_samples(causal_model, intervention_dict, observed_data=df_risk)

        # The original filter listed the chol condition twice with no operator in
        # between, which called a Series as a function and raised TypeError.
        # NOTE(review): a trestbps baseline was computed but never used, so the
        # second clause may have been meant as a trestbps bound - confirm.
        cf_samples_filtered = cf_samples[cf_samples['chol'] < chol_risk]

        # Check if any of the target values in filtered cf_samples are 0
        if (cf_samples_filtered['target'] == 0).any():
            # Filter the counterfactual samples that resulted in target = 0
            cf_worked = cf_samples_filtered[cf_samples_filtered['target'] == 0]
            for index, row in cf_worked.iterrows():
                print("\nCounterfactual DataFrame:")
                print(cf_samples_filtered)
                print("\nCholesterol Level:", row['chol'])
                print("Trestbps Level:", row['trestbps'])

In [None]:
'''for i in range(len(df_high_risk_tp)):
    df_risk = df_high_risk_tp.iloc[[i]]
    # Apply the filtering conditions
    age = df_risk['age']
    chol_risk = df_risk['chol']
    thalach_risk = df_risk['thalach']
    print("\nOriginal DataFrame:")
    print(df_risk)
    
    exp = exps_ideal[i]
    if exp[0] is not None:
        df_cf = exp[0].cf_examples_list[0].final_cfs_df
        
        for j in range(len(df_cf)):
            cf_chol = df_cf['chol'].iloc[j]
            cf_thalach = df_cf['thalach'].iloc[j]
            
            # Check if 'chol' and 'target' are not None
            if pd.notna(cf_chol) and pd.notna(cf_thalach) and df_risk['target'].notna().all():
                cf_samples = gcm.interventional_samples(causal_model, {'chol': lambda chol: cf_chol, 'thalach': lambda thalach: cf_thalach}, observed_data=df_risk)
                cf_samples['exang'] = cf_samples['exang'].clip(0, 1).astype('category')
                
               
                
                cf_samples_filtered = cf_samples[
                    (cf_samples['chol'] < chol_risk)                     
                ]
                
                # Check if any of the target values in filtered cf_samples are 0
                if (cf_samples_filtered['target'] == 0).any():
                    # Filter the counterfactual samples that resulted in target = 0
                    cf_worked = cf_samples_filtered[cf_samples_filtered['target'] == 0]
                    for index, row in cf_worked.iterrows():
                        print("\nCounterfactual DataFrame:")
                        print(cf_samples_filtered)
                        print("\nCholesterol Level:", row['chol'])
                        print("Thalach Level:", row['thalach'])
            else:
                print("chol or target is None")'''

In [None]:
# Replay every counterfactual stored in exps_feasible through the causal model as a
# joint chol/thalach intervention and print the filtered samples whose target is 0.
# NOTE(review): exps_feasible is generated with features_to_vary trestbps/chol/fbs,
# so the counterfactual thalach presumably equals the original value and the
# `thalach > thalach_risk` condition may never hold - confirm.
for i in range(len(df_high_risk_tp)):
    df_risk = df_high_risk_tp.iloc[[i]]
    # Single-row baselines used by the filter below.
    age, chol_risk, thalach_risk = (df_risk[col] for col in ('age', 'chol', 'thalach'))
    print("\nOriginal DataFrame:")
    print(df_risk)

    exp = exps_feasible[i]
    if exp[0] is None:
        continue
    df_cf = exp[0].cf_examples_list[0].final_cfs_df

    for j in range(len(df_cf)):
        cf_chol = df_cf['chol'].iloc[j]
        cf_thalach = df_cf['thalach'].iloc[j]

        inputs_present = pd.notna(cf_chol) and pd.notna(cf_thalach) and df_risk['target'].notna().all()
        if not inputs_present:
            print("chol or target is None")
            continue

        interventions = {'chol': lambda chol: cf_chol, 'thalach': lambda thalach: cf_thalach}
        cf_samples = gcm.interventional_samples(causal_model, interventions, observed_data=df_risk)
        cf_samples['exang'] = cf_samples['exang'].clip(0, 1).astype('category')

        # Keep samples with lower chol and a higher thalach that stays below 220 - age.
        keep = (
            (cf_samples['chol'] < chol_risk)
            & (cf_samples['thalach'] > thalach_risk)
            & (cf_samples['thalach'] < (220 - age))
        )
        cf_samples_filtered = cf_samples[keep]

        outcome_flipped = cf_samples_filtered['target'] == 0
        if outcome_flipped.any():
            cf_worked = cf_samples_filtered[cf_samples_filtered['target'] == 0]
            for index, row in cf_worked.iterrows():
                print("\nCounterfactual DataFrame:")
                print(cf_samples_filtered)
                print("\nCholesterol Level:", row['chol'])
                print("Thalach Level:", row['thalach'])

In [None]:
# Replay every counterfactual stored in exps_feasible_ through the causal model as a
# joint chol/thalach intervention and print the filtered samples whose target is 0.
for i in range(len(df_high_risk_tp)):
    df_risk = df_high_risk_tp.iloc[[i]]
    # Single-row baselines used by the filter below.
    age, chol_risk, thalach_risk = (df_risk[col] for col in ('age', 'chol', 'thalach'))
    print("\nOriginal DataFrame:")
    print(df_risk)

    exp = exps_feasible_[i]
    if exp[0] is None:
        continue
    df_cf = exp[0].cf_examples_list[0].final_cfs_df

    for j in range(len(df_cf)):
        cf_chol = df_cf['chol'].iloc[j]
        cf_thalach = df_cf['thalach'].iloc[j]

        inputs_present = pd.notna(cf_chol) and pd.notna(cf_thalach) and df_risk['target'].notna().all()
        if not inputs_present:
            print("chol or target is None")
            continue

        interventions = {'chol': lambda chol: cf_chol, 'thalach': lambda thalach: cf_thalach}
        cf_samples = gcm.interventional_samples(causal_model, interventions, observed_data=df_risk)
        cf_samples['exang'] = cf_samples['exang'].clip(0, 1).astype('category')

        # Keep samples with lower chol and a higher thalach that stays below 220 - age.
        keep = (
            (cf_samples['chol'] < chol_risk)
            & (cf_samples['thalach'] > thalach_risk)
            & (cf_samples['thalach'] < (220 - age))
        )
        cf_samples_filtered = cf_samples[keep]

        outcome_flipped = cf_samples_filtered['target'] == 0
        if outcome_flipped.any():
            cf_worked = cf_samples_filtered[cf_samples_filtered['target'] == 0]
            for index, row in cf_worked.iterrows():
                print("\nCounterfactual DataFrame:")
                print(cf_samples_filtered)
                print("\nCholesterol Level:", row['chol'])
                print("Thalach Level:", row['thalach'])

In [None]:
# Replay every counterfactual stored in exps_ideal through the causal model as a
# joint chol/thalach intervention and print, unfiltered, the samples whose target is 0.
for i in range(len(df_high_risk_tp)):
    df_risk = df_high_risk_tp.iloc[[i]]
    print("\nOriginal DataFrame:")
    print(df_risk)

    exp = exps_ideal[i]
    if exp[0] is None:
        continue
    df_cf = exp[0].cf_examples_list[0].final_cfs_df

    for j in range(len(df_cf)):
        cf_chol = df_cf['chol'].iloc[j]
        cf_thalach = df_cf['thalach'].iloc[j]

        inputs_present = pd.notna(cf_chol) and pd.notna(cf_thalach) and df_risk['target'].notna().all()
        if not inputs_present:
            print("chol or target is None")
            continue

        interventions = {'chol': lambda chol: cf_chol, 'thalach': lambda thalach: cf_thalach}
        cf_samples = gcm.interventional_samples(causal_model, interventions, observed_data=df_risk)
        cf_samples['exang'] = cf_samples['exang'].clip(0, 1).astype('category')

        outcome_flipped = cf_samples['target'] == 0
        if outcome_flipped.any():
            cf_worked = cf_samples[cf_samples['target'] == 0]
            for index, row in cf_worked.iterrows():
                print("\nCounterfactual DataFrame:")
                print(cf_samples)
                print("\nCholesterol Level:", row['chol'])
                print("Thalach Level:", row['thalach'])

In [None]:
df_high_risk_tp[3:4]

In [33]:

# Manual check on the row at position 3: intervene with chol fixed to 208 and thalach to 159.
samples_chol = gcm.interventional_samples(causal_model, {'chol': lambda chol:208,'thalach': lambda thalach:159}, observed_data=df_high_risk_tp[3:4])

In [None]:
df_high_risk_tp[3:4]

In [None]:
samples_chol

In [None]:
# `samples` is not assigned by any earlier cell, so this line raised NameError on a
# fresh kernel. NOTE(review): it is assumed to be the interventional draw held in
# samples_chol - confirm that this is the intended frame.
samples = samples_chol.copy()
samples['exang'] = samples['exang'].clip(0, 1).astype('category')

In [None]:
# Renumber both frames from 0 so their rows line up by position.
# (df_risk is whatever the most recently run replay loop left behind.)
df_risk, samples = (frame.reset_index(drop=True) for frame in (df_risk, samples))

# Rows whose exang is 1 in df_risk and 0 in samples.
exang_switched_off = (df_risk['exang'] == 1) & (samples['exang'] == 0)
changed_rows = df_risk[exang_switched_off]

print(changed_rows)

In [None]:
# Print how often each target value occurs in samples.
print(samples['target'].value_counts())


In [None]:
samples_chol