In [2]:
# Load the pickled random forest model.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
import pickle

with open("random_forest_model.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

# Read the test and training splits (features X, labels y) from CSV.
import pandas as pd

X_test, y_test = pd.read_csv("X_test.csv"), pd.read_csv("y_test.csv")
X_train, y_train = pd.read_csv("X_train.csv"), pd.read_csv("y_train.csv")


In [3]:
# Evaluate the loaded model on the test split: per-class report, overall
# accuracy, and ROC AUC.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# y_test comes from pd.read_csv, i.e. it is a DataFrame (presumably with a
# single label column). Flatten it once so every metric below receives the
# same 1-D label vector.
y_true = y_test.values.ravel()

y_pred = rf.predict(X_test)
# For a binary target, roc_auc_score expects the score of the class with the
# greater label, which is column 1 of predict_proba.
y_score = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_true, y_pred))

# Library metric instead of a hand-rolled element-wise comparison.
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

AUC = roc_auc_score(y_true, y_score)
print(f"AUC: {AUC:.2f}")


              precision    recall  f1-score   support

           0       0.86      0.92      0.89      7414
           1       0.68      0.54      0.60      2355

    accuracy                           0.83      9769
   macro avg       0.77      0.73      0.75      9769
weighted avg       0.82      0.83      0.82      9769

Accuracy: 0.83
AUC: 0.86


In [7]:

# ==========================
# 1. Prepare data for DiCE
# ==========================

import dice_ml
from dice_ml import Dice

# The "random" DiCE method samples candidate counterfactuals at random.
# Fixing the seed makes this cell reproduce the same counterfactuals on
# every run (e.g. after Restart Kernel -> Run All).
RANDOM_SEED = 42

# DiCE takes features and outcome in a single DataFrame: copy the training
# features and attach the labels as the "income" column. Assigning the raw
# values means the column is called "income" whatever y_train's own column
# name is.
df_train = X_train.copy()
df_train["income"] = y_train.values.ravel()

# Feature types handed to the DiCE data interface.
continuous_features = [
    "age",
    "education-num",
    "hours-per-week",
    "capital-gain",
    "capital-loss",
]

categorical_features = [
    "sex",
    "married",
]

# Features DiCE is allowed to change when searching for counterfactuals;
# "age" and "sex" are left out, so they stay fixed at the query values.
mutable_features = [
    "education-num",
    "hours-per-week",
    "capital-gain",
    "capital-loss",
    "married",
]


# Create DiCE data interface
data_dice = dice_ml.Data(
    dataframe=df_train,
    continuous_features=continuous_features,
    categorical_features=categorical_features,
    outcome_name="income",
)

# ==========================
# 2. Wrap your model for DiCE
# ==========================
model_dice = dice_ml.Model(
    model=rf,
    backend="sklearn",  # works for random forest, logistic regression, etc.
)

# ==========================
# 3. Create model-agnostic DiCE explainer (random)
# ==========================
exp = Dice(
    data_dice,
    model_dice,
    method="random",  # model-agnostic random search
)

# ==========================
# 4. Pick a test instance & generate counterfactuals
# ==========================

# choose one instance from X_test (as DataFrame, not Series)
query_instance = X_test.iloc[[0]]  # double brackets to keep it as DataFrame

# generate 5 counterfactuals with opposite prediction, changing only the
# mutable features; the seed pins the random search
dice_cf = exp.generate_counterfactuals(
    query_instance,
    total_CFs=5,
    desired_class="opposite",
    features_to_vary=mutable_features,
    random_seed=RANDOM_SEED,
)

# show them as a DataFrame
cf_df = dice_cf.cf_examples_list[0].final_cfs_df
print("Original instance:")
print(query_instance)
print("\nCounterfactuals:")
print(cf_df)


  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
100%|██████████| 1/1 [00:00<00:00, 13.32it/s]

Original instance:
   age  education-num  hours-per-week  capital-gain  capital-loss  sex  \
0   18              9              20             0             0    1   

   married  
0        1  

Counterfactuals:
   age  education-num  hours-per-week  capital-gain  capital-loss  sex  \
0   18              9              87         70990             0    1   
1   18             14              20         34031             0    1   
2   18              9              65         64676             0    1   
3   18              9              45         13071             0    1   
4   18              9              49         98193             0    1   

   married  income  
0        1       1  
1        1       1  
2        1       1  
3        1       1  
4        1       1  





In [10]:
import pandas as pd

# Work on a private copy of the counterfactuals DiCE produced for the query.
cf_df = dice_cf.cf_examples_list[0].final_cfs_df.copy()

# ---- original row, shaped like the counterfactual rows ----

# The query instance holds the X features only.
orig_df = query_instance.copy()

# When the counterfactuals carry an "income" column that the query lacks,
# fill it with the model's own prediction for this instance.
needs_income = "income" in cf_df.columns and "income" not in orig_df.columns
if needs_income:
    orig_df["income"] = rf.predict(query_instance)[0]

# Put the columns in the counterfactuals' order and label the rows.
orig_df = orig_df.loc[:, cf_df.columns]
orig_df.index = ["original"]
cf_df.index = [f"cf_{i+1}" for i in range(cf_df.shape[0])]

# Stack the original on top of its counterfactuals in one table.
cf_table = pd.concat([orig_df, cf_df])

print(cf_table)


          age  education-num  hours-per-week  capital-gain  capital-loss  sex  \
original   18              9              20             0             0    1   
cf_1       18              9              87         70990             0    1   
cf_2       18             14              20         34031             0    1   
cf_3       18              9              65         64676             0    1   
cf_4       18              9              45         13071             0    1   
cf_5       18              9              49         98193             0    1   

          married  income  
original        1       0  
cf_1            1       1  
cf_2            1       1  
cf_3            1       1  
cf_4            1       1  
cf_5            1       1  
