In [1]:
import pandas as pd

from carla.data.catalog import OnlineCatalog
from carla.models.negative_instances import predict_negative_instances

from mcce import MCCE

data_name = "adult"
data_name = 'give_me_some_credit'
data_name = 'compas'
K = 10000
n_test = 100
results_all = None

dataset = OnlineCatalog(data_name)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]


In [3]:
from carla import MLModel
from sklearn.ensemble import RandomForestClassifier

class RandomForestModel(MLModel):
    """The default way of implementing RandomForest from sklearn
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"""

    def __init__(self, data):
        super().__init__(data)

        # get preprocessed data
        df_train = self.data.df_train
        df_test = self.data.df_test
        
        encoded_features = list(self.data.encoder.get_feature_names(self.data.categorical))
        
        x_train = df_train[self.data.continuous + encoded_features]
        y_train = df_train[self.data.target]
        x_test = df_test[self.data.continuous + encoded_features]
        y_test = df_test[self.data.target]

        # print(x_train)

        self._feature_input_order = self.data.continuous + encoded_features

        param = {
            "max_depth": None,  # determines how deep the tree can go
            "n_estimators": 200,
            "min_samples_split": 3 # number of features to consider at each split
        }
        self._mymodel = RandomForestClassifier(**param)
        self._mymodel.fit(
                x_train,
                y_train,
            )

    @property
    def feature_input_order(self):
        # List of the feature order the ml model was trained on
        return self._feature_input_order

    @property
    def backend(self):
        # The ML framework the model was trained on
        return "xgboost"

    @property
    def raw_model(self):
        # The black-box model object
        return self._mymodel

    @property
    def tree_iterator(self):
        # make a copy of the trees, else feature names are not saved
        booster_it = [booster for booster in self.raw_model.get_booster()]
        # set the feature names
        for booster in booster_it:
            booster.feature_names = self.feature_input_order
        return booster_it

    # The predict function outputs
    # the continuous prediction of the model
    def predict(self, x):
        return self._mymodel.predict(self.get_ordered_features(x))

    # The predict_proba method outputs
    # the prediction as class probabilities
    def predict_proba(self, x):
        # print(self.get_ordered_features(x))
        return self._mymodel.predict_proba(self.get_ordered_features(x))

In [4]:
ml_model = RandomForestModel(dataset)

In [5]:
from sklearn import metrics

pred = ml_model.predict_proba(dataset.df_test)
pred = [row[1] for row in pred]
fpr, tpr, thresholds = metrics.roc_curve(dataset.df_test[dataset.target], pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8342733412882819

In [6]:
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]

y_col = dataset.target
features_and_response = dataset.df.columns
cont_feat = dataset.continuous
cat_feat = [x for x in features_and_response if x not in cont_feat] #  these have new names since encode_normalize_order_factuals()

if data_name == 'adult': 
    fixed_features = ['age', 'sex_Male']
elif data_name == 'give_me_some_credit':
    fixed_features = ['age']
elif data_name == 'compas':
    fixed_features = ['age', 'sex_Male', 'race_Other']

#  Create dtypes for MCCE()
dtypes = dict([(x, "float") for x in cont_feat])
for x in cat_feat:
    dtypes[x] = "category"
df = (dataset.df).astype(dtypes)

In [None]:
import time
start = time.time()

# (3) Fit MCCE object
print("Fitting MCCE model...")
mcce = MCCE(fixed_features=fixed_features, immutables=['age', 'sex'], model=ml_model, seed=1, continuous=dataset.continuous, categorical=dataset.categorical)
mcce.fit(df.drop(y_col, axis=1), dtypes)

print("Generating counterfactuals with MCCE...")
synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

In [None]:
mcce.postprocess(data=df, synth=synth_df, test=test_factual, \
    response=y_col, inverse_transform=dataset.inverse_transform, cutoff=0.5)

timing = time.time() - start
print(timing)

mcce.results_sparse['time (seconds)'] = timing

In [None]:
# import numpy as np
# data=df
# synth=synth_df
# test=test_factual
# response=y_col
# inverse_transform=dataset.inverse_transform
# cutoff=0.5
# # Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses


# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)


In [None]:
# from sklearn.neighbors import NearestNeighbors

# synth=synth_positive
# test=test_repeated

# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()

# synth.sort_index(inplace=True)


# # 1) Distance: Sparsity and Euclidean distance
# factual = test[features].sort_index().to_numpy()
# counterfactuals = synth[features].sort_index().to_numpy()

# delta = factual - counterfactuals # get_delta(factual, counterfactuals)

# d1 = np.sum(delta != 0, axis=1, dtype=float).tolist() # sparsity
# d2 = np.sum(np.abs(delta), axis=1, dtype=float).tolist() # manhatten distance
# d3 = np.sum(np.square(np.abs(delta)), axis=1, dtype=np.float).tolist() # euclidean distance

# synth_metrics['L0'] = d1
# synth_metrics['L1'] = d2
# synth_metrics['L2'] = d3

# # 3) kNN
# # neighb = yNN(data, synth, response, y=5)
# # synth_metrics['yNN'] = neighb


# # 4) Feasibility 
# # feas = feasibility(data, synth, response, y=5)
# # synth_metrics['feasibility'] = feas

# cols = data.columns
# cols.drop(response)

# feas_results = []
# nbrs = NearestNeighbors(n_neighbors=5).fit(synth[cols].values)

# for i, row in synth[cols].iterrows():
#     knn = nbrs.kneighbors(row.values.reshape((1, -1)), 5, return_distance=True)[0]
    
#     feas_results.append(np.mean(knn))

# synth_metrics['feasibility'] = feas_results

# # 5) Redundancy 
# # redund = redundancy(synth, test, model, response)
# # synth_metrics['redundancy'] = redund

# # 6) Success
# synth_metrics['success'] = 1

In [None]:
# synth.sort_index(inplace=True)

# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# df_decoded_cfs = inverse_transform(synth.copy())

# df_factuals = inverse_transform(test.copy())

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]
# # print(self.continuous)
# # print(self.immutables)
# # print(self.categorical)
# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# # print(continuous_violations)

# # check categorical by boolean comparison
# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# # print(cfs_categorical_immutable)
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]


# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable
# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# synth_metrics['violation'] = continuous_violations + categorical_violations


In [None]:
mcce.results_sparse.to_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_tree_model_k_{K}.csv")

## Load data

In [4]:
# import pandas as pd 
# K = 10000
# mcce_results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/adult_mcce_results_tree_model_k_{K}.csv", index_col=0)

In [None]:
# orig_preds = ml_model.predict_proba(test_factual)
# new_preds = []
# for x in orig_preds:
#     new_preds.append(x[1])

# test_inverse = dataset.inverse_transform(test_factual)
# test_inverse['pred'] = new_preds
# test_inverse[['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'pred']]

In [2]:
data_name = 'give_me_some_credit'
data_name = 'compas'
test_inverse = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_tree_model_n_100_inverse_transform.csv", index_col=0)
test_inverse['method'] = 'MCCE'

In [3]:
pd.set_option('display.max_columns', None)
results_inverse = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_tree_model_k_10000_n_100_inverse_transform.csv", index_col=0)
results_inverse['method'] = 'Original'

print(results_inverse.L0.mean())
print(results_inverse.L2.mean())
print(results_inverse.feasibility.mean())
print(results_inverse.violation.mean())
print(results_inverse.success.mean())
print(results_inverse.shape[0])
print(results_inverse['time (seconds)'].mean())

1.27
0.047482883353531846
0.0
0.0
1.0
100
2654.794924974441


In [10]:
dataset.df.columns

Index(['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents', 'SeriousDlqin2yrs'],
      dtype='object')

In [10]:
temp = pd.concat([test_inverse, results_inverse])
temp.index.to_list()[50]

286

In [13]:

if data_name == 'adult':
       cols = ['method', 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', \
              'hours-per-week', 'marital-status', 'native-country', \
              'occupation', 'race', 'relationship', 'sex', 'workclass']
       
       to_write = temp[cols].loc[[1, 31, 122, 124]].sort_index()
       to_write.columns = cols

elif data_name == 'give_me_some_credit':
       cols = ['method', 'age', 'RevolvingUtilizationOfUnsecuredLines', \
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', \
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', \
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', \
       'NumberOfDependents']

       to_write = temp[cols].loc[[287, 512, 1013, 1612]].sort_index()

       cols = ['method', 'Age', 'Unsec. Lines', \
       '30 Days Past', 'Debt Ratio', 'Month Inc', \
       'Credit Lines', '90 Days Late', \
       'Real Est. Loans', '60 Days Past', \
       'Nb Dep.']


       to_write.columns = cols

elif data_name == 'compas':
       cols = ['method', 'age', 'two_year_recid', 'priors_count', 'length_of_stay',
       'c_charge_degree', 'race', 'sex']

       to_write = temp[cols].loc[[67, 286]].sort_index()

       

In [25]:
if data_name == 'adult':
    feature = 'marital-status'
    dct = {'Married': 'M', 'Non-Married': 'NM'}
    to_write[feature] = [dct[item] for item in to_write[feature]]

    feature = 'native-country'
    dct = {'Non-US': 'NUS', 'US': 'US'}
    to_write[feature] = [dct[item] for item in to_write[feature]]

    feature = 'occupation'
    dct = {'Managerial-Specialist': 'MS', 'Other': 'O'}
    to_write[feature] = [dct[item] for item in to_write[feature]]

    feature = 'race'
    dct = {'White': 'W', 'Non-White': 'NW'}
    to_write[feature] = [dct[item] for item in to_write[feature]]


    feature = 'relationship'
    dct = {'Husband': 'H', 'Non-Husband': 'NH'}
    to_write[feature] = [dct[item] for item in to_write[feature]]

    feature = 'sex'
    dct = {'Male': 'M'}
    to_write[feature] = [dct[item] for item in to_write[feature]]


    feature = 'workclass'
    dct = {'Self-emp-not-inc': 'SENI', 'Private': 'P', 'Non-Private': 'NP'}
    to_write[feature] = [dct[item] for item in to_write[feature]]

In [14]:
print(to_write.to_latex(index=False, float_format="%.0f", ))

\begin{tabular}{lrrrrlll}
\toprule
   method &  age &  two\_year\_recid &  priors\_count &  length\_of\_stay & c\_charge\_degree &              race &   sex \\
\midrule
     MCCE &   22 &               0 &             0 &              57 &               F &             Other &  Male \\
 Original &   22 &               0 &             0 &               7 &               F &             Other &  Male \\
     MCCE &   32 &               1 &            12 &               1 &               F &  African-American &  Male \\
 Original &   32 &               1 &             4 &               1 &               F &  African-American &  Male \\
\bottomrule
\end{tabular}

