In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from data import Data

# Column names in the order handed to Data(...) further down in this cell.
feature_order = [
    'age',
    'workclass',
    'fnlwgt',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'country',
    'income',
]

# Columns typed as "float"; every other column in feature_order is typed "category".
_float_features = (
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
)

# dtype per column, keyed in the same order as feature_order.
dtypes = {
    name: ("float" if name in _float_features else "category")
    for name in feature_order
}

# Name of the response column.
response = 'income'

# Features listed as fixed when building the dataset wrapper.
fixed_features = ['age', 'sex']

# Location of the Adult training CSV. Defaults to the original hard-coded path, but can be
# overridden with the MCCE_ADULT_TRAIN_CSV environment variable so the notebook can run on
# machines where that absolute path does not exist.
path = os.environ.get(
    'MCCE_ADULT_TRAIN_CSV',
    '/nr/samba/user/anr/pkg/MCCE/Datasets/Adult/train_not_normalized_data_from_carla.csv',
)

# Build the dataset wrapper from the schema defined above.
# NOTE(review): "OneHot_drop_first" and "MinMax" are presumably the encoding and scaling
# options of Data — confirm against data.py.
dataset = Data(path, feature_order, dtypes, response, fixed_features, "OneHot_drop_first", "MinMax")

In [2]:
from carla import MLModel
from sklearn.ensemble import RandomForestClassifier

class RandomForestModel(MLModel):
    """scikit-learn random forest classifier exposed through CARLA's ``MLModel`` interface.

    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    The forest is fitted in ``__init__`` on ``data.df``, using the continuous
    columns followed by the encoded categorical columns as features and
    ``data.target`` as the label.
    """

    def __init__(self, data, random_state=None):
        """Fit the forest on the dataset wrapper ``data``.

        Parameters
        ----------
        data
            Dataset wrapper providing ``df``, ``continuous``,
            ``categorical_encoded`` and ``target``.
        random_state : int or None, optional
            Forwarded to ``RandomForestClassifier``. The default ``None`` keeps
            the previous (unseeded) behaviour; pass an int for a reproducible fit.
        """
        super().__init__(data)

        # get preprocessed data
        df_train = self.data.df

        # Column order used for fitting; exposed through `feature_input_order`.
        self._feature_input_order = self.data.continuous + self.data.categorical_encoded

        x_train = df_train[self._feature_input_order]
        y_train = df_train[self.data.target]

        param = {
            "max_depth": None,  # no depth limit on the individual trees
            "n_estimators": 5,  # number of trees in the forest
            "min_samples_split": 3,  # minimum number of samples required to split an internal node
            "random_state": random_state,
        }
        self._mymodel = RandomForestClassifier(**param)
        self._mymodel.fit(x_train, y_train)

    @property
    def feature_input_order(self):
        """List of feature column names in the order the forest was fitted on."""
        return self._feature_input_order

    @property
    def backend(self):
        """Backend identifier reported by this model.

        NOTE(review): the wrapped model is a scikit-learn estimator, yet this
        reports "TensorFlow". Left unchanged because callers may depend on the
        value — confirm which identifier the consuming code expects.
        """
        return "TensorFlow"

    @property
    def raw_model(self):
        """The fitted ``RandomForestClassifier`` instance."""
        return self._mymodel

    @property
    def tree_iterator(self):
        """List of the fitted decision trees of the forest.

        ``RandomForestClassifier`` exposes its fitted trees through
        ``estimators_``; ``get_booster()`` is an XGBoost API that does not exist
        on scikit-learn forests and raised ``AttributeError`` here.
        """
        # Return a new list so callers cannot reorder the forest's own container.
        return list(self.raw_model.estimators_)

    def predict(self, x):
        """Return predicted class labels for ``x``.

        ``x`` is first passed through ``self.get_ordered_features`` (not defined
        in this class — presumably inherited from ``MLModel``).
        """
        return self._mymodel.predict(self.get_ordered_features(x))

    def predict_proba(self, x):
        """Return per-class probabilities for ``x``.

        ``x`` is first passed through ``self.get_ordered_features`` (not defined
        in this class — presumably inherited from ``MLModel``).
        """
        return self._mymodel.predict_proba(self.get_ordered_features(x))

Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]


In [3]:
# Instantiate the wrapper; the forest is fitted inside the constructor.
ml_model = RandomForestModel(data=dataset)

In [6]:
import numpy as np
# Probability of the second class (column 1 of predict_proba) for every row of the dataset.
preds = ml_model.predict_proba(dataset.df)[:,1]

# Rows scored below 0.5 are the candidate factuals.
negative_mask = preds < 0.5
# Positional indices of those rows, kept for any later cell that reads them.
factual_id = np.where(negative_mask)

# Select with the boolean mask so rows are matched by position. Passing the np.where
# positions to .loc looks them up as index *labels*, which is only correct when the
# frame's index is exactly 0..n-1.
factuals = dataset.df.loc[negative_mask]

# Keep the first five factuals for the demonstration below.
test_factual = factuals.iloc[:5]

In [7]:
# test_factual

In [8]:
# Column groups exposed by the dataset wrapper.
y_col = dataset.target
cont_feat = dataset.continuous
cat_feat = dataset.categorical
cat_feat_encoded = dataset.categorical_encoded

# dtype map for MCCE(): continuous columns as "float", encoded categorical columns as
# "category" (the latter wins if a name appears in both groups).
dtypes = {
    **{name: "float" for name in cont_feat},
    **{name: "category" for name in cat_feat_encoded},
}

# Cast the preprocessed frame to those dtypes.
df = dataset.df.astype(dtypes)

In [12]:
from mcce import MCCE

# Configure MCCE with the dataset's feature groups and the fitted classifier.
mcce = MCCE(
    fixed_features=dataset.fixed_features,
    fixed_features_encoded=dataset.fixed_features_encoded,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
    model=ml_model,
    seed=1,
)

# Fit on the frame with the target column removed, passing the dtype map alongside.
mcce.fit(df.drop(columns=dataset.target), dtypes)


In [17]:
# Call MCCE's generator on the selected factuals (target column removed) with k=1000.
synth_df = mcce.generate(
    test_factual.drop(columns=dataset.target),
    k=1000,
)

In [18]:
# Post-process the generated samples against the factuals, using a 0.5 cutoff.
mcce.postprocess(
    data=df,
    synth=synth_df,
    test=test_factual,
    response=y_col,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)


In [19]:
# Display the results_sparse attribute of the MCCE object (rendered as the table below).
mcce.results_sparse

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,...,L1,L2,feasibility,success,violation
0,0.30137,0.09492,0.866667,0.0,0.0,...,1.139195,1.007497,1.003741,1,0
1,0.452055,0.2766,0.8,0.0,0.0,...,0.504059,0.12814,0.357967,1,0
2,0.287671,0.028174,0.333333,0.054551,0.0,...,1.363957,1.054946,1.027105,1,0
3,0.493151,0.21442,0.6,0.0,0.0,...,0.263933,0.044087,0.20997,1,0
4,0.150685,0.081456,0.8,0.0,0.0,...,3.139179,3.019371,1.737634,1,0


In [None]:
# Step-by-step variant of the call below, run with the same inputs as the postprocess
# cell above so the intermediate frames can be inspected.
# mcce.postprocess(data=df, synth=synth_df, test=test_factual, response=dataset.response, \
#     inverse_transform=min_max_scaler, cutoff=0.5)

data = df
# Work on a copy: the predicted response is written into this frame below, and writing it
# through an alias would mutate synth_df and make the cell non-idempotent.
synth = synth_df.copy()
test = test_factual
response = dataset.target
inverse_transform = dataset.inverse_transform
cutoff = 0.5

mcce.cutoff = cutoff

# Predict response of generated data
synth[response] = mcce.model.predict(synth)
synth_positive = synth[synth[response] >= cutoff]  # drop negative responses

# Duplicate original test observations N times where N is number of positive counterfactuals
n_counterfactuals = synth_positive.groupby(synth_positive.index).size().to_frame('N')

# join() returns a new frame, so `test` itself is left untouched.
test_repeated = test.join(n_counterfactuals)

# Drop only the factuals without any positive counterfactual (N missing after the join);
# a bare dropna() would also discard factuals with a NaN in any other column.
test_repeated = test_repeated.dropna(subset=['N'])

# The join turns N into float whenever a factual had no match; repeat() needs integer counts.
repeat_counts = test_repeated['N'].astype(int)
test_repeated = test_repeated.reindex(test_repeated.index.repeat(repeat_counts))
test_repeated = test_repeated.drop(columns=['N'])

mcce.test_repeated = test_repeated


In [None]:
# Compute the metrics for the positive samples against their repeated factuals.
results = mcce.calculate_metrics(
    synth=synth_positive,
    test=test_repeated,
    data=data,
    model=mcce.model,
    response=response,
    inverse_transform=inverse_transform,
)