In [2]:
import warnings
warnings.filterwarnings('ignore')

from mcce.data import Data
import pandas as pd

# Feature names, in the order handed to Data().
feature_order = [
    'age',
    'workclass',
    'fnlwgt',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'country',
    'income',
]

# dtype requested for each column: "float" or "category".
dtypes = {
    "age": "float",
    "workclass": "category",
    "fnlwgt": "float",
    "education-num": "float",
    "marital-status": "category",
    "occupation": "category",
    "relationship": "category",
    "race": "category",
    "sex": "category",
    "capital-gain": "float",
    "capital-loss": "float",
    "hours-per-week": "float",
    "country": "category",
    "income": "category",
}

# Name of the response column.
response = 'income'

# Features passed to Data() as fixed.
fixed_features = ['age', 'sex']

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
path = '/nr/samba/user/anr/pkg/MCCE/Datasets/Adult/train_not_normalized_data_from_carla.csv'
dataset = Data(
    path,
    feature_order,
    dtypes,
    response,
    fixed_features,
    "OneHot_drop_first",
    "MinMax",
)

ModuleNotFoundError: No module named 'method'

In [9]:
from sklearn.ensemble import RandomForestClassifier

class RandomForestModel():
    """Wrapper around scikit-learn's ``RandomForestClassifier``.

    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    The forest is fitted at construction time on ``data.df``. The feature
    columns are ``data.continuous + data.categorical_encoded`` and the label
    column is ``data.target``.
    """

    def __init__(self, data, random_state=None):
        """Fit the forest on the preprocessed frame held by ``data``.

        Parameters
        ----------
        data :
            Object exposing ``df``, ``continuous``, ``categorical_encoded``
            and ``target``.
        random_state : int or None, optional
            Forwarded to ``RandomForestClassifier`` so the fit can be made
            reproducible. The default ``None`` keeps the original unseeded
            behaviour.
        """
        # Column order used for fitting; the same order is enforced at prediction time.
        feature_columns = data.continuous + data.categorical_encoded
        self._feature_input_order = feature_columns

        # get preprocessed data
        df_train = data.df
        x_train = df_train[feature_columns]
        y_train = df_train[data.target]

        param = {
            "max_depth": None,  # no limit on how deep each tree can grow
            "n_estimators": 5,  # number of trees in the forest
            "min_samples_split": 3,  # minimum number of samples required to split an internal node
            "random_state": random_state,
        }
        self._mymodel = RandomForestClassifier(**param)
        self._mymodel.fit(x_train, y_train)

    @property
    def feature_input_order(self):
        """Feature column names in the order the model was fitted with."""
        return self._feature_input_order

    @property
    def backend(self):
        """Backend identifier string.

        NOTE(review): the wrapped model is a scikit-learn estimator, yet this
        reports "TensorFlow". The value is left unchanged because callers may
        branch on it; confirm what the consumer expects.
        """
        return "TensorFlow"

    @property
    def raw_model(self):
        """The underlying fitted ``RandomForestClassifier``."""
        return self._mymodel

    @property
    def tree_iterator(self):
        """Return the individual fitted trees of the forest as a new list.

        ``RandomForestClassifier`` has no ``get_booster()`` (that is an
        XGBoost API); its fitted trees are stored in ``estimators_``.
        """
        return list(self.raw_model.estimators_)

    def predict(self, x):
        """Predict class labels for ``x`` after reordering its columns."""
        return self._mymodel.predict(self.get_ordered_features(x))

    def predict_proba(self, x):
        """Predict class probabilities for ``x`` after reordering its columns."""
        return self._mymodel.predict_proba(self.get_ordered_features(x))

    def get_ordered_features(self, x):
        """Select the model's feature columns from ``x`` in fitting order."""
        return x[self.feature_input_order]
        

In [10]:
# Fit the random-forest classifier on the preprocessed dataset.
ml_model = RandomForestModel(data=dataset)

In [11]:
import numpy as np
# Predicted probability of the class at column index 1, for every row of the training frame.
preds = ml_model.predict_proba(dataset.df)[:, 1]
# Positions (not index labels) of the rows predicted below the 0.5 cutoff.
factual_id = np.where(preds < 0.5)[0]
# np.where yields positional indices, so select with .iloc; .loc would treat
# them as labels and is only correct when the index is a default RangeIndex.
factuals = dataset.df.iloc[factual_id]
# Keep the first five factuals as the test instances.
test_factual = factuals.iloc[:5]

In [12]:
# Shorthand handles for the column metadata exposed by `dataset`.
y_col = dataset.target
cont_feat = dataset.continuous
cat_feat = dataset.categorical
cat_feat_encoded = dataset.categorical_encoded

# dtype map for MCCE(): continuous features as "float", encoded categorical
# features as "category" (an encoded name also listed as continuous ends up "category").
dtypes = dict.fromkeys(cont_feat, "float")
dtypes.update(dict.fromkeys(cat_feat_encoded, "category"))
df = dataset.df.astype(dtypes)

In [13]:
from mcce import MCCE

# Configure MCCE with the feature metadata held by `dataset` and the fitted classifier.
mcce = MCCE(
    fixed_features=dataset.fixed_features,
    fixed_features_encoded=dataset.fixed_features_encoded,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
    model=ml_model,
    seed=1,
)

# Fit on the features only (response column removed).
mcce.fit(df.drop(columns=dataset.target), dtypes)

# Generate k=100 samples for the test factuals.
synth_df = mcce.generate(test_factual.drop(columns=dataset.target), k=100)

# Post-process the generated samples against the test factuals with a 0.5 cutoff.
mcce.postprocess(
    data=df,
    synth=synth_df,
    test=test_factual,
    response=y_col,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

In [17]:
# Results stored by MCCE after post-processing.
results = mcce.results_sparse
# Apply the dataset's inverse transform, then show the columns in feature order.
results_original = dataset.inverse_transform(results)
results_original[dataset.feature_order]

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,income
0,39.0,3,1455435.0,13.0,0,3,0,0,0,0.0,0.0,72.0,0,1
2,38.0,0,248694.0,10.0,0,3,0,0,0,0.0,0.0,40.0,0,1
3,53.0,0,215990.0,9.0,0,3,0,0,0,0.0,0.0,40.0,0,1
4,28.0,0,132686.0,14.0,0,0,0,0,1,0.0,0.0,40.0,0,1
5,37.0,0,336880.0,14.0,0,2,0,0,1,0.0,0.0,60.0,0,1


In [23]:
# Inverse-transformed test factuals, for comparison with the MCCE results.
# A bare last expression gives the rich DataFrame display instead of print()'s wrapped text.
dataset.inverse_transform(test_factual)[dataset.feature_order]

    age  workclass    fnlwgt  education-num  marital-status  occupation  \
0  39.0          3   77516.0           13.0               1           3   
2  38.0          0  215646.0            9.0               2           3   
3  53.0          0  234721.0            7.0               0           3   
4  28.0          0  338409.0           13.0               0           0   
5  37.0          0  284582.0           14.0               0           2   

   relationship  race  sex  capital-gain  capital-loss  hours-per-week  \
0             1     0    0        2174.0           0.0            40.0   
2             1     0    0           0.0           0.0            40.0   
3             0     1    0           0.0           0.0            40.0   
4             3     1    1           0.0           0.0            40.0   
5             3     0    1           0.0           0.0            40.0   

   country  income  
0        0       0  
2        0       0  
3        0       0  
4        3       0  

In [18]:
# Inverse-transformed MCCE results, with columns in feature order.
# A bare last expression gives the rich DataFrame display instead of print()'s wrapped text.
dataset.inverse_transform(results)[dataset.feature_order]

    age  workclass     fnlwgt  education-num  marital-status  occupation  \
0  39.0          3  1455435.0           13.0               0           3   
2  38.0          0   248694.0           10.0               0           3   
3  53.0          0   215990.0            9.0               0           3   
4  28.0          0   132686.0           14.0               0           0   
5  37.0          0   336880.0           14.0               0           2   

   relationship  race  sex  capital-gain  capital-loss  hours-per-week  \
0             0     0    0           0.0           0.0            72.0   
2             0     0    0           0.0           0.0            40.0   
3             0     0    0           0.0           0.0            40.0   
4             0     0    1           0.0           0.0            40.0   
5             0     0    1           0.0           0.0            60.0   

   country income  
0        0      1  
2        0      1  
3        0      1  
4        0      1 