In [19]:
import numpy as np

def permute_data(data, one_hot_cols, rng=None):
    """
    Return a copy of ``data`` in which every row has exactly one feature resampled.

    A "feature" is either a single ordinary column or one whole group of
    one-hot / dummy-coded columns; for each row one feature is chosen
    uniformly at random.

    * Ordinary column: the value is replaced by a value drawn uniformly from
      that column's observed values in ``data``.
    * One-hot group: a category is drawn with probability equal to its
      frequency in ``data``, the whole group is zeroed and the drawn column
      is set to 1.0.  If the group's columns do not sum to ``len(data)`` the
      group is treated as dummy coded with an implicit reference class
      (all zeros) that receives the remaining probability mass.

    The resampled value may coincide with the original one, so a row is not
    guaranteed to change.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix.  It is never modified.
    one_hot_cols : list of list of column labels
        (Possibly empty) list of column groups that encode one categorical
        variable each.
    rng : None, int, numpy.random.Generator or numpy.random.RandomState
        Source of randomness.  ``None`` (default) keeps the historical
        behaviour of drawing from NumPy's global ``np.random`` state; an int
        is used as a seed for ``np.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``data``.
    """
    if rng is None:
        sampler = np.random
    elif isinstance(rng, (int, np.integer)):
        sampler = np.random.default_rng(rng)
    else:
        sampler = rng

    grouped = {c for group in one_hot_cols for c in group}
    normal_cols = [c for c in data.columns if c not in grouped]
    n_normal = len(normal_cols)
    n_features = n_normal + len(one_hot_cols)

    # Nothing to resample: avoid choice(0) and apply() on an empty frame.
    if data.empty or n_features == 0:
        return data.copy()

    n_rows = len(data)

    # Everything below depends only on `data`, not on the row, so it is
    # computed once instead of once per row.
    normal_values = {c: data[c].to_numpy() for c in normal_cols}

    groups = [list(group) for group in one_hot_cols]
    group_probs = []
    for group in groups:
        counts = data[group].sum(axis=0).to_numpy(dtype=float)
        reference = n_rows - counts.sum()
        if reference != 0:
            # may be dummy coded with reference class = 0: the rows not
            # covered by any column form one extra, implicit category
            counts = np.append(counts, reference)
        group_probs.append(counts / n_rows)

    def permute_row(row):
        # pandas does not support mutating the object passed to apply();
        # on single-dtype frames it can be a view of `data` itself.
        row = row.copy()

        # choose which feature to resample
        chosen = sampler.choice(n_features)
        if chosen < n_normal:
            col = normal_cols[chosen]
            row[col] = sampler.choice(normal_values[col])
        else:
            group_idx = chosen - n_normal
            group = groups[group_idx]
            probs = group_probs[group_idx]

            # Draw a position; the position past the last column (only
            # present for dummy coding) stands for the reference class.
            pick = sampler.choice(len(probs), p=probs)

            row[group] = 0
            if pick < len(group):
                row[group[pick]] = 1.0

        return row

    return data.apply(permute_row, axis=1)

def test_data_perturb(model, input_features, one_hot_cols, n_tests=10):
    """
    Estimate how often a single-feature perturbation flips the model's prediction.

    The data is resampled ``n_tests`` times with ``permute_data``; each time
    the model's predictions on the resampled rows are compared with its
    predictions on the original rows.

    Labels are assumed to be true/false (booleans, or 0/1 values that are
    coerced to booleans).

    Parameters
    ----------
    model : object with a ``predict(features)`` method
    input_features : pandas.DataFrame
        Features the model is evaluated on.
    one_hot_cols : list of list of column labels
        (Possibly empty) list of lists of columns that are one-hot encoded.
    n_tests : int
        Number of independent resampling rounds; must be at least 1.

    Returns
    -------
    (false_to_true, true_to_false)
        Fraction of all ``n_tests * n_rows`` comparisons in which the
        prediction changed from False to True, and from True to False.

    Raises
    ------
    ValueError
        If ``n_tests`` is smaller than 1.
    """
    if n_tests < 1:
        # Otherwise the final division is 0/0 (or yields a meaningless -0.0).
        raise ValueError("n_tests must be a positive integer")

    # Coerce to boolean arrays so that `~` and `&` are logical operations
    # regardless of whether predict() returns bools, 0/1 ints or 0.0/1.0.
    preds = np.asarray(model.predict(input_features)).astype(bool)
    false_to_true = 0
    true_to_false = 0

    for _ in range(n_tests):
        shuff_data = permute_data(input_features, one_hot_cols)
        shuff_preds = np.asarray(model.predict(shuff_data)).astype(bool)

        false_to_true += (~preds & shuff_preds).sum()
        true_to_false += (preds & ~shuff_preds).sum()

    n_comparisons = n_tests * len(preds)
    return false_to_true / n_comparisons, true_to_false / n_comparisons

In [3]:
import pandas as pd

# Explicit dtype for every column, passed to read_csv below so that pandas
# parses each column as declared here.
column_types = dict(
    [
        ("race", "category"),
        ("gender", "category"),
        ("zip", "category"),
        ("income", float),
        ("type", "category"),
        ("interest", float),
        ("term", float),
        ("principal", float),
        ("approved", bool),
        ("adj_bls_2", float),
        ("id", str),
    ]
)
data = pd.read_csv(
    "./clean_data.csv",
    dtype=column_types,
)

In [22]:
# NOTE(review): X_gend is only assigned in the model-fitting cell further
# down (In [21]); run top-to-bottom on a fresh kernel this cell raises
# NameError. Move this cell below that one.
X_gend

Unnamed: 0,female,male,non-binary
0,0,1,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,1,0
...,...,...,...
2395,1,0,0
2396,0,1,0
2397,1,0,0
2398,1,0,0


In [21]:
from sklearn.linear_model import LogisticRegression

# Categorical predictors as indicator columns: loan type with drop_first=True,
# gender with one column per level.
X_cat = pd.get_dummies(data["type"], drop_first=True)
X_gend = pd.get_dummies(data["gender"])

# Design matrix: numeric predictors followed by the two indicator blocks.
X = pd.concat(
    [data[["income", "interest", "term", "adj_bls_2"]], X_cat, X_gend],
    axis=1,
)
y = data["approved"]

lr = LogisticRegression()
lr.fit(X, y)

LogisticRegression()

In [23]:
# permute_data draws from NumPy's global random state; seed it so the flip
# rates are reproducible on Restart & Run All. Re-run this cell to refresh
# the recorded output, which was produced without a seed.
np.random.seed(0)
test_data_perturb(lr, X, [["home", "personal"], ["female", "male", "non-binary"]])

(0.061375, 0.04783333333333333)