## Example black box audit with BBA library
### p 190

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pylab

from sklearn.linear_model import LogisticRegression as LR

from BlackBoxAuditing.model_factories import SVM

from BlackBoxAuditing.data import load_from_file
from BlackBoxAuditing.model_factories.AbstractModelFactory import AbstractModelFactory
from BlackBoxAuditing.model_factories.AbstractModelVisitor import AbstractModelVisitor

import pandas as pd
import numpy as np
import random

import BlackBoxAuditing as BBA

import pickle

In [None]:
## p 190
# Ask the BlackBoxAuditing package for its "ricci" data set; the cells below
# inspect what comes back.
ricci_data = BBA.load_data("ricci")

In [None]:
# Inspect the container type returned by the loader.
type(ricci_data)

In [None]:
# Number of top-level elements in the loaded object.
len(ricci_data)

In [None]:
# Print each element's position followed by that element's type.
# The type is taken from the element currently being visited, so every
# position reports its own type rather than repeating the type of element 0.
for i, element in enumerate(ricci_data):
    print(i)
    print(type(element))

In [None]:
# Show element 0, which is used further down as the DataFrame's column names.
ricci_data[0]

In [None]:
# Size of element 1 -- presumably the training rows; TODO confirm against the BBA docs.
len(ricci_data[1])

In [None]:
# Peek at the first ten entries of element 1.
ricci_data[1][:10]

In [None]:
# Build a DataFrame from element 2 of the loaded data and label its columns
# with element 0 in a single fluent expression.
df = pd.DataFrame(ricci_data[2]).set_axis(ricci_data[0], axis=1)

In [None]:
# Row counts per value of 'Race' (count() reports non-null entries for each remaining column).
df.groupby('Race').count()

In [None]:
# Row counts per value of 'Position'.
df.groupby('Position').count()

In [None]:
# Row counts for every ('Position', 'Race') combination present in the frame.
df.groupby(['Position', 'Race']).count()

In [None]:
## p 192
# First ten entries of element 2 -- the same rows the DataFrame `df` was built from.
ricci_data[2][:10]

In [None]:
# Run a BlackBoxAuditing audit on the ricci data:
#   1. create an Auditor,
#   2. set its ModelFactory attribute to BBA's SVM factory,
#   3. call the auditor on the data, passing "ricci-audit-output" as output_dir
#      (the accuracy file read in the next cell lives under that directory).
auditor                        = BBA.Auditor()
auditor.ModelFactory           = SVM
auditor(ricci_data, output_dir ="ricci-audit-output")

In [None]:
## p 196
# Read the accuracy table from the audit's output directory as CSV and display it.
# NOTE(review): the helpers below treat the first column as a label (they skip it)
# and each row as one repair step -- confirm against the file's header.
acc_data = pd.read_csv("ricci-audit-output/accuracy.png.data")
print(acc_data)

In [None]:
## p 197
def influence(df):
    """Return the first row of *df* minus its last row, ignoring the first column.

    Each row is taken as a Series and everything after its first entry is
    kept, so the result is a Series indexed by the remaining column names.
    """
    first_row, last_row = df.iloc[0], df.iloc[-1]
    return first_row[1:].sub(last_row[1:])

# Apply the first-minus-last-row difference to the audit's accuracy table.
influence(acc_data)

In [None]:
def influence_partial_repair(df, repair_row=5):
    """Return the first row of *df* minus the row at *repair_row*, ignoring the first column.

    Args:
        df: Table whose rows are compared; the first column is skipped.
        repair_row: Positional index (as understood by ``DataFrame.iloc``) of
            the row to subtract. Defaults to 5, the row this notebook uses as
            its partial-repair reference.
            NOTE(review): row 5 presumably corresponds to a 0.5 repair level
            when the table has 11 evenly spaced rows -- confirm against the data.

    Returns:
        A Series indexed by the remaining column names.

    Raises:
        IndexError: if *repair_row* is outside the table.
    """
    return df.iloc[0][1:] - df.iloc[repair_row][1:]

# Apply the first-row-minus-row-5 difference to the audit's accuracy table.
influence_partial_repair(acc_data)

In [None]:
## p 197
# Per-column gap between the full difference (first row minus last row) and the
# partial one (first row minus row 5), drawn as one bar per column.
deltas = influence(acc_data) - influence_partial_repair(acc_data)
plt.bar(x = deltas.index, height = deltas.values)

## Auditing a proprietary data set + black box model
### p 197

### First generate a proprietary data set (you can also use a real one, but I generate synthetic data for convenience)

In [None]:
## first produce the data
## not covered in book, just background code needed to run example
SAMPLE_SIZE = 1000

# Credit scores: standard-normal draws scaled by 100 and shifted to 600.
credit_score = np.random.randn(SAMPLE_SIZE) * 100 + 600

# Gender labels sampled with the given weights (48% / 48% / 2% / 2%).
gender = np.array(
    random.choices(
        ["female", "male", "non-binary", "prefer not to answer"],
        weights=[0.48, 0.48, 0.02, 0.02],
        k=SAMPLE_SIZE,
    )
)

# Ages are drawn uniformly from 18..79; employment length is a random
# fraction of the years since age 18, rounded to the nearest integer.
age = np.array(random.choices(range(18, 80), k=SAMPLE_SIZE))
length_employment = np.rint((age - 18) * np.random.uniform(size=SAMPLE_SIZE))

# Score = credit score times employment length, plus integer noise in [-1000, 999].
score_noise = np.array(random.choices(range(-1000, 1000), k=SAMPLE_SIZE))
employee_score = credit_score * length_employment + score_noise

# Hiring rule: hired when the score exceeds 9500, or when it exceeds 9000 and
# the person is either labelled "male" or younger than 50. Stored as 0.0 / 1.0.
hire = np.logical_or(
    np.logical_and(employee_score > 9000, np.logical_or(gender == "male", age < 50)),
    employee_score > 9500,
).astype(float)

# 0/1 indicator columns for the gender labels. The comparison strings must
# match the sampled labels exactly (note the hyphen in "non-binary"), otherwise
# the indicator is silently all zeros. "prefer not to answer" has no indicator
# of its own and is represented by all three being 0.
female = np.where(gender == "female", 1, 0)
male = np.where(gender == "male", 1, 0)
nonbinary = np.where(gender == "non-binary", 1, 0)

df = pd.DataFrame(
    {
        'credit_score'     : credit_score,
        'gender'           : gender,
        'age'              : age,
        'length_employment': length_employment,
        'employee_score'   : employee_score,
        'female'           : female,
        'male'             : male,
        'nonbinary'        : nonbinary,
        'hire'             : hire,
    })

# Columns written to disk: the raw string 'gender' column is left out (only
# its numeric indicators are kept) and 'hire' is last.
col_names = ['credit_score', 'age',
             'length_employment', 'employee_score',
             'female', 'male', 'nonbinary',
             'hire']

df.to_csv("synth_data.csv",
          index=False,
          columns=col_names)

### Example with proprietary data starts here in earnest

In [None]:
## p 198
# Load the synthetic CSV through BBA's file loader:
#   - correct_types supplies `float` once per entry of col_names,
#   - 'hire' is named as the response column,
#   - train_percentage is set to 0.5 (presumably a 50/50 train/test split; TODO confirm).
# Elements 1 and 2 of the result are used below as the train and test rows.
synthetic_data = load_from_file("synth_data.csv", 
                                correct_types = np.repeat([float], [len(col_names)]), 
                                response_header = 'hire',
                               train_percentage = 0.5)

### But then another detour to build the "proprietary model" that will be opaque to BBA

In [None]:
# Wrap elements 1 and 2 of the loaded data in DataFrames (used below as the
# train and test sets respectively).
train_data = pd.DataFrame(synthetic_data[1])
test_data = pd.DataFrame(synthetic_data[2])

# Label both frames with the column order that was written to the CSV.
train_data.columns = col_names
test_data.columns = col_names

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Features are every column except the last; the last column ('hire' per
# col_names) is the target.
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]

In [None]:
# Fit a scikit-learn LogisticRegression with default settings on the training split.
lr = LR().fit(X, Y)

In [None]:
# Same feature/target split for the test frame, then report the model's score
# on it (for scikit-learn classifiers, score() is mean accuracy).
X_test = test_data.iloc[:, :-1]
Y_test = test_data.iloc[:, -1]
lr.score(X_test, Y_test)

In [None]:
# Write the fitted model to 'lr.pickle'; HirePredictor reads this file back later.
with open( 'lr.pickle', 'wb' ) as f:
    pickle.dump(lr, f )

In [None]:
# Read the pickled model back in under a second name, lr2.
with open( 'lr.pickle', 'rb' ) as f:
    lr2 = pickle.load(f)

### Return to on-topic example of auditing a black box model

In [None]:
## p 199
class HirePredictorBuilder(AbstractModelFactory):
    """Model factory whose ``build`` returns a ``HirePredictor``."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base factory and set the display name."""
        super().__init__(*args, **kwargs)
        self.verbose_factory_name = "HirePredictor"

    def build(self, train_set):
        """Return a new ``HirePredictor``.

        ``train_set`` is accepted but not used: the predictor loads an
        already-fitted model instead of training one here.
        """
        return HirePredictor()

class HirePredictor(AbstractModelVisitor):
    """Model visitor that answers with predictions from the model pickled in 'lr.pickle'."""

    def __init__(self):
        """Load the pickled model from 'lr.pickle' in the working directory."""
        # NOTE: pickle.load can execute arbitrary code from the file. This is
        # fine for a pickle written by this notebook; do not point it at a
        # file from an untrusted source.
        with open('lr.pickle', 'rb') as f:
            self.lr = pickle.load(f)

    def test(self, test_set, test_name=""):
        """Return ``[actual, predicted]`` pairs for every row of *test_set*.

        Args:
            test_set: Iterable of rows; the last entry of each row is taken
                as the actual response and the preceding entries as features.
            test_name: Accepted for interface compatibility; not used.

        Returns:
            A list with one ``[actual, predicted]`` entry per row, where
            ``predicted`` is a one-element array. An empty *test_set* yields
            an empty list.
        """
        rows = list(test_set)
        if not rows:
            # Nothing to score; also avoids calling predict on an empty matrix.
            return []

        # Score all rows with a single predict call instead of one call per row.
        features = np.array([row[:-1] for row in rows])
        predictions = self.lr.predict(features)

        # reshape(-1, 1) makes each prediction a one-element array, the same
        # shape a single-row predict call returns.
        return [[row[-1], prediction]
                for row, prediction in zip(rows, predictions.reshape(-1, 1))]


In [None]:
# Run a BlackBoxAuditing audit on the synthetic data, this time setting the
# auditor's ModelFactory to HirePredictorBuilder (defined in this notebook);
# "synthetic-audit-output" is passed as output_dir.
auditor                            = BBA.Auditor()
auditor.ModelFactory               = HirePredictorBuilder
auditor(synthetic_data, output_dir = "synthetic-audit-output")