In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx

In [16]:
data = pd.read_csv('data/synth_data_for_training.csv')

## Function for mutation testing

In [113]:
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument


def test_mutation(data, feature_name : str, is_Fraud: bool ,value_from : int, value_to: int):
    data_modified = data.copy() 
    is_checked = 1 if is_Fraud else 0
    test_data= data_modified.loc[data_modified['checked'] == is_checked]
    test_data = test_data.loc[test_data[feature_name] == value_from]
    data_wout = test_data.copy()
    y_wout = data_wout['checked']
    X_wout= data_wout.drop(['checked'], axis=1)
    X_wout = X_wout.astype(np.float32)
    test_data[feature_name] = value_to
    y_test = test_data['checked']
    X_test= test_data.drop(['checked'], axis=1)
    X_test = X_test.astype(np.float32)
    session = rt.InferenceSession("model/model_1.onnx")
    try:
        y_pred_onnx1 =  session.run(None, {'X': X_wout.values.astype(np.float32)})
        y_pred_onnx2 =  session.run(None, {'X': X_test.values.astype(np.float32)})
        y_pred_onnx1_np = np.array(y_pred_onnx1[0])
        y_pred_onnx2_np = np.array(y_pred_onnx2[0])
        diff_count = np.sum(y_pred_onnx1_np != y_pred_onnx2_np)
        return diff_count, len(y_pred_onnx1_np)
    except InvalidArgument:
        print("Got error, ", "column is: ", feature_name, " combo is:", is_Fraud, value_from, value_to)
        return 0, 0
    # assert y_pred_onnx1[0].all() == y_pred_onnx2[0].all(), f'Model predictions are different. The model has bias towards {feature_name} with value {value_from} '
    # return True

## Test Cases

In [80]:
test_mutation(data, 'persoon_geslacht_vrouw', True, 0, 1) #test if changing gender of women who are frauds to men to see if predictions change to not fraud

(16, 663)

In [81]:
test_mutation(data, 'persoon_geslacht_vrouw', False, 0, 1) #test if changing gender of men who are not frauds to women to see if predictions change to not fraud

(11, 5879)

In [82]:
test_mutation(data, 'persoonlijke_eigenschappen_taaleis_voldaan', True, 0, 1) #test if changing non dutch speaker people who are frauds to dutch speaking to see if predictions change to not fraud

(21, 708)

In [83]:
test_mutation(data, 'persoonlijke_eigenschappen_taaleis_voldaan', False, 1, 0) #test if changing dutch speaker people who are not frauds to non-dutch speaking to see if predictions change to fraud

(13, 6558)

In [84]:
test_mutation(data, 'persoonlijke_eigenschappen_taaleis_voldaan', True, 2, 1) #test if changing people who are frauds and did no take the dutch exam to dutch speaking to see if predictions change to not fraud

(4, 104)

### Making a generic method that tests all combinations of feature values in mutation testing

In [114]:
df = pd.read_csv('data/synth_data_for_training.csv')
mutation_test_results = {}

# Iterate over each column in the DataFrame and generate the mutation test combinations
for column in df.columns:
    # Get unique values for the current column
    unique_values = df[column].unique()
    if len(unique_values) > 1:
        # We sample two random values from the unique options, and store this in the dictionary
        rand_values = np.random.choice(unique_values, size=2, replace=False)
        combos = []
        for bool in [True, False]:
            for a in rand_values:
                for b in rand_values:
                    combos.append([bool, a, b])
        
        mutation_test_results[column] = combos

# Go over each combination and get how many tests failed to kill the mutation
for column, values in mutation_test_results.items():
    for combo in values:
        num_differ, length = test_mutation(data, column, combo[0], combo[1], combo[2])
        combo.append(num_differ)
        combo.append(length)

# features_to_ignore = ["adres_dagen_op_adres", "afspraak_aantal_woorden", "afspraak_laatstejaar_aantal_woorden", "belemmering_dagen_financiele_problemen", 
#                       "belemmering_dagen_lichamelijke_problematiek", "belemmering_dagen_psychische_problemen", "contacten_onderwerp_overige", "contacten_onderwerp_terugbelverzoek", "contacten_onderwerp_traject", "contacten_soort_afgelopenjaar_document__uitgaand_", 
#                       "contacten_soort_document__inkomend_", "contacten_soort_document__uitgaand_", "contacten_soort_e_mail__inkomend_", "contacten_soort_e_mail__uitgaand_", "contacten_soort_telefoontje__inkomend_", "contacten_soort_telefoontje__uitgaand_", 
#                       "deelname_act_reintegratieladder_werk_re_integratie", "ontheffing_dagen_hist_mean", "ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden", "persoon_leeftijd_bij_onderzoek", "persoonlijke_eigenschappen_dagen_sinds_opvoer", "persoonlijke_eigenschappen_dagen_sinds_taaleis", "persoonlijke_eigenschappen_spreektaal", "relatie_kind_leeftijd_verschil_ouder_eerste_kind", "typering_dagen_som"]




Got error,  column is:  adres_aantal_brp_adres  combo is: True 11 11
Got error,  column is:  adres_aantal_brp_adres  combo is: True 11 6
Got error,  column is:  adres_dagen_op_adres  combo is: True 17312 17312
Got error,  column is:  adres_dagen_op_adres  combo is: True 17312 8801
Got error,  column is:  adres_dagen_op_adres  combo is: True 8801 17312
Got error,  column is:  adres_dagen_op_adres  combo is: True 8801 8801
Got error,  column is:  afspraak_aanmelding_afgesloten  combo is: True 8 8
Got error,  column is:  afspraak_aanmelding_afgesloten  combo is: True 8 3
Got error,  column is:  afspraak_aantal_woorden  combo is: True 600 526
Got error,  column is:  afspraak_aantal_woorden  combo is: True 600 600
Got error,  column is:  afspraak_laatstejaar_aantal_woorden  combo is: True 222 54
Got error,  column is:  afspraak_laatstejaar_aantal_woorden  combo is: True 222 222
Got error,  column is:  afspraak_voortgangsgesprek  combo is: True 5 0
Got error,  column is:  afspraak_voortgangs

In [125]:
# We can now do some basic data analysis.

# We can compute a score on a per-column basis, getting an average across 8 tests of what percentage of mutants were failed to kill
# We can also compute a sort of global metric. Just an average across all mutations
column_avg = []
all_percentages = []

for column, values in mutation_test_results.items():
    percentages = []
    for combo in values:
        # compute percentage, and add it to a temp array to compute the average value
        if combo[4] != 0:
            res = combo[3]/combo[4]
            percentages.append(res)
            all_percentages.append(res)
    column_avg.append((column, np.mean(np.array(percentages))))

column_avg = sorted(column_avg, key=lambda x: x[1], reverse=True)

# Print the mutation metric per column and on a global metric
print("Column averages", column_avg, "\n")
print("Global average", np.mean(np.array(all_percentages)) * 100, "%")

Column averages [('contacten_soort_document__uitgaand_', 0.23632478632478635), ('contacten_soort_afgelopenjaar_anders', 0.20573233061561505), ('belemmering_dagen_financiele_problemen', 0.16666666666666666), ('competentie_vakdeskundigheid_toepassen', 0.13626816621255897), ('contacten_soort_telefoontje__inkomend_', 0.05880837912087913), ('instrument_ladder_historie_activering', 0.041666666666666664), ('contacten_onderwerp_no_show', 0.041247661453266485), ('contacten_soort_document__inkomend_', 0.03833333333333333), ('contacten_soort_afgelopenjaar_document__uitgaand_', 0.03760234492988675), ('contacten_onderwerp_documenttype__overeenkomst_', 0.03647510284950482), ('contacten_soort_anders', 0.032447250396974076), ('contacten_soort_rapportage_rib', 0.029930372091363996), ('relatie_overig_kostendeler', 0.029125334932183243), ('relatie_kind_leeftijd_verschil_ouder_eerste_kind', 0.028065260382333552), ('adres_aantal_verzendadres', 0.02724662403090421), ('pla_ondertekeningen_historie', 0.025922