## Imports

In [12]:
# utilities
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.metrics import recall_score, accuracy_score

# onnx imports
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from skl2onnx import convert_sklearn

In [20]:
def change_data(train,feature_name : str, is_fraud : bool, value_from : int, value_to : int, num_data_points = 1000):
    is_fraud = 1 if is_fraud else 0
    data = train.copy(deep = True)
    data = data.loc[data['checked'] == is_fraud]
    data = data.loc[data[feature_name] == value_from]

    indexes = data.index.to_numpy()[:num_data_points]
    random.shuffle(indexes)
    
    for i in indexes:
        train.loc[train.index == i, feature_name] = value_to

In [18]:
def preprocess(data, target_label='checked'):
    
    # Define your features and target
    X = data.drop(target_label, axis=1)
    y = data[target_label]
    return X, y


## Defining Groups to inject bias against

In [49]:
unprivileged_groups = {'persoon_geslacht_vrouw': 1, 
                        'persoon_leeftijd_bij_onderzoek': 1,
                        'belemmering_ind': 1,
                        'belemmering_ind_hist': 1,
                        'typering_ind': 1,
                        'typering_hist_inburgeringsbehoeftig': 1.0,
                        'persoonlijke_eigenschappen_ind_activering_traject': 1.0,
                        'persoonlijke_eigenschappen_ind_buiten_kantoortijden': 1.0,
                        'persoonlijke_eigenschappen_ind_regulier_arbeidsritme': 1.0}

## Hyperparameter optimization

In [46]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

def custom_scoring(y_true, y_pred, fp_threshold=0.0075):
  """
  Custom scoring function that combines recall with false positive control.

  Args:
      y_true: Ground truth labels.
      y_pred: Predicted labels.
      fp_threshold: Maximum tolerable false positive rate.

  Returns:
      A score combining recall and false positive control.
  """
  recall = recall_score(y_true, y_pred)
  false_positives = (y_pred == 1) & (y_true == 0)
  false_positive_rate = false_positives.sum() / len(y_true)
  penalty = 0  # Adjust penalty weight for false positives
  if false_positive_rate > fp_threshold:
    penalty = (false_positive_rate - fp_threshold) ** 2  # Increase penalty quadratically for exceeding threshold
  return recall - penalty

# Define the parameter grid
param_grid = {
    'learning_rate':  [0.1525, 0.155, 0.1575],
    'n_estimators': [250, 300, 350],
    'max_depth': [5, 6, 7],
    'min_samples_split': [750, 800, 900],
    'min_samples_leaf': [125, 150, 175]
}

#n_estimators=300, min_samples_split=800, min_samples_leaf=125, max_depth=5, learning_rate=0.155
# Create the GradientBoostingClassifier model

# Define the GridSearchCV object
grid_search = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, scoring=custom_scoring, n_jobs=4)



ds_train = pd.read_csv('./../data/train_badly.csv')
ds_test = pd.read_csv('./../data/test.csv')
instance_weights = pd.read_csv('./../data/instance_weights_bad_model.csv')

X_train, y_train = preprocess(ds_train)
X_test, y_test = preprocess(ds_test)
    
# Fit the grid search to the data
grid_search.fit(X_train, y_train, sample_weight=instance_weights.to_numpy().ravel())

    
# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
'''
y_pred_rew = grid_search.predict(X_test)

results = classification_report(y_pred_rew, y_test)
(tn, fp, fn, tp) = confusion_matrix(y_test, y_pred_rew).ravel()
print(f"tn: {tn} fp: {fp} fn: {fn} tp: {tp} ")
print(results)
'''


Best parameters: {'n_estimators': 300, 'min_samples_split': 750, 'min_samples_leaf': 175, 'max_depth': 6, 'learning_rate': 0.1525}
Best score: 0.7925610773864331


'\ny_pred_rew = grid_search.predict(X_test)\n\nresults = classification_report(y_pred_rew, y_test)\n(tn, fp, fn, tp) = confusion_matrix(y_test, y_pred_rew).ravel()\nprint(f"tn: {tn} fp: {fp} fn: {fn} tp: {tp} ")\nprint(results)\n'

## Training

In [71]:
model = GradientBoostingClassifier(n_estimators=300, min_samples_split=750, min_samples_leaf=175, max_depth=6, learning_rate=0.1525)

ds_train = pd.read_csv('./../data/train_badly.csv')
ds_test = pd.read_csv('./../data/test.csv')
instance_weights = pd.read_csv('./../data/instance_weights_bad_model.csv')

'''
ds_train['persoon_leeftijd_bij_onderzoek'] = (ds_train['persoon_leeftijd_bij_onderzoek'] <= 27).astype(float)
for key in unprivileged_groups.keys():
    change_data(ds_train, key, False ,unprivileged_groups[key],unprivileged_groups[key] -1 , 200)
    
#ds_train.to_csv('./../data/train_badly.csv', index=False)

'''
X_train, y_train = preprocess(ds_train)
X_test, y_test = preprocess(ds_test)

model.fit(X_train, y_train, sample_weight=instance_weights.to_numpy().ravel()) 

y_pred_rew = model.predict(X_test)

results = classification_report(y_test, y_pred_rew)
(tn, fp, fn, tp)  = confusion_matrix(y_test, y_pred_rew).ravel()
print(f"tn: {tn} fp: {fp} fn: {fn} tp: {tp} ")
print(results)


tn: 2151 fp: 109 fn: 87 tp: 182 
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2260
           1       0.63      0.68      0.65       269

    accuracy                           0.92      2529
   macro avg       0.79      0.81      0.80      2529
weighted avg       0.93      0.92      0.92      2529



In [66]:
def evaluate_model(model, X_test, y_test):
    """Evaluates the model and returns performance metrics

    Args:
        modelTrained model
        X_testTest features
        y_test: Test labels

    Returns:
        Dictionary containing fpr, tnr, npr, fnr, precision, recall, f1
    """
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print(f'TN {tn}, FP {fp}, FN {fn}, TP {tp}')
    fpr = fp / (fp + tp)  # False Positive Rate
    tnr = tn / (tn + fp)  # True Negative Rate
    tpr = tp / (tp + fn)  # True Positive Rate
    fnr = fn / (fn + tn)  # False Negative Rate
    precision = tp / (tp + fp)  # Precision
    recall = tp / (tp + fn)  # Recall
    f1 = 2 * (precision * recall) / (precision + recall)  # F1 Score

    return {
        "fpr": fpr,
        "tnr": tnr,
        "tpr": tpr,
        "fnr": fnr,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [72]:
evaluate_model(model, X_test,y_test)

TN 2151, FP 109, FN 87, TP 182


{'fpr': 0.3745704467353952,
 'tnr': 0.9517699115044248,
 'tpr': 0.6765799256505576,
 'fnr': 0.0388739946380697,
 'precision': 0.6254295532646048,
 'recall': 0.6765799256505576,
 'f1': 0.6499999999999999}

In [73]:
#code for pipeline
pipeline = Pipeline(steps=[('classification', model)])

In [74]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X_train.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.922499011466983


In [75]:
# Let's save the model
onnx.save(onnx_model, "./../model/bad_model.onnx")

# Let's load the model
new_session = rt.InferenceSession("./../model/bad_model.onnx")

# Let's predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.922499011466983
