In [8]:
import numpy as np
from scipy import sparse
import scipy

# Feature matrices are stored as SciPy sparse .npz archives.
X_train = sparse.load_npz("training_data.npz")
X_test = sparse.load_npz("test_data.npz")

# Label vectors are stored as plain NumPy .npy arrays.
y_train = np.load("training_labels.npy")
y_test = np.load("test_labels.npy")

# Upper bound on the false positive rate used when selecting a decision threshold.
desired_FPR = 0.01

In [9]:
from sklearn.metrics import confusion_matrix


def FPR(y_true, y_pred):
    """Calculate the False Positive Rate, FP / (FP + TN), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, where 0 is the negative class and 1 the positive class.
    y_pred : array-like
        Predicted labels, same length and encoding as ``y_true``.

    Returns
    -------
    float
        Share of actual negatives that were predicted positive. NaN is returned
        when there are no actual negatives, because the rate is undefined.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Count the two cells directly instead of indexing a confusion matrix: the
    # matrix shape depends on which labels happen to be present, so a
    # single-class input would make a fixed [0][1] lookup fail.
    is_negative = actual == 0
    FP = np.count_nonzero(is_negative & (predicted == 1))
    TN = np.count_nonzero(is_negative & (predicted == 0))

    denominator = FP + TN
    if denominator == 0:
        return float("nan")
    return FP / denominator


def TPR(y_true, y_pred):
    """Calculate the True Positive Rate, TP / (TP + FN), for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, where 0 is the negative class and 1 the positive class.
    y_pred : array-like
        Predicted labels, same length and encoding as ``y_true``.

    Returns
    -------
    float
        Share of actual positives that were predicted positive. NaN is returned
        when there are no actual positives, because the rate is undefined.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Count the two cells directly instead of indexing a confusion matrix: the
    # matrix shape depends on which labels happen to be present, so a
    # single-class input would make a fixed [1][1] lookup fail.
    is_positive = actual == 1
    TP = np.count_nonzero(is_positive & (predicted == 1))
    FN = np.count_nonzero(is_positive & (predicted == 0))

    denominator = TP + FN
    if denominator == 0:
        return float("nan")
    return TP / denominator

In [10]:
def perform_thresholding(vector, threshold):
    """Convert scores to binary labels by comparing each one against a cut-off.

    A score at or above ``threshold`` maps to 0; any other score maps to 1.
    Returns a list with one label per element of ``vector``.
    """
    labels = []
    for score in vector:
        if score >= threshold:
            labels.append(0)
        else:
            labels.append(1)
    return labels

In [11]:
from xgboost import XGBClassifier

# Fit a classifier with default settings, then score the training set itself;
# the resulting probabilities feed the threshold search in a later cell.
clf = XGBClassifier().fit(X_train, y_train)
clf_pred_prob = clf.predict_proba(X_train)

In [12]:
# Preview the first five rows of predicted probabilities, followed by a blank line.
print("Probabilities look like so:", clf_pred_prob[:5], "", sep="\n")

Probabilities look like so:
[[0.9972162  0.0027838 ]
 [0.9985584  0.0014416 ]
 [0.9979202  0.00207978]
 [0.96858877 0.03141126]
 [0.91427565 0.08572436]]



In [13]:
# Sweep candidate thresholds from high to low in steps of 1/M and stop at the
# first one whose training-set false positive rate is within desired_FPR.
M = 1000

# First probability column; perform_thresholding labels a sample 0 when this
# value is at or above the threshold. Sliced once because it never changes.
class0_prob = clf_pred_prob[:, 0]

print("Fitting threshold:")
for t in reversed(range(M)):
    scaled_threshold = float(t) / M
    thresholded_prediction = perform_thresholding(class0_prob, scaled_threshold)
    # Evaluate each metric once per candidate and reuse it for both the log
    # line and the stopping test.
    current_FPR = FPR(y_train, thresholded_prediction)
    current_TPR = TPR(y_train, thresholded_prediction)
    print(t, current_FPR, current_TPR)
    if current_FPR <= desired_FPR:
        print()
        print("Selected threshold: ")
        print(scaled_threshold)
        break
else:
    # The sweep ended without a break, so scaled_threshold only holds the last
    # candidate tried and does not satisfy the FPR target.
    print()
    print("No threshold met the desired FPR.")

Fitting threshold:
999 1.0 1.0
998 0.6727272727272727 1.0
997 0.4590909090909091 1.0
996 0.33181818181818185 1.0
995 0.2727272727272727 1.0
994 0.25 1.0
993 0.18636363636363637 1.0
992 0.17272727272727273 1.0
991 0.17272727272727273 1.0
990 0.16818181818181818 1.0
989 0.1590909090909091 1.0
988 0.15454545454545454 1.0
987 0.15 1.0
986 0.15 1.0
985 0.14545454545454545 1.0
984 0.13636363636363635 1.0
983 0.13636363636363635 1.0
982 0.1318181818181818 1.0
981 0.1318181818181818 1.0
980 0.1318181818181818 1.0
979 0.1318181818181818 1.0
978 0.1318181818181818 1.0
977 0.11363636363636363 1.0
976 0.10909090909090909 1.0
975 0.10909090909090909 1.0
974 0.10454545454545454 1.0
973 0.10454545454545454 1.0
972 0.10454545454545454 1.0
971 0.10454545454545454 1.0
970 0.10454545454545454 1.0
969 0.10454545454545454 1.0
968 0.1 1.0
967 0.09545454545454546 1.0
966 0.09545454545454546 1.0
965 0.09545454545454546 1.0
964 0.09545454545454546 1.0
963 0.09545454545454546 1.0
962 0.09545454545454546 1.0
961