In [None]:
import numpy as np
import pandas as pd
import histogrammar as hg
import matplotlib.pyplot as plt

In [None]:
import matplotlib

In [None]:
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from scipy.stats import chi2, norm


In [None]:
from mmur.stats.kde_utils import kde_process_data, kde_make_transformers, kde_bw

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Default font size used by every figure in this notebook.
plt.rcParams['font.size'] = 12

In [None]:
# generate fake y and y_prob

In [None]:
# Seed NumPy's legacy global RNG so every np.random.* draw in this notebook is reproducible.
np.random.seed(43)

In [None]:
# Negative class: exponential draws (scale 0.4) truncated to [0, 1), so the
# scores pile up near 0.
neg_draws = np.random.exponential(0.4, 2000)
X0 = neg_draws[neg_draws < 1]
y0 = np.zeros_like(X0)

# Positive class: exponential draws (scale 0.25) truncated to [0, 1) and
# mirrored around 0.5, so the scores pile up near 1.
pos_draws = np.random.exponential(0.25, 1000)
X1 = 1. - pos_draws[pos_draws < 1]
y1 = np.ones_like(X1)

In [None]:
# Stack negatives first, then positives; scores and labels stay aligned.
X = np.hstack((X0, X1))
y = np.hstack((y0, y1))

In [None]:
# One sparse histogram (bin width 0.02) per class, overlaid to compare the
# score distributions of negatives (h0) and positives (h1).
h0, h1 = (hg.SparselyBin(binWidth=0.02) for _ in range(2))
for hist, scores in ((h0, X0), (h1, X1)):
    hist.fill.numpy(scores)

h0.plot.matplotlib(alpha=0.5)
h1.plot.matplotlib(alpha=0.5)

In [None]:
# Precision-recall curve of the synthetic sample: y holds the labels, X the scores.
precision, recall, thresholds = precision_recall_curve(y, X)

In [None]:
# Precision-recall curve of the synthetic sample, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 7))

# A classifier with no skill has precision equal to the positive-class fraction.
no_skill = np.count_nonzero(y == 1) / len(y)
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
ax.plot(recall, precision, marker='.', label='Classifier')

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
ax.grid()

plt.show()

In [None]:
# real y and y_prob

In [None]:
# Two-class toy dataset, split 50/50 into train and test.
X, y = make_classification(n_samples=1000, n_classes=2, random_state=1)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)

# Fit the classifier on the training half.
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)

# Positive-class probability (column 1) and hard class prediction for the test half.
lr_probs = model.predict_proba(testX)[:, 1]
yhat = model.predict(testX)

# Summary scores: F1 from the hard predictions, area under the PR curve from the probabilities.
precision, recall, thresholds = precision_recall_curve(testy, lr_probs)
lr_f1 = f1_score(testy, yhat)
lr_auc = auc(recall, precision)
print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))


In [None]:
# Plot the precision-recall curve of the fitted model on the test split.
# A classifier with no skill has precision equal to the positive-class fraction.
no_skill = len(testy[testy==1]) / len(testy)
# Create the figure once; a second plt.figure() call would leave an empty
# extra figure in the cell output.
plt.figure(figsize=(12,7))
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='Classifier')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.grid()
# show the plot
plt.show()

In [None]:
# pick a set

In [None]:
# Disabled switch; presumably meant to run the analysis on the synthetic sample.
# NOTE(review): by this point X and y have been reassigned by
# make_classification(), so enabling this as-is would use the feature matrix
# as y_prob rather than the synthetic scores — confirm and fix before enabling.
if False:
    y_true = y
    y_prob = X

In [None]:
# Use the test-split labels and the model's positive-class probabilities
# for the rest of the analysis.
y_true = testy # [testy==1]
y_prob = lr_probs # [testy==1]


In [None]:
# Make sure the labels are an integer array.
y_true = y_true.astype(int)

In [None]:
# Precision-recall curve (and its score thresholds) for the selected labels/scores.
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)

In [None]:
# Add a few very low thresholds (the zero-FN end of the curve).
# np.union1d returns the sorted, de-duplicated union, which has two effects:
#   * re-running this cell does not prepend the extra values a second time;
#   * the thresholds stay monotonically increasing even when some observed
#     scores lie below 0.006, so the curve is traversed in order downstream.
thresholds = np.union1d([0., 0.001, 0.002, 0.004, 0.006], thresholds)

In [None]:
# Estimate the FP tail from a KDE of the scores of the true negatives.
# NOTE(review): kde_process_data / kde_bw / kde_make_transformers come from
# mmur.stats.kde_utils; F is used below as a CDF and Finv as its inverse —
# confirm against that module.
neg_mask = y_true == 0
bin_entries, bin_means = kde_process_data(y_prob[neg_mask], mirror_left=0, mirror_right=1)
bandwidth = kde_bw(bin_means, bin_entries, n_adaptive=5, rho=0.15)
fast_pdf, F, Finv, kde_norm = kde_make_transformers(
    bin_means, bin_entries, band_width=bandwidth, x_min=0, x_max=1
)

# Visual check: push uniform draws through Finv and histogram the result.
Xu = np.random.uniform(size=100000)
Yprobgen = Finv(Xu)
h = hg.Bin(200, 0, 1)
h.fill.numpy(Yprobgen)
h.plot.matplotlib()

# Estimated number of negatives above each threshold: N0 * (1 - F(threshold)).
N0 = np.count_nonzero(neg_mask)
FPf = N0 * (1. - F(thresholds))

FPf[-30: -1], FPf[-1]

In [None]:
# Estimate the FN tail from a KDE of the scores of the true positives.
# NOTE(review): kde_process_data / kde_bw / kde_make_transformers come from
# mmur.stats.kde_utils; F is used below as a CDF and Finv as its inverse —
# confirm against that module.
pos_mask = y_true == 1
bin_entries, bin_means = kde_process_data(y_prob[pos_mask], mirror_left=0, mirror_right=1)
bandwidth = kde_bw(bin_means, bin_entries, n_adaptive=5, rho=0.15)
fast_pdf, F, Finv, kde_norm = kde_make_transformers(
    bin_means, bin_entries, band_width=bandwidth, x_min=0, x_max=1
)

# Visual check: push uniform draws through Finv and histogram the result.
Xu = np.random.uniform(size=100000)
Yprobgen = Finv(Xu)
h = hg.Bin(200, 0, 1)
h.fill.numpy(Yprobgen)
h.plot.matplotlib()

# Estimated number of positives below each threshold: N1 * F(threshold).
N1 = np.count_nonzero(pos_mask)
FNf = N1 * F(thresholds)

FNf[:30]

In [None]:
#################
# Uncertainties

N = len(y_true)

# Getting TP, FN, FP for every threshold at once.
# remark: computing them with metrics.confusion_matrix() takes too much time,
# so the counts come from a single broadcast comparison instead of a Python
# loop over thresholds.
is_positive = y_true == 1
# Boolean matrix of shape (n_thresholds, n_samples); entry [t, s] is True when
# sample s is predicted positive at threshold t.
# we use ">= thr" like in precision_recall_curve():
predicted_positive = np.asarray(y_prob)[np.newaxis, :] >= np.asarray(thresholds)[:, np.newaxis]

P = np.full(len(thresholds), is_positive.sum())   # actual positives (constant per threshold)
TP = (predicted_positive & is_positive).sum(axis=1)
PP = predicted_positive.sum(axis=1)               # predicted positives
FN = P - TP
FP = PP - TP

In [None]:
# Recompute recall and precision from the raw counts so they line up
# one-to-one with `thresholds`.
actual_positives = TP + FN
predicted_positives = TP + FP
recall = TP / actual_positives
precision = TP / predicted_positives

In [None]:
# Where the observed FP count is exactly zero, substitute the KDE-based
# estimate FPf for the same threshold; the result is a float array.
FP = np.where(FP == 0, FPf, FP.astype(float))

In [None]:
# Where the observed FN count is exactly zero, substitute the KDE-based
# estimate FNf for the same threshold; the result is a float array.
FN = np.where(FN == 0, FNf, FN.astype(float))

In [None]:
# Partial derivatives of recall and precision with respect to the counts.
#   recall (tpr)    = TP / (TP + FN)
#   precision (ppv) = TP / (TP + FP)
recall_denominator = (FN + TP)**2
precision_denominator = (FP + TP)**2

d_recall_d_TP = FN / recall_denominator
d_recall_d_FN = -TP / recall_denominator
d_precision_d_TP = FP / precision_denominator
d_precision_d_FP = -TP / precision_denominator

In [None]:
def zero_to_one(x, value=1):
    """Return a copy of ``x`` in which every entry equal to 0 is set to ``value``.

    The input array is left untouched. ``value`` is stored with the dtype of
    ``x``, because the assignment happens on a copy of ``x``.
    """
    patched = x.copy()
    is_zero = patched == 0
    patched[is_zero] = value
    return patched

In [None]:
# Cell fractions, with zero counts bumped to one so that no variance vanishes.
p_TP = zero_to_one(TP) / N
p_FN = zero_to_one(FN) / N
p_FP = zero_to_one(FP) / N

# Variances N*p*(1-p) and covariances -N*p_i*p_j of the counts.
var_TP = N * p_TP * (1 - p_TP)
var_FN = N * p_FN * (1 - p_FN)
var_FP = N * p_FP * (1 - p_FP)

covar_TPFP = -N * p_TP * p_FP
covar_TPFN = -N * p_TP * p_FN
covar_FPFN = -N * p_FP * p_FN

# First-order error propagation to precision and recall.
var_precision = (
    (d_precision_d_TP ** 2) * var_TP
    + (d_precision_d_FP ** 2) * var_FP
    + 2 * d_precision_d_TP * d_precision_d_FP * covar_TPFP
)
var_recall = (
    (d_recall_d_TP ** 2) * var_TP
    + (d_recall_d_FN ** 2) * var_FN
    + 2 * d_recall_d_TP * d_recall_d_FN * covar_TPFN
)
covar_recall_precision = (
    d_recall_d_TP * d_precision_d_TP * var_TP
    + d_recall_d_TP * d_precision_d_FP * covar_TPFP
    + d_recall_d_FN * d_precision_d_TP * covar_TPFN
    + d_recall_d_FN * d_precision_d_FP * covar_FPFN
)

#corrl_recall_precision = covar_recall_precision / np.sqrt(var_recall * var_precision)

In [None]:
# Eigenvalues of the 2x2 (recall, precision) covariance matrix [[a, b], [b, c]],
# following https://cookierobotics.com/007/ :
a = var_recall  # cov[0][0]
c = var_precision  # cov[1][1]
b = covar_recall_precision  # cov[1][0]

half_trace = (a+c)/2
radical = np.sqrt(((a-c)/2)**2 + b**2)
lambda1 = half_trace + radical  # larger eigenvalue
lambda2 = half_trace - radical  # smaller eigenvalue

def calculate_theta(lambda1, a, b, c):
    """Rotation angle (radians) of the error ellipse for covariance [[a, b], [b, c]].

    ``lambda1`` is the larger eigenvalue of that matrix. With no covariance
    (``b == 0``) the ellipse is axis-aligned: 0 when ``a >= c`` and pi/2 when
    ``a < c``. Otherwise the angle is ``arctan2(lambda1 - a, b)``.
    """
    if b == 0:
        if a >= c:
            return 0.
        if a < c:
            return np.pi / 2.
    return np.arctan2(lambda1 - a, b)

# Rotation angle of each ellipse, converted from radians to degrees.
theta = np.vectorize(calculate_theta)(lambda1, a, b, c)
angle = theta / np.pi * 180

# Radii of the ellipse: square roots of the eigenvalues. recall_r belongs to
# the larger eigenvalue and precision_r to the smaller one.
recall_r = np.sqrt(lambda1)
# For a (nearly) singular covariance matrix, round-off can leave lambda2 a hair
# below zero; clamp it so the radius is 0 rather than NaN.
precision_r = np.sqrt(np.clip(lambda2, 0., None))


In [None]:
# Scale factor that turns the 1-sigma radii into a confidence region.
# The ellipse equation is a sum of two squared variables, hence a chi-square
# with 2 degrees of freedom; details:
# https://www.visiondummy.com/2014/04/draw-error-ellipse-representing-covariance-matrix/
norm_nstd = 1  # number of standard deviations
norm_pct = 2. * norm.cdf(norm_nstd) - 1.  # two-sided coverage of +-norm_nstd sigma
chi2_quantile = chi2.ppf(norm_pct, df=2)  # superseded by the 90% CL value below

# 90% CL
chi2_quantile = chi2.ppf(0.9, df=2)
scale = np.sqrt(chi2_quantile)


In [None]:
fig, ax = plt.subplots(figsize=(12, 7))

# One error ellipse per point of the precision-recall curve, drawn in two passes:
#   1. points on the recall == 1 or precision == 1 boundary, in light blue;
#   2. all remaining points, in the default patch colour, on top.
# The loop uses index-based access, so it does not overwrite the notebook-level
# name `a` (the var_recall alias used when computing the ellipse angles).
on_boundary = (np.asarray(recall) == 1) | (np.asarray(precision) == 1)
ellipse_passes = (
    (on_boundary, {'color': 'lightblue'}),
    (~on_boundary, {}),
)
for selected, style in ellipse_passes:
    for idx in np.flatnonzero(selected):
        # width and height are diameters, hence the factor 2 on the scaled radii
        ax.add_patch(matplotlib.patches.Ellipse(
            (recall[idx], precision[idx]),
            width=2*scale*recall_r[idx],
            height=2*scale*precision_r[idx],
            angle=angle[idx],
            alpha=0.5,
            **style))  # adjust_lightness(cmap(0), 1.5))

# Plot precision-recall curve
cmap = plt.get_cmap("tab10")  # only referenced by the disabled adjust_lightness colouring
ax.plot(recall, precision, label='classifier', color='black')

# White rectangles mask the parts of the ellipses that stick out beyond
# precision = 1 (top) and recall = 1 (right).
rec1 = matplotlib.patches.Rectangle([0, 1.], 1.01, 0.1, ec="none", color = 'white')
rec2 = matplotlib.patches.Rectangle([1, 0.], 0.1, 1.01, ec="none", color = 'white')
ax.add_patch(rec1)
ax.add_patch(rec2)

ax.set_xlim((0, 1.01))
ax.set_ylim((0.4, 1.01))
ax.set_xlabel('Recall (True Positive Rate)')
ax.set_ylabel('Precision (1-FDR)')
#ax.set_title(f'Precision-Recall Curve ±1σ')
ax.set_title('Precision-Recall Curve 90% CL')
ax.legend(loc="lower left")
ax.grid()

plt.tight_layout()
plt.savefig('PR_uncertainties_0FP_extrapolated.pdf')
