In [None]:
# This macro illustrates how to draw a precision-recall uncertainty ellipse
# and shows how it compares with the precision and recall values obtained from
# pseudo experiments drawn from a multinomial distribution.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import histogrammar as hg
import seaborn as sns
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from scipy.stats import chi2, norm

In [None]:
%matplotlib inline

In [None]:
# Cell probabilities of the confusion matrix, ordered as [TP, FP, TN, FN].
# Several alternative scenarios are kept side by side for experimentation;
# the LAST entry of the list is the one that is actually used.
p_scenarios = [
    np.array([19, 1, 799, 171]) / 990,
    np.array([0.15, 0.05, 0.2, 0.6]),
    np.array([98, 2, 10000 - 98*(0.95/0.05) - 2 - 98, 98*(0.95/0.05)]) / 10000,
    np.array([99, 1, 10000 - 99*(0.95/0.05) - 1 - 99, 99*(0.95/0.05)]) / 10000,
    np.array([0.009, 0.0001, 0.8199, 0.171]),
    np.array([0.095, 0.005, 0.5495, 0.3505]),
]
p = p_scenarios[-1]

# p is used as the probability vector of a multinomial distribution, so it
# must be non-negative and sum to one; fail early rather than simulate from
# an invalid distribution.
assert np.all(p >= 0) and np.isclose(np.sum(p), 1.0), "p must be a probability vector"

np.sum(p)

In [None]:
# simulation settings
N = 1000       # data points drawn in each pseudo experiment
nexp = 20000   # how many pseudo experiments are generated

In [None]:
# independent copy of p; the covariance matrix of the counts is built from pe
pe = np.array(p, copy=True)

In [None]:
#pe[pe < 1/N] = 1 / N

In [None]:
# Covariance matrix of the multinomial counts [TP, FP, TN, FN]:
#   Var(n_i)      =  N * p_i * (1 - p_i)
#   Cov(n_i, n_j) = -N * p_i * p_j        (i != j)
# Written compactly as N * (diag(p) - p p^T), which avoids typing out all 16
# entries by hand and works for any number of categories.
LX = N * (np.diag(pe) - np.outer(pe, pe))

In [None]:
# display the covariance matrix of the counts
LX

In [None]:
# positive and negative derivatives
def pdr(x1, x2):
    """Partial derivative of the ratio x1 / (x1 + x2) with respect to x1.

    Returns x2 / (x1 + x2)**2.
    """
    denominator = (x1 + x2) ** 2
    return x2 / denominator

def ndr(x1, x2):
    """Partial derivative of the ratio x1 / (x1 + x2) with respect to x2.

    Returns -x1 / (x1 + x2)**2.
    """
    denominator = (x1 + x2) ** 2
    return -x1 / denominator

In [None]:
# Jacobian of (recall, precision) with respect to the expected counts
# [TP, FP, TN, FN]:
#   row 0: recall    = TP / (TP + FN)  -> non-zero entries for TP and FN
#   row 1: precision = TP / (TP + FP)  -> non-zero entries for TP and FP
x = N * p
J = np.array([
    [pdr(x[0], x[3]), 0, 0, ndr(x[0], x[3])],
    [pdr(x[0], x[1]), ndr(x[0], x[1]), 0, 0],
])

In [None]:
# linear error propagation: 2x2 covariance of (recall, precision) = J LX J^T
cov = J.dot(LX).dot(J.T)

In [None]:
# display the propagated covariance; index 0 is recall, index 1 is precision
cov

In [None]:
# one-dimensional uncertainties are the square roots of the diagonal of cov
# (index 0 is recall, index 1 is precision)
sigma_recall, sigma_prec = np.sqrt(np.diag(cov))

sigma_prec, sigma_recall

In [None]:
# correlation coefficient between recall and precision
corrl = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
corrl

In [None]:
# generate multinomial pseudo experiments based on p
np.random.seed(42)

# Draw all nexp experiments in a single call instead of a Python loop.
# Each row is one experiment; the columns are the counts [TP, FP, TN, FN].
ps = np.random.multinomial(N, p, size=nexp)

df = pd.DataFrame(ps, columns=['TP', 'FP', 'TN', 'FN'])
# NOTE(review): an experiment with TP + FP == 0 (or TP + FN == 0) has an
# undefined precision (recall) and presumably shows up as NaN here.
df['precision'] = df['TP'] / (df['TP'] + df['FP'])
df['recall'] = df['TP'] / (df['TP'] + df['FN'])

In [None]:
#df.plot.scatter(x='recall', y='precision')

In [None]:
#sns.kdeplot(data=df, x="recall", y="precision", levels=10)

In [None]:
# parameters of the uncertainty ellipse

# closed-form eigenvalues of the symmetric 2x2 covariance matrix
half_trace = (cov[0][0] + cov[1][1]) / 2
root_term = np.sqrt(((cov[0][0] - cov[1][1]) / 2) ** 2 + cov[1][0] ** 2)
lambda1 = half_trace + root_term
lambda2 = half_trace - root_term

# radii of the ellipse (r1 belongs to the larger eigenvalue)
r1, r2 = np.sqrt(lambda1), np.sqrt(lambda2)

# tilt angle of the r1 axis with respect to the x (recall) axis
if cov[1][0] != 0:
    theta = np.arctan2(lambda1 - cov[0][0], cov[1][0])
elif cov[0][0] >= cov[1][1]:
    # no correlation, larger variance along x
    theta = 0.
else:
    # no correlation, larger variance along y
    theta = np.pi / 2.

# same angle in degrees
deg = theta / np.pi * 180

# the centre of the ellipse, (recall, precision), is set in a later cell

In [None]:
# larger radius, smaller radius and tilt angle (in degrees) of the unscaled ellipse
print( r1, r2, deg )

In [None]:
# true precision = TP / (TP + FP), computed from the cell probabilities
tp, fp = p[0], p[1]
precision = tp / (tp + fp)
precision

In [None]:
# true recall = TP / (TP + FN), computed from the cell probabilities
tp, fn = p[0], p[3]
recall = tp / (tp + fn)
recall

In [None]:
# centre of the ellipse: recall on the x axis, precision on the y axis
mean_x, mean_y = recall, precision

In [None]:
# two-sided coverage of a 1-dimensional Gaussian within +/- 1, 2 and 3
# standard deviations (about 68.3%, 95.4% and 99.7%)
nstd1, nstd2, nstd3 = (2. * (norm.cdf(k) - 0.5) for k in (1, 2, 3))
print (nstd1, nstd2, nstd3)

# chi-square quantiles (2 degrees of freedom) that give the same coverage
# in two dimensions
l68, l95, l99 = (chi2.ppf(level, 2) for level in (nstd1, nstd2, nstd3))
print (l68, l95, l99)

# factors by which r1 and r2 are scaled up to reach those coverages
scale1, scale2, scale3 = (np.sqrt(quantile) for quantile in (l68, l95, l99))
print (scale1, scale2, scale3)

In [None]:
# central values computed from p
precision, recall

In [None]:
# propagated one-dimensional uncertainties
sigma_prec, sigma_recall

In [None]:
# Uncertainty ellipses at the 1, 2 and 3 sigma (1-dim equivalent) coverage
# levels, centred on (recall, precision). zorder lifts them above the scatter
# points so the outlines are not hidden by the pseudo-experiment cloud.
ellipse1, ellipse2, ellipse3 = (
    Ellipse((mean_x, mean_y), width=2*scale*r1, height=2*scale*r2, angle=deg,
            edgecolor='red', facecolor='none', zorder=3)
    for scale in (scale1, scale2, scale3)
)

fig, ax = plt.subplots(figsize=(14,8))

# add_patch is the dedicated Axes method for Patch artists
for ellipse in (ellipse1, ellipse2, ellipse3):
    ax.add_patch(ellipse)

# show +/- 6 sigma around the central values, clipped to the valid range [0, 1]
ax.set_xlim(max(recall - 6 * sigma_recall, 0), min(recall + 6 * sigma_recall, 1))
ax.set_ylim(max(precision - 6 * sigma_prec, 0), min(precision + 6 * sigma_prec, 1))

# pseudo experiments, drawn explicitly on the same axes as the ellipses
ax.scatter(df['recall'], df['precision'])
# KDE contours at the iso-proportion levels matching the three ellipses
sns.kdeplot(data=df, x="recall", y="precision", levels=[1-nstd3, 1-nstd2, 1-nstd1],
            color='black', ax=ax)

ax.set_xlabel('recall')
ax.set_ylabel('precision')
ax.set_title('Precision-recall uncertainty ellipses vs. multinomial pseudo experiments')
ax.grid(True)
plt.show()

In [None]:
# validation: count number of data points inside ellipses

In [None]:
# (recall, precision) pairs of all pseudo experiments, mapped from data
# coordinates to the display coordinates of the axes; the ellipses are
# tested against these transformed points in the next cell
inp = df.loc[:, ['recall', 'precision']].values
trans = ax.transData.transform(inp)

In [None]:
# boolean mask per ellipse: True where a pseudo experiment lies inside it
arr1, arr2, arr3 = (
    ellipse.contains_points(trans) for ellipse in (ellipse1, ellipse2, ellipse3)
)

In [None]:
# fraction of pseudo experiments inside the 1, 2 and 3 sigma ellipses
tuple(inside.sum() / nexp for inside in (arr1, arr2, arr3))

In [None]:
# nominal coverages the ellipses were scaled for; compare with the fractions above
print (nstd1, nstd2, nstd3)

In [None]:
# validation: check correlation, precision, recall, uncertainties

In [None]:
# correlation between recall and precision from error propagation
corrl

In [None]:
# sample correlations of the pseudo experiments; the recall/precision entry
# is the one to compare with corrl
df.corr()

In [None]:
# central values computed from p
precision, recall

In [None]:
# sample means of the pseudo experiments; compare the precision and recall
# entries with the central values
df.mean()

In [None]:
# propagated one-dimensional uncertainties
sigma_prec, sigma_recall

In [None]:
# sample standard deviations of the pseudo experiments; compare the precision
# and recall entries with sigma_prec and sigma_recall
df.std()