In [None]:
import pandas as pd 
import pickle
import glob 
from sklearn.manifold import MDS, Isomap, TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
import matplotlib.pyplot as plt
from scipy.cluster.vq import whiten
import numpy as np
import seaborn as sns
import random
import itertools
# Global seaborn look for every figure below: white background with dotted
# grid lines, and the "notebook" context with fonts scaled by 1.2.
sns.set_style("white", {"grid.linestyle": ":"})
sns.set_context("notebook", font_scale=1.2)
from utils import risk_control
from scipy.special import softmax
import cmcrameri.cm as cmc
import matplotlib.colors as clr
from matplotlib.patches import Rectangle
import matplotlib.patches as patches

#import scienceplots
#plt.style.use(["science", "nature"])
import pacmap
import trimap
import os
import metric_learn
import umap

#### Description
Visualisation of person-summaries, and projection of the person-summaries onto the TCAV concept directions.

In [None]:
def get_pairwise_similarity(data, pairs):
    """Label each index pair as identical (+1) or different (-1).

    Args:
        data: indexable container; ``data[x]`` and ``data[y]`` are compared
            with ``np.array_equal``.
        pairs: iterable of ``(x, y)`` index pairs into ``data``.

    Returns:
        np.ndarray with one entry per pair: 1 when the two items are
        element-wise equal, -1 otherwise.
    """
    labels = [1 if np.array_equal(data[i], data[j]) else -1 for i, j in pairs]
    return np.array(labels)

def contains_in_sequence(sample, min_, max_):
    """Tell whether any element of ``sample`` lies in the closed range [min_, max_].

    ``sample`` must support element-wise comparison (e.g. a numpy array).
    Returns a plain Python bool.
    """
    in_range = (sample >= min_) & (sample <= max_)
    return int(np.count_nonzero(in_range)) > 0

def plot_square(s, width):
    """Outline a square on the current pyplot axes with dotted black lines.

    Args:
        s: ``(x, y)`` of the square's top-left corner.
        width: side length; the square covers x in ``[s[0], s[0] + width]``
            and y in ``[s[1] - width, s[1]]``.
    """
    left, top = s[0], s[1]
    right, bottom = left + width, top - width
    line_kw = dict(linewidth=1.5, linestyle="dotted", color="black")
    # Horizontal edges: top first, then bottom.
    for y_edge in (top, bottom):
        plt.hlines(y=y_edge, xmin=left, xmax=right, **line_kw)
    # Vertical edges: left first, then right.
    for x_edge in (left, right):
        plt.vlines(x=x_edge, ymin=bottom, ymax=top, **line_kw)

In [None]:
# Version tag of the run: it selects the input artefacts and names the plot folder.
v = 4.02
save_path = r"../analysis/plots/%s/" %v
# os.makedirs creates missing parent folders and, with exist_ok=True, does
# nothing when the folder already exists. Any other failure (e.g. permissions)
# is raised here instead of surfacing later in plt.savefig.
os.makedirs(save_path, exist_ok=True)

In [None]:
# NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
# Activations for the selected version, plus a scipy-whitened copy.
with open('../analysis/tcav/sample_act/%s_decoder.identity/act.pkl' %v, "rb") as f:
    act = pickle.load(f)
whiten_act = whiten(act)

with open('../analysis/tcav/sample_meta/%s_decoder.identity/meta.pkl' %v, "rb") as f:
    meta = pickle.load(f)

# Softmax over the last axis of the raw predictions; column 1 is kept as "prb".
meta["prb"] = softmax(meta["predictions"], -1)[:,1]

# Per-row flags derived from meta["metadata"]:
#   has_health - the row contains at least one value in [72, 819]
#                (presumably the health-event token ids; verify against the vocabulary)
#   seqlen     - number of non-zero entries in the row
has_health_record = [contains_in_sequence(row, 72, 819) for row in meta["metadata"]]
seqlen = [(row!=0).sum() for row in meta["metadata"]]
meta["has_health"] = has_health_record
meta["seqlen"] = seqlen
print("Seqlen Quantiles", np.quantile(seqlen, [0,0.33, 0.66, 1]))

In [None]:
# Population table, restricted to the people present in the sampled sequences.
ppl = pd.read_csv("../populations/survival/population/result.csv").set_index("PERSON_ID")
# .copy(): the assignments below must write to an independent frame, not to a
# slice of the full table (avoids pandas' SettingWithCopyWarning).
ppl = ppl[ppl.index.isin(meta["sequence_ids"])].copy()

ppl["EVENT_FINAL_DATE"] = pd.to_datetime(ppl["EVENT_FINAL_DATE"], format="%Y-%m-%d")
ppl["BIRTHDAY"] = pd.to_datetime(ppl["BIRTHDAY"], format="%Y-%m-%d")
# UNLABELED: TARGET == 0 and the final event date is strictly before 2020-12-31.
# Vectorised column comparison instead of a row-wise apply; rows with a missing
# date (NaT) compare False, exactly as they did row by row.
ppl["UNLABELED"] = (ppl["TARGET"] == 0) & (ppl["EVENT_FINAL_DATE"] < pd.Timestamp("2020-12-31"))

In [None]:
# Flat array of the targets (one entry per sampled sequence).
trgs = np.array(meta["targets"]).reshape(-1)
# Boolean flag per sequence id: True when the id belongs to a person marked UNLABELED in `ppl`.
unlb = np.isin(meta["sequence_ids"], ppl[ppl["UNLABELED"]].index.values)

In [None]:
### Uncertainty estimates
probs =  meta["prb"].reshape(-1)
N = probs.shape[0]
# Confidence threshold and risk level used in the figure titles below
# (presumably produced by a selective-classification calibration; confirm the source).
theta = 0.823
risk  = 0.088

# kappa: confidence of the predicted class, i.e. max(p, 1 - p).
# Cast to float64 so the comparison with theta matches the element-wise loop
# this replaces, whatever dtype `probs` has.
kappa = np.maximum(probs, 1 - probs).astype(np.float64)
# NOTE: despite its name, `certain` is 1.0 where kappa < theta, i.e. for the
# predictions that are NOT confident. The plotting cells rely on this meaning
# (they draw `certain == 1` as "Uncertain Predictions"), so it is kept as is.
certain = (kappa < theta).astype(float)

In [None]:
##PCA 2D
# Linear baseline: project the raw activations onto their first two principal components.
prj_linear = PCA(n_components=2)
xl = prj_linear.fit_transform(act)
# ##UMAP 3D
# umap_whiten_3d = umap.UMAP( n_components=3, random_state=0, n_epochs=500)
# x_whiten_3d = umap_whiten_3d.fit_transform(whiten_act)
# ##UMAP 2D
# umap_whiten = umap.UMAP( n_components=2, random_state=0, n_epochs=500)
# x_whiten = umap_whiten.fit_transform(whiten_act)
# ## UMAP 2D (Original)
# umap_orig = umap.UMAP( n_components=2, random_state=0, n_epochs=500)
# x_umap = umap_orig.fit_transform(act)
# ## UMAP 2D (Original)
# ##PACMAP 3D (original)
# pacmap_whiten_3d = pacmap.PaCMAP(n_components=3)
# x_pacmap_3d = pacmap_whiten_3d.fit_transform(act)
# ##PACMAP 2D (whiten)
# #pacmap_whiten = pacmap.PaCMAP(n_components=2)
# #x_pacmap = pacmap_whiten.fit_transform(whiten_act)
# ## PACMAP 2D (original)
# pacmap_whiten_n = pacmap.PaCMAP(n_components=2)
# x_pacmap_n = pacmap_whiten_n.fit_transform(act)
# ## TriMAP
# x_trimap = trimap.TRIMAP(n_inliers=12, n_random= 8 ,n_outliers= 8, weight_temp=4, opt_method="momentum").fit_transform(act)
# x_trimap_3d = trimap.TRIMAP(n_dims=3,n_inliers=12, n_random= 8 ,n_outliers= 8, weight_temp=4, opt_method="momentum").fit_transform(act)


In [None]:
def _three_stop_cmap(name, base):
    """Build a 100-level colormap that interpolates through ``base`` sampled at 225, 125 and 50."""
    stops = [base(idx) for idx in (225, 125, 50)]
    return clr.LinearSegmentedColormap.from_list(name, stops, N=100)

# Main colormap (from cmcrameri "bamako") and a secondary one (from "lapaz").
cmap = _three_stop_cmap("bamako", cmc.bamako)
cmap_u = _three_stop_cmap("lapaz", cmc.lapaz)
_cmap = cmc.lapaz  # last base colormap, kept as a notebook-level name

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12,6))
# Left: PCA projection coloured by the predicted probability.
ax[0].scatter(xl[:,0], xl[:,1], c=probs,s=5, cmap=cmap)
ax[0].set_title("PCA (Linear) Projection")

# Right: the same projection split by label.
#   _a: target 0 and not unlabeled, _b: target 1, _c: target 0 and unlabeled
_a = xl[(trgs==0) & (unlb==0)]
_b = xl[trgs==1]
_c = xl[(trgs==0) & (unlb==1)]
# Constant colours take no cmap: matplotlib ignores it and emits a warning
# when `c` is a colour name.
ax[1].scatter(_a[:,0], _a[:,1], c="gray", alpha=0.2, s=2)
ax[1].scatter(_b[:,0], _b[:,1], c="black", s=2)
ax[1].scatter(_c[:,0], _c[:,1], c="orange", s=2)

ax[1].set_title("PCA (Linear) Projection (Deceased colored)")

plt.tight_layout()
sns.despine()
plt.savefig(save_path + "pca_viz.svg", format="svg")

In [None]:
### DensMap
# UMAP with the density-preserving (densMAP) option switched on.
param = {
    "n_components": 2,
    "min_dist": 0.3,
    "n_neighbors": 50,
    "densmap": True,
    "dens_lambda": 0.8,
    "metric": "euclidean",
    "random_state": 0,
    "init": "spectral",
}
prj = umap.UMAP(**param)
xp = prj.fit_transform(act)

In [None]:
# Current 2-D embedding (`xp`) coloured by the predicted probability.
fig = plt.figure(figsize=(10,10))
scatter_kwargs = dict(linewidth=0.1, s=4, c=probs, cmap=cmap, facecolors="white")
plt.scatter(xp[:,0], xp[:,1], **scatter_kwargs)
plt.autoscale()
plt.savefig("%sperson_space_densmap_.svg" % save_path, format="svg")
plt.show()

In [None]:
## PACMAP
param = {
    "n_components": 2,
    "MN_ratio": 0.1,
    "FP_ratio": 2,
    "n_neighbors": 50,
    "distance": "angular",
    "random_state": 0,
}
prj = pacmap.PaCMAP(**param)
xp = prj.fit_transform(act)

In [None]:
# Current 2-D embedding (`xp`) coloured by the predicted probability.
fig = plt.figure(figsize=(10,10))
scatter_kwargs = dict(linewidth=0.1, s=4, c=probs, cmap=cmap, facecolors="white")
plt.scatter(xp[:,0], xp[:,1], **scatter_kwargs)
plt.autoscale()
plt.savefig("%sperson_space_pacmap_.svg" % save_path, format="svg")
plt.show()

In [None]:
###UMAP
# Plain UMAP with library defaults apart from the fixed seed.
param = {"n_components": 2, "random_state": 0}
prj = umap.UMAP(**param)
xp = prj.fit_transform(act)

In [None]:
# Current 2-D embedding (`xp`) coloured by the predicted probability.
fig = plt.figure(figsize=(10,10))
scatter_kwargs = dict(linewidth=0.1, s=4, c=probs, cmap=cmap, facecolors="white")
plt.scatter(xp[:,0], xp[:,1], **scatter_kwargs)
plt.autoscale()
plt.savefig("%sperson_space_umap_.svg" % save_path, format="svg")
plt.show()

In [None]:
# Re-fit the DensMAP embedding: `xp` was overwritten by the PaCMAP and UMAP
# cells above, and the figures that follow are drawn on this layout.
param = {
    "n_components": 2,
    "min_dist": 0.3,
    "n_neighbors": 50,
    "densmap": True,
    "dens_lambda": 0.8,
    "metric": "euclidean",
    "random_state": 0,
    "init": "spectral",
}
prj = umap.UMAP(**param)
xp = prj.fit_transform(act)
#xps.append(xp)
#params.append(param)
#best
# dict(n_components=2,
#              min_dist=0.3, 
#              n_neighbors = 50,
#              densmap=True,
#              dens_lambda=0.6, #0.7 #0.8
#              metric="euclidean", 
#              random_state=0, 
#              init="spectral")

In [None]:
# Alias: the plotting cells below refer to the predicted probabilities as `preds`.
preds = probs

In [None]:
# 3x3 overview of the current embedding `xp`, each panel coloured by a different attribute.
fig, ax = plt.subplots(3,3 , figsize=(20,20))
# Marker style shared by the panels that draw the whole point cloud.
style = dict( s=3, alpha=0.95, edgecolor="white", linewidths=0.1)
# Categorical palettes: position in the list == bin index returned by np.digitize.
prob_bin_colors = ["#4f4f4f", "#009E73", "#cc79a7", "#56b4e9", "#e69f00"]
prob_bin_labels = ["<10%","10%-50%", "50%-75%",  "75%-90%", ">90%"]
len_bin_colors = ["#4f4f4f", "#009E73", "#cc79a7"]
len_bin_labels = ["<1048","[1048, 1348]", ">1348"]
male_color, female_color = "#e69f00", "#009E73"

###################
### PROBABILITIES
ax[0,0].scatter(xp[:,0], xp[:,1], c=preds, cmap=cmap, **style)
ax[0,0].set_title(" Randomly Sampled Embeddings (estimated probabilities)")

####################
### TARGETS
ax[0,1].set_title(" Randomly Sampled Embeddings (true targets) ")
_a = xp[(trgs==0) & (unlb==0)]   # target 0, not unlabeled
_b = xp[trgs==1]                 # target 1
_c = xp[(trgs==0) & (unlb==1)]   # target 0, unlabeled
# Constant colours take no cmap: matplotlib ignores it and warns when `c` is a colour name.
ax[0,1].scatter(_a[:,0], _a[:,1], c="gray", alpha=0.2, s=1)
ax[0,1].scatter(_b[:,0], _b[:,1], c="black", s=1)
ax[0,1].scatter(_c[:,0], _c[:,1], c="orange", s=1)

###################
### QUANTIZED Preds
ax[1,0].set_title("Randomly Sampled Embeddings (binned probabilities)")
# Bin index 0..4 per person; kept as float because it is also the SVC label below.
qt  = np.digitize(meta["prb"], [0.1,0.5, 0.75, 0.9]).astype(float)
color = [prob_bin_colors[int(q)] for q in qt]
ax[1,0].scatter(xp[:,0], xp[:,1], c=color, **style)
p = [patches.Patch(color=col, label=lab) for col, lab in zip(prob_bin_colors, prob_bin_labels)]
ax[1,0].legend(handles=p)

#####################
### QUANTIZED PREDS with AREA
ax[2,2].set_title("Randomly Sampled Embeddings (binned probabilities with decision regions)")
# An RBF SVM fitted on the 2-D coordinates predicts the probability bin; its
# predictions on a 100x100 grid are drawn as a faint background.
probs_model = SVC(kernel="rbf")
probs_model.fit(xp, qt)
xx, yy = np.meshgrid(np.linspace(xp[:,0].min(), xp[:,0].max(), 100),
                     np.linspace(xp[:,1].min(), xp[:,1].max(), 100))
grid = np.vstack([xx.ravel(), yy.ravel()]).T
zz = np.reshape(probs_model.predict(grid), xx.shape)
ax[2,2].pcolormesh(xx, yy, zz, cmap="Spectral", alpha=0.2, shading="nearest")
ax[2,2].scatter(xp[:,0], xp[:,1], c=preds, cmap=cmap, **style)

#####################
### AGE
ax[1,1].set_title("Randomly Sampled Embeddings (colored by age)")
# Metadata column 4 is shifted by 26, divided by 35 and negated before colour-mapping
# (presumably an age rescaling; confirm the encoding of that column).
ax[1,1].scatter(xp[:,0], xp[:,1], c=-(meta["metadata"][:,4:5] - 26)/35, cmap=cmap, **style)

#####################
### SEX
ax[2,0].set_title("Randomly Sampled Embeddings (sex)")
# Metadata column 2 == 10 is drawn and labelled as male, everything else as female.
color = [male_color if code == 10 else female_color for code in np.array(meta["metadata"][:,2])]
p = [patches.Patch(color=col, label=lab) for col, lab in zip([male_color, female_color],
                                          ["male","female"])]
ax[2,0].legend(handles=p)
ax[2,0].scatter(xp[:,0], xp[:,1],  c= color, **style)

#####################
### HAS HEALTH
ax[2,1].set_title("Randomly Sampled Embeddings (presence of health events)")
mask = np.array(meta["has_health"])
ax[2,1].scatter(xp[mask,0], xp[mask,1], c="#009E73",  s=2,  label = "Have health records")
mask = ~mask
ax[2,1].scatter(xp[mask,0], xp[mask,1], c="red",  s=2, label = "No health records")
ax[2,1].legend()

####################
### CERTAINTY
ax[0,2].set_title("Randomly Sampled Embeddings (probabilities and certainties)")
# `certain` is 1 where kappa < theta, so its negation selects the confident predictions.
mask = ~np.array(certain).astype(bool)
ax[0,2].scatter(xp[mask,0], xp[mask,1], c=preds[mask], cmap=cmap, **style, label="Certain Predictions",)
ax[0,2].scatter(xp[~mask,0], xp[~mask,1],  marker="D", c = "#4e4e4e", s=1,
                 alpha=0.5, edgecolor="white", linewidths=0.1, label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

#####################
### NUMBER OF RECORDS
ax[1,2].set_title("Randomly Sampled Embeddings (length of the sequence)")
# Three length bins split at 1048 and 1348.
qt = np.digitize(meta["seqlen"], [1048., 1348.])
color = [len_bin_colors[q] for q in qt]
ax[1,2].scatter(xp[:,0], xp[:,1], c=color, **style)
p = [patches.Patch(color=col, label=lab) for col, lab in zip(len_bin_colors, len_bin_labels)]
ax[1,2].legend(handles=p)

sns.despine()
plt.tight_layout()
plt.savefig(save_path + "people_embedding_3x3.svg", format="svg")
plt.show()

In [None]:
# Square regions of interest in embedding coordinates: r* is the top-left
# corner (x, y) and w* the side length (the convention plot_square draws).
rA, wA = (7,20), 5
rB, wB = (0,-3), 3
rC, wC = (-5,10), 3

## Region

In [None]:
### Sort points by metadata column 4 (the column the age plots use), not by x
_age_col = np.array(meta["metadata"][:,4:5]).reshape(-1)
sort_idx = np.argsort(_age_col)
xs = xp[sort_idx]                 # embedding rows in sorted order
_meta_age = _age_col[sort_idx]
_meta_sex = np.array(meta["metadata"][:,2]).reshape(-1)[sort_idx]

In [None]:
# Region to zoom into (switch rC/wC/"C" to inspect another square).
r, w, title = rC, wC, "C"
# Open bounds of the square: x in (r[0], r[0] + w), y in (r[1] - w, r[1]).
x_lo, x_hi = r[0], r[0] + w
y_lo, y_hi = r[1] - w, r[1]
region_mask = (xs[:,0] > x_lo) & (xs[:,0] < x_hi) & (xs[:,1] > y_lo) & (xs[:,1] < y_hi)

In [None]:
# Zoom on the selected region, coloured with the same rescaling of the age
# column as the overview grid.
fig = plt.figure(figsize=(5,5))

plt.scatter(xs[region_mask,0]  , xs[region_mask,1], c=-(_meta_age[region_mask] - 26)/35,s=100, edgecolor="white", linewidths=0.5, cmap=cmap, alpha=0.8)
plt.title("Region %s (age)" %title)
plt.axis("off")
plt.tight_layout()
# File name follows the sibling region plots (pe_targ_<region>, pe_sex_<region>);
# the stray hard-coded "A" before the region letter is dropped.
plt.savefig(save_path + "pe_age_%s.svg" %title, format="svg")
plt.show()

In [None]:
# Zoom on the selected region, split by label.
fig = plt.figure(figsize=(5,5))
__x = xs[region_mask]

# Targets / unlabeled flags reordered like `xs`, then restricted to the region.
_t = trgs[sort_idx][region_mask]
_u =  unlb[sort_idx][region_mask]
a = __x[(_t==0) & (_u==0)]   # target 0, not unlabeled
b = __x[_t==1]               # target 1
z = __x[(_t==0) & (_u==1)]   # target 0, unlabeled
plt.title("Region %s (colored by targets)" %title)

plt.scatter(a[:,0], a[:,1], c="gray",  s=100, alpha=0.5, label = "True Alive", edgecolor="white", linewidths=1.)
plt.scatter(b[:,0], b[:,1], c="black", s=90, alpha=0.85, label="True Deceased", edgecolor="red", linewidth=5, marker="D")
plt.scatter(z[:,0], z[:,1], c="black", s=90, alpha= 0.85,label = "Unlabeled", edgecolor="violet", linewidth=5, marker="D")
plt.axis("off")
plt.tight_layout()
plt.savefig(save_path + "pe_targ_%s.svg" %title, format="svg")
plt.show()

In [None]:
# Zoom on the selected region, coloured by sex (metadata column 2 == 10 gets
# the colour the overview legend labels "male").
fig = plt.figure(figsize=(5,5))
_meta_sex = np.array(meta["metadata"][:,2]).reshape(-1)[sort_idx]

color = ["#e69f00" if code == 10 else "#009E73" for code in _meta_sex]
region_colors = np.array(color)[region_mask]
plt.scatter(xs[region_mask,0], xs[region_mask,1],  c= region_colors, s=100, alpha=0.85, edgecolor="white", linewidths=1)
plt.axis("off")
plt.title("Region %s (colored by sex)" %title)
plt.tight_layout()
plt.savefig(save_path + "pe_sex_%s.svg" %title, format="svg")
plt.show()

In [None]:
# Among the true positives (target == 1): how many have a predicted probability of at least theta.
t = np.array(meta["targets"]).reshape(-1)   # flattened like `trgs`, so the boolean index is 1-D
c = np.array(meta["prb"])[t == 1]
tp = int(np.count_nonzero(c >= theta))
# The label promises a fraction, so report the share and keep the raw count next to it.
tp_fraction = tp / c.shape[0] if c.shape[0] else float("nan")
print("Fraction of TP:", tp_fraction, "(%d of %d)" % (tp, c.shape[0]))

In [None]:
# Extent of the embedding along each axis (max - min). |max| + |min| only equals
# the extent when the two bounds have opposite signs.
"x size: %.2f" %(np.max(xp[:,0]) - np.min(xp[:,0])), "y size: %.2f"  %(np.max(xp[:,1]) - np.min(xp[:,1]))

In [None]:
# Selective-classification view of the embedding `xp`.
fig = plt.figure(figsize=(10,10))
# `certain` is 1 where kappa < theta, so this mask selects the UNCERTAIN predictions.
mask = np.array(certain).astype(bool)
_trgs = np.array(trgs).astype(bool)
# Coverage is the share of predictions the selective classifier keeps
# (kappa >= theta), i.e. the complement of the uncertain share.
coverage = 1.0 - mask.mean()
# Uncertain predictions: tiny hollow markers, edge colour from the probability colormap.
plt.scatter(xp[mask,0], xp[mask,1],   linewidth = 0.5, s=0.5,
              edgecolors = cmap(preds[mask]), facecolors="white",
              label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

# Outline the zoom regions A and C.
plot_square(rA,wA)
#plot_square(rB,wB)
plot_square(rC,wC)
mask = ~mask   # from here on: the confident predictions
plt.scatter(xp[mask,0], xp[mask,1], c=cmap(preds[mask]),  s=10,  label = "Certain Predictions", edgecolor="white", linewidth=0.2)
# Points with target 1 are drawn last, as red-edged diamonds.
plt.scatter(xp[_trgs,0], xp[_trgs,1], c= cmap(preds[_trgs]), edgecolor="red", s=7, linewidth=2, marker="D", label = "True Deceased")

plt.title("Randomly Sampled Embeddings (Selective Classification: theta=%.2f, risk=%.2f, coverage=%.2f)" %(theta, risk, coverage))
#plt.legend()
sns.despine()
plt.axis("scaled")
plt.tight_layout()
plt.savefig(save_path + "people_embedding.svg", format="svg")
plt.show()

In [None]:
# Alternative 2-D layouts of the same activations, stored under separate names.
param = dict(n_components=2, MN_ratio=0.1, FP_ratio=2,  n_neighbors=50, distance="angular", random_state=0)
prj = pacmap.PaCMAP(**param) #mn=0.1, fp=2, nn=10 dist = ang
xp_pac = prj.fit_transform(act)

prj = trimap.TRIMAP(n_dims=2)
xp_tri = prj.fit_transform(act)

# UMAP lives in the `umap` package (there is no trimap.UMAP) and selects its
# distance through `metric`. "cosine" is used as the closest UMAP counterpart
# of the "angular" distance above - NOTE(review): confirm this is the intended metric.
prj = umap.UMAP(n_components=2, metric="cosine", random_state=0)

xp_ump = prj.fit_transform(act)

In [None]:

# Selective-classification view on the PaCMAP layout. The figure is saved as
# "..._pacmap", so it is drawn from `xp_pac` rather than from whatever `xp` currently holds.
fig = plt.figure(figsize=(10,10))
# `certain` is 1 where kappa < theta, so this mask selects the UNCERTAIN predictions.
mask = np.array(certain).astype(bool)
_trgs = np.array(trgs).astype(bool)
# Coverage is the share of predictions the selective classifier keeps
# (kappa >= theta), i.e. the complement of the uncertain share.
coverage = 1.0 - mask.mean()
plt.scatter(xp_pac[mask,0], xp_pac[mask,1],   linewidth = 0.5, s=0.5,
              edgecolors = cmap(preds[mask]), facecolors="white",
              label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

mask = ~mask   # from here on: the confident predictions
plt.scatter(xp_pac[mask,0], xp_pac[mask,1], c=cmap(preds[mask]),  s=10,  label = "Certain Predictions", edgecolor="white", linewidth=0.2)
# Points with target 1 are drawn last, as violet-edged diamonds.
plt.scatter(xp_pac[_trgs,0], xp_pac[_trgs,1], c= cmap(preds[_trgs]), edgecolor="violet", s=8, linewidth=2, marker="D", label = "True Deceased")

plt.title("Randomly Sampled Embeddings (Selective Classification: theta=%.2f, risk=%.2f, coverage=%.2f)" %(theta, risk, coverage))
#plt.legend()
sns.despine()
plt.axis("scaled")
plt.tight_layout()
plt.savefig(save_path + "people_embedding_pacmap.svg", format="svg")
plt.show()

In [None]:
# TriMap layout; overwrites `xp`, which the next plotting cell draws.
prj = trimap.TRIMAP(n_dims=2)
xp = prj.fit_transform(act)

In [None]:

# Selective-classification view of the embedding `xp` (TriMap layout).
fig = plt.figure(figsize=(10,10))
# `certain` is 1 where kappa < theta, so this mask selects the UNCERTAIN predictions.
mask = np.array(certain).astype(bool)
_trgs = np.array(trgs).astype(bool)
# Coverage is the share of predictions the selective classifier keeps
# (kappa >= theta), i.e. the complement of the uncertain share.
coverage = 1.0 - mask.mean()
plt.scatter(xp[mask,0], xp[mask,1],   linewidth = 0.5, s=0.5,
              edgecolors = cmap(preds[mask]), facecolors="white",
              label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

mask = ~mask   # from here on: the confident predictions
plt.scatter(xp[mask,0], xp[mask,1], c=cmap(preds[mask]),  s=10,  label = "Certain Predictions", edgecolor="white", linewidth=0.2)
# Points with target 1 are drawn last, as violet-edged diamonds.
plt.scatter(xp[_trgs,0], xp[_trgs,1], c= cmap(preds[_trgs]), edgecolor="violet", s=8, linewidth=2, marker="D", label = "True Deceased")

plt.title("Randomly Sampled Embeddings (Selective Classification: theta=%.2f, risk=%.2f, coverage=%.2f)" %(theta, risk, coverage))
#plt.legend()
sns.despine()
plt.tight_layout()
plt.savefig(save_path + "people_embedding_trimap.svg", format="svg")
plt.show()

In [None]:
# Plain UMAP layout with a fixed seed; overwrites `xp`, which the next plotting cell draws.
prj = umap.UMAP(n_components=2, random_state=0)
xp = prj.fit_transform(act)

In [None]:
# Selective-classification view of the embedding `xp` (UMAP layout).
fig = plt.figure(figsize=(10,10))
# `certain` is 1 where kappa < theta, so this mask selects the UNCERTAIN predictions.
mask = np.array(certain).astype(bool)
_trgs = np.array(trgs).astype(bool)
# Coverage is the share of predictions the selective classifier keeps
# (kappa >= theta), i.e. the complement of the uncertain share.
coverage = 1.0 - mask.mean()
plt.scatter(xp[mask,0], xp[mask,1],   linewidth = 0.5, s=0.5,
              edgecolors = cmap(preds[mask]), facecolors="white",
              label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

mask = ~mask   # from here on: the confident predictions
plt.scatter(xp[mask,0], xp[mask,1], c=cmap(preds[mask]),  s=10,  label = "Certain Predictions", edgecolor="white", linewidth=0.2)
# Points with target 1 are drawn last, as violet-edged diamonds.
plt.scatter(xp[_trgs,0], xp[_trgs,1], c= cmap(preds[_trgs]), edgecolor="violet", s=8, linewidth=2, marker="D", label = "True Deceased")

plt.title("Randomly Sampled Embeddings (Selective Classification: theta=%.2f, risk=%.2f, coverage=%.2f)" %(theta, risk, coverage))
#plt.legend()
sns.despine()
#plt.axis("scaled")
plt.tight_layout()
plt.savefig(save_path + "people_embedding_umap.svg", format="svg")
plt.show()

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE layout with a fixed seed; overwrites `xp`, which the next plotting cell draws.
prj = TSNE(n_components=2, random_state=0)
xp = prj.fit_transform(act)

In [None]:
# Selective-classification view of the embedding `xp` (t-SNE layout).
fig = plt.figure(figsize=(10,10))
# `certain` is 1 where kappa < theta, so this mask selects the UNCERTAIN predictions.
mask = np.array(certain).astype(bool)
_trgs = np.array(trgs).astype(bool)
# Coverage is the share of predictions the selective classifier keeps
# (kappa >= theta), i.e. the complement of the uncertain share.
coverage = 1.0 - mask.mean()
plt.scatter(xp[mask,0], xp[mask,1],   linewidth = 0.5, s=0.5,
              edgecolors = cmap(preds[mask]), facecolors="white",
              label = "Uncertain Predictions, p in [%.2f, %.2f]" %(1 - theta, theta))

mask = ~mask   # from here on: the confident predictions
plt.scatter(xp[mask,0], xp[mask,1], c=cmap(preds[mask]),  s=10,  label = "Certain Predictions", edgecolor="white", linewidth=0.2)
# Points with target 1 are drawn last, as violet-edged diamonds.
plt.scatter(xp[_trgs,0], xp[_trgs,1], c= cmap(preds[_trgs]), edgecolor="violet", s=8, linewidth=2, marker="D", label = "True Deceased")

plt.title("Randomly Sampled Embeddings (Selective Classification: theta=%.2f, risk=%.2f, coverage=%.2f)" %(theta, risk, coverage))
#plt.legend()
sns.despine()
#plt.axis("scaled")
plt.tight_layout()
plt.savefig(save_path + "people_embedding_tsne.svg", format="svg")
plt.show()

# 2. Concept Directions

In [None]:
def load_concepts(c):
    """Load the pickled concept activation vectors stored for concept ``c``.

    The file is ``../analysis/tcav/cavs/<v>_decoder.identity/<c>.pkl``, where
    ``v`` is the notebook-level version tag. The path is assembled with
    ``os.path.join`` so that it resolves on every OS; the hard-coded
    backslashes used before only worked on Windows.

    Args:
        c: concept name, i.e. the file stem of the pickle.

    Returns:
        The unpickled object (callers average it over axis 0).

    Raises:
        OSError: if the file cannot be opened.
    """
    file_path = os.path.join("..", "analysis", "tcav", "cavs",
                             "%s_decoder.identity" % v, "%s.pkl" % c)
    # NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
    with open(file_path, "rb") as f:
        return pickle.load(f)

In [None]:
def _mean_direction(concept):
    """Average the stored vectors of ``concept`` along axis 0 into one direction."""
    return np.mean(load_concepts(concept), axis=0)

# One averaged direction per concept.
x_mental = _mean_direction("mental")
x_agric = _mean_direction("agriculture")
x_craft = _mean_direction("crafts")
x_income = _mean_direction("income")
x_manager = _mean_direction("managers")
x_mf = _mean_direction("sex_mf")
x_fm = _mean_direction("sex_fm")

x_prof = _mean_direction("professionals")
x_infect = _mean_direction("infection")
x_neural = _mean_direction("neural")

In [None]:
# Scatter the activations in the plane spanned by two concept directions
# (coordinates are plain dot products with the averaged directions).
t = np.array(meta["targets"])
c = meta["prb"]
# 1.0 where metadata column 2 equals 10, else 0.0 - colours the sex panel.
c_ = (meta["metadata"][:,2] == 10).astype(float)

fig, ax = plt.subplots(2,2 , figsize=(20,20))
# (axes, x-direction, y-direction, colours, title, x-label, y-label)
panels = [
    (ax[0,0], x_mental, x_agric, c, "Projection on  Mental-Agriculture Concept", "Mental Direction", "Agriculture Direction"),
    (ax[0,1], x_craft, x_income, c, "Projection on  Crafts-Income Concept", "Crafts", "Income"),
    (ax[1,0], x_mf, x_fm, c_, "Projection on  Female-Male Concept", "Female", "Male"),
    (ax[1,1], x_infect, x_neural, c, "Projection on  Infection-Neural Concept", "Infection", "Neural"),
]
for axis, dir_x, dir_y, colors, panel_title, xlabel, ylabel in panels:
    axis.scatter(np.dot(act, dir_x), np.dot(act, dir_y), c=colors, s=5, cmap=cmap)
    axis.set_title(panel_title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig(save_path + "tcav_projections.svg", format="svg")
plt.show()


In [None]:
def norm(x):
    """Standardise ``x``: subtract its mean and divide by its (population) standard deviation.

    The mean and standard deviation are taken over all elements of the array.
    """
    arr = np.array(x)
    centered = arr - arr.mean()
    return centered / arr.std()
from scipy.spatial.distance import cosine

In [None]:
def get_score(x,y):
    """Cosine distance between every row of ``x`` and the vector ``y``.

    Returns a list with one ``scipy.spatial.distance.cosine`` value per row.
    """
    return [cosine(x[row], y) for row in range(x.shape[0])]

In [None]:
# `x_pacmap_n` (PaCMAP of the raw activations) was only ever assigned in
# commented-out code, so it is computed here to keep the cell runnable on a
# fresh kernel. The seed is fixed so the layout is reproducible.
x_pacmap_n = pacmap.PaCMAP(n_components=2, random_state=0).fit_transform(act)

fig, ax = plt.subplots(3,2 , figsize=(30,45))
# (axes, concept direction, title). Each panel colours the same layout by the
# standardised dot product between the activations and one concept direction.
panels = [
    (ax[0,0], x_mf, " Randomly Sampled Embeddings (Distance to FEMALE Direction)"),
    (ax[0,1], x_mental, " Randomly Sampled Embeddings (Distance to MENTAL Direction)"),
    (ax[1,0], x_income, " Randomly Sampled Embeddings (Distance to HIGH INCOME Direction)"),
    (ax[1,1], x_prof, " Randomly Sampled Embeddings (Distance to ASSOC. PROF Direction)"),
    (ax[2,1], x_craft, " Randomly Sampled Embeddings (Distance to CRAFTS Direction)"),
    (ax[2,0], x_neural, " Randomly Sampled Embeddings (Distance to NEURAL Direction)"),
]
for axis, direction, panel_title in panels:
    axis.scatter(x_pacmap_n[:,0], x_pacmap_n[:,1], c= norm(np.dot(act, direction)),s=5, cmap=cmc.berlin)
    axis.set_title(panel_title)

plt.tight_layout()
plt.savefig(save_path + "people_embedding_scoring_pacmap.svg", format="svg")
plt.show()

#### Edge Cases

In [None]:
# Reload the metadata pickle from disk. This replaces the `meta` dict built
# earlier, so the keys added there ("has_health", "seqlen") are no longer present.
# NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
with open('../analysis/tcav/sample_meta/%s_decoder.identity/meta.pkl' %v, "rb") as f:
    meta = pickle.load(f)  
    # Softmax over the last axis of the raw predictions; column 1 is kept as "prb".
    meta["prb"] = softmax(meta["predictions"], -1)[:,1]

In [None]:
# Parser for timestamps formatted like '%d%b%Y:%X'; unparseable values become NaT.
dateparser = lambda x: pd.to_datetime(x, format = '%d%b%Y:%X',  errors='coerce')

# NOTE(review): absolute, machine-specific path - consider moving it to a configurable data directory.
lines_data = pd.read_csv(r"K:\\22SSI\\Germans\\rawdata\\eos\PRETTY_LINES_V3.csv",  encoding="latin", sep = ";",
                 usecols=["PERSON_ID", "EVENT_CAUSE_FINAL", "EVENT_FINAL_DATE", "QUALITY_INFORMATION_FINAL", "KILDE_FINAL", "NUMBER_EVENTS_PERSON"])
# read_csv's `date_parser` argument is deprecated in pandas 2.x, so the column
# is converted after loading with the same parser.
lines_data["EVENT_FINAL_DATE"] = dateparser(lines_data["EVENT_FINAL_DATE"])

In [None]:
# Semicolon-separated, latin-encoded table; the edge-case cell below reads its PERSON_ID and C_*/V_ALDER columns.
dsa = pd.read_csv(r"..\\rawdata\\eos\DODSAASG2019.csv", encoding="latin", sep = ";",)

In [None]:
# Edge cases: sequences with target 1 whose predicted probability is at most 0.07.
# (Alternative selection, kept for reference: (t == 1) & (probs >= 0.82) & (probs < 1.))
low_prob_positive = (t == 1) & (probs <= 0.07)
edges = np.array(meta["sequence_ids"])[low_prob_positive]
# Columns of `dsa` to inspect for those people.
dsa_cols = ["V_ALDER", "C_DOD_1A", "C_DODSMAADE", "C_DODTILGRUNDL_ACME", "C_LISTEA", "C_LISTEB", "PERSON_ID"]
r = dsa.loc[dsa["PERSON_ID"].isin(edges), dsa_cols]

In [None]:
# Share of the V_ALDER total contributed by the last three rows of `r`.
# NOTE(review): the intent of this ratio is not evident from the code - confirm what it should measure.
rr = r["V_ALDER"].values
rr[-3:].sum()/rr.sum()

In [None]:
# Display the selected rows. NOTE: the rendered output holds person-level records - clear it before sharing the notebook.
r