In [None]:
# STL
import gc
# 3rd Party
import torch
from tqdm import tqdm
import plotly.graph_objects as go
import numpy as np
# Local
from gatbert.utils import batched

In [None]:
# Change this for your own analysis.
# Path to the pickled entity-embedding object that the following cell loads with torch.load.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
entity_path = "/home/ethanlmines/blue_dir/models/cn_bert_mar22/entities.pkl"

In [None]:
# Load the saved object and keep only its `.weight` tensor as the embedding matrix.
# NOTE(review): presumably the pickle holds an embedding module (it exposes `.weight`) — confirm.
# SECURITY: weights_only=False makes torch.load fully unpickle the file, which can execute
# arbitrary code embedded in it; only point entity_path at checkpoints you trust.
embedding_mat = torch.load(entity_path, weights_only=False).weight

In [None]:
# Normalize every row to unit L2 norm so that a dot product is equivalent to cosine similarity.
# This is a single vectorized in-place operation rather than a Python loop over the rows.
with torch.no_grad():
    # keepdim=True keeps the norms as a column, so each row is divided by its own norm.
    embedding_mat /= torch.linalg.vector_norm(embedding_mat, dim=1, keepdim=True)

In [None]:
# Pairwise dot products between all embedding rows (cosine similarities once the rows have
# been unit-normalized by the preceding cell). The full N x N result lives on the host in
# float16; it is filled one column block at a time so that only an N x batch_size product
# is materialized per step.
num_entities = embedding_mat.shape[0]
sim_vals = np.empty([num_entities, num_entities], dtype=np.float16)
batch_size = 32
# Ceiling division: a trailing partial batch still counts, so the progress-bar total is exact.
est_batches = -(-num_entities // batch_size)
with torch.no_grad():
    # NOTE(review): `batched` is presumably yielding consecutive index chunks of at most
    # batch_size elements — confirm against gatbert.utils.
    for tail_inds in tqdm(batched(range(num_entities), batch_size), total=est_batches):
        tail_sim_vals = embedding_mat @ embedding_mat[tail_inds].transpose(1, 0)
        # Explicit tensor -> ndarray conversion; the values are cast to float16 on assignment.
        sim_vals[:, tail_inds] = tail_sim_vals.cpu().numpy()
# Filter out the self-loops: the similarity of every embedding with itself is zeroed.
np.fill_diagonal(sim_vals, 0.)
# reshape(-1) returns a view of the freshly allocated contiguous array, whereas flatten()
# would copy all N*N values and double peak host memory.
sim_vals = sim_vals.reshape(-1)

In [None]:
# Copy the flattened similarity values into a tensor on the same device as the embeddings,
# so the threshold counts further down run there. This has to execute before the cell that
# deletes embedding_mat, because the target device is read from it.
gpu_sim_vals = torch.tensor(sim_vals, device=embedding_mat.device)

In [None]:
# Drop the references to the last similarity block and to the embedding matrix, then ask
# the garbage collector to run a collection pass.
del tail_sim_vals
del embedding_mat
gc.collect()

In [None]:
# Report the total number of stored similarity values, followed by how many of them
# reach each cutoff.
print("Total: ", gpu_sim_vals.shape[0])
for thresh in (0.90, 0.925, 0.95, 0.975):
    print(f">= {thresh}: ", (gpu_sim_vals >= thresh).sum())

## Plotting the Similarity Values
As Malaviya et al. (2019) note, these similarity values follow a Gaussian distribution. To avoid memory issues, we plot only a subset of the similarity values for illustration.

In [None]:
# Take the leading 100,000 similarity values and widen them from float16 to float64.
subset = np.asarray(sim_vals[:100_000], dtype=np.float64)

In [None]:
# This is enough samples to see the Gaussian distribution without using absurd amounts of memory.
# Plot the float64 `subset` prepared in the previous cell instead of re-slicing the float16
# array, which left `subset` unused.
fig = go.Figure()
fig.add_trace(go.Histogram(x=subset))
# Title and axis labels so the figure stands on its own when the notebook is skimmed.
fig.update_layout(
    title="Distribution of pairwise cosine similarities (first 100,000 values)",
    xaxis_title="Cosine similarity",
    yaxis_title="Count",
)
fig