In [39]:
import torchhd
import torch
import plotly.express as px
import pandas as pd
import tqdm

In [40]:
import sys
sys.path.append('..')
from shared_code.helpers import norm_hamming_similarity, theoretical_similarity, similarity_cutoff

# Experiment setup

In [41]:
# DIMENSIONS = [2**i for i in range(6, 14)]
DIMENSIONS = [x for x in range (50, 2000, 50)]
BUNDLE_SIZES = list(range(10, 200, 10))
BUNDLE_SIZES = list(range(4, 20, 2))
BUNDLE_SIZES = [5]
BUNDLE_SIZES = [20]
ITEM_MEMORY_SIZE = 10_000

In [42]:
num_exp = 100  # Number of times to repeat the experiment for a certain bundle size and dimension
recall = 0.995

metric_averages = torch.zeros(len(BUNDLE_SIZES), len(DIMENSIONS), 3)  # recall, precision, accuracy

for i, bundle_size in enumerate(BUNDLE_SIZES):
  for j, dim in enumerate(DIMENSIONS):
    print(f"n={dim}, k={bundle_size}")
    item_memory = torchhd.random(ITEM_MEMORY_SIZE, dim, vsa='BSC')
    # Generate random memory vectors
    # Compute similarity cutoff 
    theoretical_similarities = theoretical_similarity(bundle_size, 'BSC')
    cutoff = similarity_cutoff(bundle_size, dim, recall)
    print(f"Cutoff: {cutoff}")
    # 2d array to save values of recall, precision, and accuracy
    metrics = torch.zeros((num_exp, 3)) # recall, precision, accuracy

    for exp in range(num_exp):
      # Select k random memory vectors to form a bundle
      bundle_idxs = torch.randperm(ITEM_MEMORY_SIZE)[:bundle_size]
      other_idxs = [i for i in range(ITEM_MEMORY_SIZE) if i not in bundle_idxs]
      bundle = torchhd.multiset(item_memory[bundle_idxs])

      similarities_bundle = norm_hamming_similarity(bundle, item_memory)
      similarities_bundle_bundled = similarities_bundle[bundle_idxs]
      similarities_bundle_other = similarities_bundle[other_idxs]

      # Calculate the number of true positives, false positives, false negatives, and true negatives
      TP = (similarities_bundle_bundled > cutoff).sum().item()
      FP = (similarities_bundle_other > cutoff).sum().item()
      FN = (similarities_bundle_bundled < cutoff).sum().item()
      TN = (similarities_bundle_other < cutoff).sum().item()

      # Calculate recall, precision, and accuracy
      metrics[exp, 0] = TP / (TP + FN)
      metrics[exp, 1] = TP / (TP + FP)
      metrics[exp, 2] = (TP + TN) / ITEM_MEMORY_SIZE
      
    metric_averages[i, j, :] = metrics.mean(dim=0)
    print(f"n={dim}, k={bundle_size}")
    print(f"Similarity cutoff: {cutoff}")
    print(f"True positives: {TP}")
    print(f"False positives: {FP}")
    print(f"False negatives: {FN}")
    print(f"True negatives: {TN}")
    # print(f"Average recall: {metrics[:, 0].mean()}")
    # print(f"Average precision: {metrics[:, 1].mean()}")
    # print(f"Average accuracy: {metrics[:, 2].mean()}")
    # print("\n")

n=50, k=20
Cutoff: 0.40880946366645055


n=50, k=20
Similarity cutoff: 0.40880946366645055
True positives: 20
False positives: 8959
False negatives: 0
True negatives: 1021
n=100, k=20
Cutoff: 0.46132201423165564
n=100, k=20
Similarity cutoff: 0.46132201423165564
True positives: 20
False positives: 7539
False negatives: 0
True negatives: 2441
n=150, k=20
Cutoff: 0.4845859375993824
n=150, k=20
Similarity cutoff: 0.4845859375993824
True positives: 20
False positives: 6580
False negatives: 0
True negatives: 3400
n=200, k=20
Cutoff: 0.49845399483371355
n=200, k=20
Similarity cutoff: 0.49845399483371355
True positives: 20
False positives: 5357
False negatives: 0
True negatives: 4623
n=250, k=20
Cutoff: 0.5079180198005371
n=250, k=20
Similarity cutoff: 0.5079180198005371
True positives: 20
False positives: 4194
False negatives: 0
True negatives: 5786
n=300, k=20
Cutoff: 0.5149040728040374
n=300, k=20
Similarity cutoff: 0.5149040728040374
True positives: 20
False positives: 3033
False negatives: 0
True negatives: 6947
n=350, k=20
Cut

In [43]:
# y_ax_values are recall, precision, and accuracy averaged over bundle sizes
df = pd.DataFrame({'Dimension': DIMENSIONS, 'Recall': metric_averages.mean(dim=0)[:, 0], 'Precision': metric_averages.mean(dim=0)[:, 1], 'Accuracy': metric_averages.mean(dim=0)[:, 2]})
fig = px.line(x='Dimension', y=['Recall', 'Precision', 'Accuracy'], data_frame=df, title='Recall, Precision, and Accuracy vs Dimension')
fig.update_traces(textposition='top center')
fig.update_layout(title='Recall, Precision, and Accuracy vs Dimension, bundle size 20', xaxis_title='Dimension', yaxis_title='Metric Value')
fig.update_layout(margin=dict(r=(50)),font=dict(size=14,))

fig.show()

In [44]:
df = pd.DataFrame({'Bundle Size': BUNDLE_SIZES, 'Recall': metric_averages.mean(dim=1)[:, 0], 'Precision': metric_averages.mean(dim=1)[:, 1], 'Accuracy': metric_averages.mean(dim=1)[:, 2]})
fig = px.line(x='Bundle Size', y=['Recall', 'Precision', 'Accuracy'], data_frame=df, title='Recall, Precision, and Accuracy vs Bundle Size')
fig.update_traces(textposition='top center')
fig.update_layout(title='Recall, Precision, and Accuracy vs Bundle Size', xaxis_title='Bundle Size', yaxis_title='Metric Value')
fig.show()

In [45]:
# recall vs dimension
df = pd.DataFrame({'Dimension': DIMENSIONS, 'Recall': metric_averages.mean(dim=0)[:, 0]})
fig = px.line(x='Dimension', y='Recall', data_frame=df, title='Recall vs Dimension')
fig.update_traces(textposition='top center')
fig.update_layout(title='Recall vs Dimension', xaxis_title='Dimension', yaxis_title='Recall')
# fig.update_xaxes(type='log')
fig.show()

In [46]:
# recqll vs bundle size
df = pd.DataFrame({'Bundle Size': BUNDLE_SIZES, 'Recall': metric_averages.mean(dim=1)[:, 0]})
fig = px.line(x='Bundle Size', y='Recall', data_frame=df, title='Recall vs Bundle Size')
fig.update_traces(textposition='top center')
fig.update_layout(title='Recall vs Bundle Size', xaxis_title='Bundle Size', yaxis_title='Recall')
fig.show()