In [46]:
import torchhd, torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from HDDB import HDDB
from utils import similarity_func_partial
from functools import partial

### Loading the dataset

In [47]:
# Load the address table, trim surrounding whitespace from every text
# (object-dtype) column, and discard the 'address' column.
df = (
    pd.read_csv('data/addresses.csv')
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    .drop(columns=['address'])
)
df

Unnamed: 0,fname,lname,city,state,zip
0,John,Doe,Riverside,NJ,8075
1,Jack,McGinnis,Philadelphia,PA,9119
2,John,Repici,Riverside,NJ,8075
3,Stephen,Tyler,SomeTown,SD,91234
4,John,Blankman,SomeTown,SD,298
5,Joan,Anne,Desert City,CO,123
6,Jack,Repici,Riverside,NJ,8075
7,Lilly,Repici,Philadelphia,PA,9119


In [48]:
# Experiment configuration: vector dimension and VSA model handed to HDDB.
exp_dim = 10_000
exp_vsa_type = "MAP"

# Seed torch's global RNG before the database is created so the similarity
# scores printed by the query cells are repeatable on "Restart & Run All".
# NOTE(review): assumes HDDB draws its random vectors through torch/torchhd;
# if it uses another RNG (e.g. numpy), seed that one as well -- TODO confirm.
exp_seed = 0
torch.manual_seed(exp_seed)

# Similarity helper with the VSA type pre-applied as its first argument.
sim_func = partial(similarity_func_partial, exp_vsa_type)
hdDB = HDDB(dim=exp_dim, vsa_type=exp_vsa_type)

In [49]:
# Hand the DataFrame's column labels to the database. The query cells later
# look entries up by these labels via hdDB.columns[...] (e.g. 'state',
# 'lname'); presumably set_columns is what creates them -- TODO confirm.
hdDB.set_columns(df.columns)

In [50]:
# Insert every DataFrame row into the database, keyed by its index label.
for row_id, record in df.iterrows():
    hdDB.add_row(row_id, record.values)

In [51]:
# Query: rows whose 'state' is 'NJ'.
# Look the value vector up once and reuse it when building the query.
vec_NJ = hdDB.columns['state']['NJ']
# Bind the column's atomic vector with the value vector to form the query.
query_vec_NJ = torchhd.bind(hdDB.columns['state'].atomic_vector, vec_NJ)
# Five best matches as (row id, similarity) pairs.
results = hdDB.most_similar_rows(query_vec_NJ, 5)
print(results)
# Row ids are the index labels given to add_row, so select by label (.loc)
# instead of by position (.iloc); the two only coincide for a default index.
print(df.loc[[row_id for row_id, _score in results]])

[(2, 0.39079999923706055), (0, 0.38760000467300415), (6, 0.37439998984336853), (5, 0.009800000116229057), (7, 0.00419999985024333)]
   fname   lname          city state   zip
2   John  Repici     Riverside    NJ  8075
0   John     Doe     Riverside    NJ  8075
6   Jack  Repici     Riverside    NJ  8075
5   Joan    Anne   Desert City    CO   123
7  Lilly  Repici  Philadelphia    PA  9119


In [52]:
# Query: rows whose 'lname' is 'Repici'.
# Look the value vector up once and reuse it when building the query.
vec_lastname_repici = hdDB.columns['lname']['Repici']
# Bind the column's atomic vector with the value vector to form the query.
query_vec_lastname_repici = torchhd.bind(hdDB.columns['lname'].atomic_vector, vec_lastname_repici)
# Five best matches as (row id, similarity) pairs.
results = hdDB.most_similar_rows(query_vec_lastname_repici, 5)
print(results)
# Row ids are the index labels given to add_row, so select by label (.loc)
# instead of by position (.iloc); the two only coincide for a default index.
print(df.loc[[row_id for row_id, _score in results]])

[(7, 0.3862000107765198), (6, 0.3779999911785126), (2, 0.36480000615119934), (1, 0.01360000018030405), (5, 0.010200000368058681)]
   fname     lname          city state   zip
7  Lilly    Repici  Philadelphia    PA  9119
6   Jack    Repici     Riverside    NJ  8075
2   John    Repici     Riverside    NJ  8075
1   Jack  McGinnis  Philadelphia    PA  9119
5   Joan      Anne   Desert City    CO   123


In [56]:
# Stored vector for row id 6 (Jack Repici in the table shown earlier).
jack_repici_vec = hdDB[6]
state_column = hdDB.columns['state']

# Unbind the 'state' column vector to obtain the state that Jack Repici lives in.
jack_repici_state_vec = torchhd.bind(jack_repici_vec, torchhd.inverse(state_column.atomic_vector))
# Compare the recovered vector against two candidate states.
print(sim_func(jack_repici_state_vec, state_column['NJ']))
print(sim_func(jack_repici_state_vec, state_column['PA']))

# Combined query: "same state as Jack Repici" bundled with "last name is Repici"
# (query_vec_lastname_repici comes from the last-name query cell).
# NOTE(review): with the MAP model, binding is presumably its own inverse, so
# re-binding with the same column vector may just reproduce jack_repici_vec --
# confirm this is the intended query.
state_query_vec = torchhd.bind(state_column.atomic_vector, jack_repici_state_vec)
query_vec = torchhd.bundle(state_query_vec, query_vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec, 5)
print(results)
# Row ids are the index labels given to add_row, so select by label (.loc)
# instead of by position (.iloc); the two only coincide for a default index.
print(df.loc[[row_id for row_id, _score in results]])


MAPTensor(0.3744)
MAPTensor(-0.0022)
[(6, 0.8300603032112122), (2, 0.5948966145515442), (7, 0.32696425914764404), (0, 0.25299370288848877), (1, 0.09782423824071884)]
   fname     lname          city state   zip
6   Jack    Repici     Riverside    NJ  8075
2   John    Repici     Riverside    NJ  8075
7  Lilly    Repici  Philadelphia    PA  9119
0   John       Doe     Riverside    NJ  8075
1   Jack  McGinnis  Philadelphia    PA  9119
