In [2]:
import torchhd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from HDDB import HDDB
from functools import partial

In [3]:
import sys
sys.path.append('../')
from shared_code.helpers import similarity_func_partial

### Loading the dataset

In [4]:
# Load the raw address records.
df = pd.read_csv('data/addresses.csv')

# Trim surrounding whitespace in every text (object-dtype) column; columns of
# any other dtype are passed through untouched.
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

# Drop the street address column (the zip column is kept).
df = df.drop(columns='address')
print(df.dtypes)

# Reorder the remaining columns.
df = df.loc[:, ['zip', 'city', 'state', 'fname', 'lname']]
df

fname    object
lname    object
city     object
state    object
zip       int64
dtype: object


Unnamed: 0,zip,city,state,fname,lname
0,8075,Riverside,NJ,John,Doe
1,9119,Philadelphia,PA,Jack,McGinnis
2,8075,Riverside,NJ,John,Repici
3,91234,Sioux Falls,SD,Stephen,Tyler
4,298,Sioux Falls,SD,John,Blankman
5,123,Denver,CO,Joan,Anne
6,8075,Riverside,NJ,Jack,Repici
7,9119,Philadelphia,PA,Lilly,Repici


### Extending the dataset

In [5]:
# Reference tables used to synthesise extra rows: the states table (read here
# for its `state` and `abbreviation` columns) and the `us-cities-top-1k` city
# list. Column labels are normalised to stripped lower case.
all_states_df = pd.read_csv('data/states.csv')
all_states_df.columns = all_states_df.columns.str.strip().str.lower()
print(len(all_states_df['abbreviation'].unique()))

all_cities_df = pd.read_csv('data/us-cities-top-1k.csv')
all_cities_df.columns = all_cities_df.columns.str.strip().str.lower()

# State name -> abbreviation lookup. drop_duplicates keeps the first entry per
# state name (Series.map needs a unique index), so a repeated state name
# resolves to its first abbreviation.
state_to_abbreviation = (
    all_states_df
    .drop_duplicates(subset='state')
    .set_index('state')['abbreviation']
)

# Fail loudly, naming the offending states, if a city refers to a state that
# is missing from the states table.
unmatched_states = all_cities_df.loc[
    ~all_cities_df['state'].isin(state_to_abbreviation.index), 'state'
].unique()
if len(unmatched_states) > 0:
    raise ValueError(f"No abbreviation found for state(s): {list(unmatched_states)}")

# One vectorised lookup instead of filtering the states table once per city.
all_cities_df['abbreviation'] = all_cities_df['state'].map(state_to_abbreviation)

print(len(all_cities_df['abbreviation'].unique()))
# Spot check: the cities listed for one state.
print(all_cities_df[all_cities_df['abbreviation'] == 'UT'])

51
51
                 city state  population        lat         lon abbreviation
31              Logan  Utah       48913  41.736980 -111.833836           UT
193      South Jordan  Utah       59366  40.562170 -111.929658           UT
285         Bountiful  Utah       43023  40.889390 -111.880771           UT
381             Provo  Utah      116288  40.233844 -111.658534           UT
434      Spanish Fork  Utah       36956  40.114955 -111.654923           UT
440      Taylorsville  Utah       60519  40.667725 -111.938826           UT
466          Riverton  Utah       40921  40.521893 -111.939102           UT
500              Lehi  Utah       54382  40.391617 -111.850766           UT
533             Ogden  Utah       84249  41.223000 -111.973830           UT
568            Layton  Utah       70790  41.060222 -111.971053           UT
603            Murray  Utah       48612  40.666892 -111.887991           UT
630             Sandy  Utah       90231  40.564978 -111.838973           UT
673  W

In [6]:
def generate_random_rows(n=1000, city_state_relation=False, random_state=None):
    """Build ``n`` synthetic address rows.

    Reads the module-level ``all_states_df`` (column ``abbreviation``) and
    ``all_cities_df`` (columns ``city`` and ``abbreviation``).

    Parameters
    ----------
    n : int
        Number of rows to generate.
    city_state_relation : bool
        If True, each row's city is drawn from the cities listed for that
        row's state. If False, cities are drawn from the whole city list,
        independently of the state.
    random_state : numpy.random.RandomState, optional
        Source of randomness; must provide ``choice`` and ``randint``.
        Defaults to NumPy's global generator (the ``np.random`` module
        functions), so ``np.random.seed`` keeps working as before.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``fname``, ``lname`` (6 random lowercase letters
        each), ``state``, ``zip`` (integer in 10000..99999) and ``city``.

    Raises
    ------
    ValueError
        If ``city_state_relation`` is True and a drawn state has no city in
        ``all_cities_df``.
    """
    rng = np.random if random_state is None else random_state
    letters = list('abcdefghijklmnopqrstuvwxyz')

    def random_names(count, name_len=10):
        # Each name is `name_len` letters drawn with replacement.
        return [''.join(rng.choice(letters, name_len, replace=True)) for _ in range(count)]

    new_df = pd.DataFrame()
    new_df['fname'] = random_names(n, 6)
    new_df['lname'] = random_names(n, 6)
    new_df['state'] = rng.choice(all_states_df['abbreviation'].values, n, replace=True)
    # randint's upper bound is exclusive, so 100000 is needed for 99999 to be
    # reachable.
    new_df['zip'] = rng.randint(10000, 100000, n)

    if city_state_relation:
        # Group the city list by state once instead of re-filtering the whole
        # table for every generated row.
        cities_by_state = {
            abbr: cities.values
            for abbr, cities in all_cities_df.groupby('abbreviation')['city']
        }

        def pick_city(abbr):
            candidates = cities_by_state.get(abbr)
            if candidates is None:
                raise ValueError(f"no cities available for state {abbr!r}")
            return rng.choice(candidates, 1)[0]

        new_df['city'] = new_df['state'].apply(pick_city)
    else:
        new_df['city'] = rng.choice(all_cities_df['city'].values, n, replace=True)

    return new_df

# Smoke test: ten synthetic rows whose city is drawn from the row's own state.
print(generate_random_rows(10, city_state_relation=True))

    fname   lname state    zip             city
0  lpiibz  zzktxo    OK  59341     Midwest City
1  qphapk  nfpdvd    OR  85552        Hillsboro
2  ezggop  hoqarg    MA  88635          Chelsea
3  plicgn  rcklra    NM  47561          Roswell
4  lelfnc  ntyvnv    RI  54398  East Providence
5  vvbbol  dcxgzf    CA  94365      San Leandro
6  olwyaj  uhfobs    GA  23152      Johns Creek
7  dhbwoc  dcgiyp    AR  50606       Pine Bluff
8  gexyfs  ctwkdc    TX  90229           Conroe
9  tgzsce  sebjli    TX  33669          Midland


In [7]:
# Pad the real address records with 1000 synthetic rows.
# generate_random_rows draws from NumPy's global generator, so seed it first:
# a fresh "Restart & Run All" then reproduces the same synthetic rows.
np.random.seed(0)
random_df = generate_random_rows(1000)
print(random_df.shape)
# NOTE: this cell is not idempotent - re-running it appends another 1000 rows
# to `df`. Re-run the loading cell first if it has to be executed again.
df = pd.concat([df, random_df], ignore_index=True)
# Cast every column to str (this includes the integer zip codes), so all
# values are handled as strings from here on.
df = df.astype(str)


(1000, 5)


In [8]:
# Peek at ten randomly chosen rows of the combined (real + synthetic) frame.
df.sample(10)

Unnamed: 0,zip,city,state,fname,lname
913,68714,Skokie,IL,zgzjeb,psbpan
220,36699,Troy,CA,djwyzl,aigbif
227,84364,Rancho Cucamonga,NY,zobehv,czrzrx
693,54261,Concord,IN,wfuktm,zkiysv
937,52309,West Palm Beach,NM,alldan,lcmctz
777,59189,State College,WI,sgaxoo,oqgtma
594,56614,Santa Barbara,MN,zttzim,htuloe
137,33990,San Gabriel,MN,pdjfed,totaep
774,60817,La Crosse,ID,wxfnqc,qztdhl
87,76023,Sierra Vista,WY,hzzqcz,hveupr


In [9]:
# Experiment configuration.
# `exp_dim` is handed to HDDB as `dim` - presumably the hypervector
# dimensionality; confirm in HDDB.
exp_dim = 10_000
# VSA model identifier handed to HDDB and to the similarity helper.
# NOTE(review): in torchhd, "BSC" names Binary Spatter Codes - confirm HDDB
# interprets the string the same way.
exp_vsa_type = "BSC"
# Similarity function with the VSA type pre-bound as its first positional
# argument; the remaining arguments are supplied at call time.
sim_func = partial(similarity_func_partial, exp_vsa_type)
# The hyperdimensional store that the following cells fill and query.
hdDB = HDDB(dim=exp_dim, vsa_type=exp_vsa_type)

In [10]:
# Register the dataframe's column labels with the store. Later cells access
# them as hdDB.columns[<label>], which exposes an `atomic_vector` and per-value
# vectors (hdDB.columns[<label>][<value>]).
hdDB.set_columns(df.columns)

In [11]:
# Insert every dataframe row into the store, keyed by its dataframe index.
# After the earlier astype(str) all field values are strings.
for index, row in df.iterrows():
    # Field values in column order, as a NumPy array.
    values = row.values
    hdDB.add_row(index, values)

In [12]:
# Background slides:
# https://docs.google.com/presentation/d/176_bmMfthBgrO6lBczBlgU4PAC59qf8rWUHNmKoVtLo/edit#slide=id.g226b508b046_0_20

# Print the similarity cutoff the store reports for a range of arguments.
# NOTE(review): the argument looks like a probability / confidence level (the
# recorded cutoffs shrink as it grows) - confirm against HDDB.
for cutoff_arg in (0.3, 0.5, 0.6, 0.9, 0.95, 0.99):
    print(hdDB.similiraity_cutoff(cutoff_arg))

0.689930661430611
0.6875
0.6863257047481339
0.6815598495115457
0.6798758993876929
0.67671708901005


In [13]:
# Query: which rows have state == 'NJ'?
# The probe is the 'state' column's atomic vector bound to the vector stored
# for the value 'NJ'.
vec_NJ = hdDB.columns['state']['NJ']
query_vec_NJ = torchhd.bind(hdDB.columns['state'].atomic_vector, vec_NJ)

# Top 200 rows as (row index, similarity) pairs, followed by a blank line.
results = hdDB.most_similar_rows(query_vec_NJ, 200)
print(results, end='\n\n')

# Print the matching dataframe rows in ranked order, without row truncation.
with pd.option_context('display.max_rows', None):
    print(df.iloc[[row_id for row_id, *_ in results]])

[(518, 0.6984999775886536), (337, 0.6970000267028809), (382, 0.6944000124931335), (242, 0.692799985408783), (783, 0.6919000148773193), (814, 0.6916000247001648), (15, 0.6909000277519226), (859, 0.6899999976158142), (356, 0.6886000037193298), (25, 0.6884999871253967), (603, 0.6880999803543091), (544, 0.6877999901771545), (290, 0.6872000098228455), (576, 0.6869000196456909), (811, 0.6866999864578247), (793, 0.6866999864578247), (561, 0.6854000091552734), (328, 0.6851999759674072), (744, 0.6850000023841858), (22, 0.6850000023841858), (587, 0.684499979019165), (229, 0.683899998664856), (717, 0.6837999820709229), (385, 0.6837000250816345), (292, 0.6833999752998352), (6, 0.6833000183105469), (411, 0.6822999715805054), (2, 0.6819999814033508), (0, 0.6819000244140625), (837, 0.6807000041007996), (465, 0.6777999997138977), (875, 0.5138000249862671), (991, 0.5134000182151794), (498, 0.5131999850273132), (533, 0.5123999714851379), (320, 0.5120999813079834), (922, 0.511900007724762), (591, 0.51190

In [14]:
# Query rows whose state is 'NJ': bind the 'state' column's atomic vector to
# the vector stored for 'NJ' and rank the stored rows by similarity.
# NOTE(review): this repeats the query of the preceding cell (same probe, same
# k=200) under different variable names; only the max_rows display override is
# missing. Candidate for removal or for a shared query helper.
vec_state_NJ = hdDB.columns['state']['NJ']
query_vec_state_NJ = torchhd.bind(hdDB.columns['state'].atomic_vector, vec_state_NJ)
# `results` holds (row index, similarity) pairs, as the printed output shows.
results = hdDB.most_similar_rows(query_vec_state_NJ, 200)
print(results)
# Look up the ranked row indices in the dataframe.
print(df.iloc[[t[0] for t in results]])

[(518, 0.6984999775886536), (337, 0.6970000267028809), (382, 0.6944000124931335), (242, 0.692799985408783), (783, 0.6919000148773193), (814, 0.6916000247001648), (15, 0.6909000277519226), (859, 0.6899999976158142), (356, 0.6886000037193298), (25, 0.6884999871253967), (603, 0.6880999803543091), (544, 0.6877999901771545), (290, 0.6872000098228455), (576, 0.6869000196456909), (811, 0.6866999864578247), (793, 0.6866999864578247), (561, 0.6854000091552734), (328, 0.6851999759674072), (744, 0.6850000023841858), (22, 0.6850000023841858), (587, 0.684499979019165), (229, 0.683899998664856), (717, 0.6837999820709229), (385, 0.6837000250816345), (292, 0.6833999752998352), (6, 0.6833000183105469), (411, 0.6822999715805054), (2, 0.6819999814033508), (0, 0.6819000244140625), (837, 0.6807000041007996), (465, 0.6777999997138977), (875, 0.5138000249862671), (991, 0.5134000182151794), (498, 0.5131999850273132), (533, 0.5123999714851379), (320, 0.5120999813079834), (922, 0.511900007724762), (591, 0.51190

In [15]:
# Query rows by zip code: bind the 'zip' column's atomic vector to the vector
# stored for the zip value and rank the stored rows by similarity.
# NOTE(review): the variables are named *_85301 but the value looked up is
# '91234' - the names look like leftovers from an earlier query.
# The key is a string because the dataframe was cast with astype(str).
vec_zip_85301 = hdDB.columns['zip']['91234']
query_vec_zip_85301 = torchhd.bind(hdDB.columns['zip'].atomic_vector, vec_zip_85301)
# Top 200 rows as (row index, similarity) pairs.
results = hdDB.most_similar_rows(query_vec_zip_85301, 200)
print(results)
# Look up the ranked row indices in the dataframe.
print(df.iloc[[t[0] for t in results]])

[(3, 0.683899998664856), (488, 0.5181999802589417), (291, 0.5174999833106995), (384, 0.5163999795913696), (804, 0.5145999789237976), (551, 0.5135999917984009), (541, 0.5133000016212463), (69, 0.5131000280380249), (972, 0.5131000280380249), (26, 0.5130000114440918), (731, 0.5120000243186951), (160, 0.511900007724762), (340, 0.5117999911308289), (128, 0.5115000009536743), (881, 0.5113000273704529), (296, 0.5112000107765198), (788, 0.5110999941825867), (484, 0.5110999941825867), (119, 0.5109999775886536), (745, 0.5109000205993652), (504, 0.5108000040054321), (478, 0.5105999708175659), (171, 0.5105000138282776), (692, 0.5103999972343445), (967, 0.5103999972343445), (903, 0.5102999806404114), (138, 0.5102999806404114), (266, 0.5102999806404114), (477, 0.510200023651123), (904, 0.510200023651123), (307, 0.5101000070571899), (911, 0.5097000002861023), (443, 0.5097000002861023), (938, 0.5097000002861023), (103, 0.5097000002861023), (571, 0.5095999836921692), (174, 0.5095999836921692), (12, 0.5

In [16]:
# Query: which rows have fname == 'John'?
# Probe = the 'fname' column's atomic vector bound to the vector stored for
# the value 'John'.
vec_fname_john = hdDB.columns['fname']['John']
query_vec_fname_john = torchhd.bind(
    hdDB.columns['fname'].atomic_vector,
    vec_fname_john,
)

# Top 200 rows as (row index, similarity) pairs.
results = hdDB.most_similar_rows(query_vec_fname_john, 200)
print(results)

# The same rows, looked up in the dataframe in ranked order.
print(df.iloc[[row_id for row_id, *_ in results]])

[(2, 0.6920999884605408), (0, 0.6869999766349792), (4, 0.6801999807357788), (67, 0.5166000127792358), (933, 0.51419997215271), (312, 0.5134000182151794), (731, 0.5134000182151794), (195, 0.5131999850273132), (310, 0.5121999979019165), (124, 0.5121999979019165), (801, 0.5120000243186951), (903, 0.511900007724762), (808, 0.511900007724762), (826, 0.5117999911308289), (583, 0.5116000175476074), (219, 0.5115000009536743), (231, 0.5113000273704529), (446, 0.5113000273704529), (90, 0.5113000273704529), (237, 0.5112000107765198), (56, 0.5112000107765198), (662, 0.5102999806404114), (782, 0.5102999806404114), (226, 0.510200023651123), (525, 0.510200023651123), (313, 0.510200023651123), (573, 0.5101000070571899), (825, 0.5101000070571899), (774, 0.5101000070571899), (985, 0.5098999738693237), (773, 0.5095000267028809), (973, 0.5095000267028809), (490, 0.5092999935150146), (684, 0.5091999769210815), (175, 0.5091000199317932), (455, 0.5091000199317932), (521, 0.5091000199317932), (831, 0.50900000

In [17]:
# Query rows by last name: bind the 'lname' column's atomic vector to the
# vector stored for the last-name value and rank the stored rows.
# NOTE(review): the variables are named *_repici but the value looked up is
# 'Blankman'. `query_vec_lastname_repici` is reused by the bundled query in a
# later cell, so that query also searches for 'Blankman' - confirm which name
# was intended.
vec_lastname_repici = hdDB.columns['lname']['Blankman']
query_vec_lastname_repici = torchhd.bind(hdDB.columns['lname'].atomic_vector, vec_lastname_repici)
# Top 10 rows as (row index, similarity) pairs.
results = hdDB.most_similar_rows(query_vec_lastname_repici, 10)
print(results)
# Look up the ranked row indices in the dataframe.
print(df.iloc[[t[0] for t in results]])

[(4, 0.6848000288009644), (811, 0.517799973487854), (22, 0.5160999894142151), (871, 0.5149999856948853), (110, 0.5149000287055969), (164, 0.51419997215271), (255, 0.513700008392334), (540, 0.5133000016212463), (209, 0.5133000016212463), (411, 0.5131999850273132)]
       zip           city state   fname     lname
4      298    Sioux Falls    SD    John  Blankman
811  27485         Corona    NJ  htudrq    vbzkel
22   99772          Logan    NJ  fiprhh    ujxwzr
871  76102          Logan    NH  wcrrgf    hadhli
110  93819  Downers Grove    NC  xqacaq    qnalur
164  96747    North Miami    NH  zleugo    pvznnp
255  37508       El Cajon    DC  yeghep    nnuybb
540  32313     Alexandria    MI  obncsc    yfcbzb
209  90029       Pacifica    WY  wvrqoy    xgqukm
411  64677       Danville    NJ  dmrkax    owtvad


In [18]:
# Row 6 of the address table is "Jack Repici"; hdDB[6] is presumably the
# hypervector stored for that row - confirm HDDB.__getitem__.
jack_repici_vec = hdDB[6]

# unbinding to obtain the state that Jack Repici lives in
jack_repici_state_vec = torchhd.bind(jack_repici_vec, torchhd.inverse(hdDB.columns['state'].atomic_vector))
# The unbound vector should resemble the vector of the row's actual state and
# be unrelated to any other state's vector (recorded run: NJ 0.6833, PA 0.4947).
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['NJ']))
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['PA']))

# Build a combined probe (recovered state + last name) and search with it.
# NOTE(review): this binds the unbound vector with the same 'state' atomic
# vector that was just removed. If bind and inverse cancel exactly (torchhd's
# BSC bind is presumably XOR, i.e. self-inverse - confirm), the result is
# simply jack_repici_vec again, i.e. the whole row rather than a clean
# "state == NJ" probe. That would explain why, in the recorded output, row 6
# scores 0.75 and rows sharing other fields with it rank next.
query_vec_jack_repici_state = torchhd.bind(hdDB.columns['state'].atomic_vector, jack_repici_state_vec)
# NOTE(review): `query_vec_lastname_repici` comes from an earlier cell, where
# it is built from the 'Blankman' value vector despite its name.
query_vec = torchhd.bundle(query_vec_jack_repici_state, query_vec_lastname_repici)
# Top 10 rows as (row index, similarity) pairs.
results = hdDB.most_similar_rows(query_vec, 10)
print(results)
# Look up the ranked row indices in the dataframe.
print(df.iloc[[t[0] for t in results]])

BSCTensor(0.6833)
BSCTensor(0.4947)
[(6, 0.7511000037193298), (2, 0.6535999774932861), (0, 0.6115000247955322), (4, 0.5967000126838684), (242, 0.5429999828338623), (995, 0.5429999828338623), (290, 0.5429999828338623), (915, 0.5421000123023987), (382, 0.5418000221252441), (811, 0.5414999723434448)]
       zip         city state   fname     lname
6     8075    Riverside    NJ    Jack    Repici
2     8075    Riverside    NJ    John    Repici
0     8075    Riverside    NJ    John       Doe
4      298  Sioux Falls    SD    John  Blankman
242  86125  Springfield    NJ  rvkjko    vbckgu
995  33664    Riverside    MA  uuphde    klvoei
290  94066      Jackson    NJ  auqaaq    zcpjsi
915  66749    Riverside    HI  bunpkb    ygqtgo
382  44426   San Rafael    NJ  nzgvcv    bhfypo
811  27485       Corona    NJ  htudrq    vbzkel


In [19]:
import math

def theoretical_similarity(amount_bundled):
    """Expected similarity between a majority bundle and one of its inputs.

    For an odd number ``n`` of independent random binary vectors combined by
    bitwise majority, the probability that a bit of the bundle agrees with
    the same bit of any one input is::

        1/2 + C(n - 1, (n - 1) / 2) / 2**n

    Parameters
    ----------
    amount_bundled : int
        Number of bundled vectors (>= 1). An even value is rounded up to the
        next odd number, because the formula needs an odd count.

    Returns
    -------
    float
        Expected similarity, between 0.5 and 1.0.

    Raises
    ------
    ValueError
        If ``amount_bundled`` is smaller than 1.
    """
    if amount_bundled < 1:
        raise ValueError("amount_bundled must be at least 1")

    # +1 on even input because an odd number of elements is needed.
    n = amount_bundled if amount_bundled % 2 else amount_bundled + 1
    central_binomial = math.comb(n - 1, (n - 1) // 2)
    # Divide the exact integers: converting the binomial to float first
    # overflows once it exceeds the float range (n of roughly 1031 and up).
    return 0.5 + central_binomial / 2 ** n


# Expected similarity for bundles of 3, 5 and 7 vectors.
for bundle_size in (3, 5, 7):
    print(theoretical_similarity(bundle_size))

0.75
0.6875
0.65625
