In [1]:
import torchhd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from HDDB import HDDB
from utils import similarity_func_partial
from functools import partial

### Loading the dataset

In [2]:
# Read the address table, trim surrounding whitespace from every text
# (object-dtype) column, and discard the street address column.
df = (
    pd.read_csv('data/addresses.csv')
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    .drop(columns=['address'])
)
print(df.dtypes)
# Put the location fields first, followed by the person's name.
df = df.loc[:, ['zip', 'city', 'state', 'fname', 'lname']]
df

fname    object
lname    object
city     object
state    object
zip       int64
dtype: object


Unnamed: 0,zip,city,state,fname,lname
0,8075,Riverside,NJ,John,Doe
1,9119,Philadelphia,PA,Jack,McGinnis
2,8075,Riverside,NJ,John,Repici
3,91234,Sioux Falls,SD,Stephen,Tyler
4,298,Sioux Falls,SD,John,Blankman
5,123,Denver,CO,Joan,Anne
6,8075,Riverside,NJ,Jack,Repici
7,9119,Philadelphia,PA,Lilly,Repici


### Extending the dataset

In [3]:
# Reference table of US states; header names are normalised to stripped lower case.
all_states_df = pd.read_csv('data/states.csv')
all_states_df.columns = all_states_df.columns.str.strip().str.lower()
print(len(all_states_df['abbreviation'].unique()))

# Reference table of cities, normalised the same way.
all_cities_df = pd.read_csv('data/us-cities-top-1k.csv')
all_cities_df.columns = all_cities_df.columns.str.strip().str.lower()

# Attach each city's state abbreviation with one vectorised lookup instead of
# re-filtering the states table for every city row. When a state name occurs
# more than once, the first abbreviation wins (same as the previous `.values[0]`).
state_to_abbreviation = (
    all_states_df
    .drop_duplicates(subset='state')
    .set_index('state')['abbreviation']
)
all_cities_df['abbreviation'] = all_cities_df['state'].map(state_to_abbreviation)

# Fail loudly, naming the offending states, rather than with a bare IndexError.
unmapped_states = all_cities_df.loc[all_cities_df['abbreviation'].isna(), 'state'].unique()
if len(unmapped_states) > 0:
    raise ValueError(f"No abbreviation found for state(s): {list(unmapped_states)}")

print(len(all_cities_df['abbreviation'].unique()))
# Spot check: every city row mapped to Utah.
print(all_cities_df[all_cities_df['abbreviation'] == 'UT'])

51
51
                 city state  population        lat         lon abbreviation
31              Logan  Utah       48913  41.736980 -111.833836           UT
193      South Jordan  Utah       59366  40.562170 -111.929658           UT
285         Bountiful  Utah       43023  40.889390 -111.880771           UT
381             Provo  Utah      116288  40.233844 -111.658534           UT
434      Spanish Fork  Utah       36956  40.114955 -111.654923           UT
440      Taylorsville  Utah       60519  40.667725 -111.938826           UT
466          Riverton  Utah       40921  40.521893 -111.939102           UT
500              Lehi  Utah       54382  40.391617 -111.850766           UT
533             Ogden  Utah       84249  41.223000 -111.973830           UT
568            Layton  Utah       70790  41.060222 -111.971053           UT
603            Murray  Utah       48612  40.666892 -111.887991           UT
630             Sandy  Utah       90231  40.564978 -111.838973           UT
673  W

In [4]:
def generate_random_rows(n=1000, city_state_relation=False, seed=None):
    """Build a DataFrame of ``n`` synthetic address rows.

    Columns, in order: ``fname`` and ``lname`` (random 6-letter lowercase
    strings), ``state`` (drawn from ``all_states_df['abbreviation']``),
    ``zip`` (random integer in 10000..99999 inclusive) and ``city`` (drawn
    from ``all_cities_df``).

    Parameters
    ----------
    n : int
        Number of rows to generate.
    city_state_relation : bool
        If True, each city is drawn only from the cities listed for that
        row's state; otherwise cities are drawn independently of the state.
    seed : int or None
        If given, a private ``numpy.random.RandomState(seed)`` is used so the
        result is reproducible. If None (default), the global ``np.random``
        state is used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The generated rows.

    Raises
    ------
    ValueError
        If ``city_state_relation`` is True and a sampled state has no city
        in ``all_cities_df``.
    """
    # RandomState exposes the same choice/randint API as the np.random module,
    # so both branches can be driven through a single `rng` handle.
    rng = np.random if seed is None else np.random.RandomState(seed)
    alphabet = list('abcdefghijklmnopqrstuvwxyz')  # built once, not per name

    def random_names(count, name_len=10):
        """Return ``count`` random lowercase strings of length ``name_len``."""
        return [''.join(rng.choice(alphabet, name_len, replace=True)) for _ in range(count)]

    new_df = pd.DataFrame()
    new_df['fname'] = random_names(n, 6)
    new_df['lname'] = random_names(n, 6)
    new_df['state'] = rng.choice(all_states_df['abbreviation'].values, n, replace=True)
    # randint's upper bound is exclusive: 100000 makes 99999 reachable.
    new_df['zip'] = rng.randint(10000, 100000, n)
    if city_state_relation:
        # Group the cities once instead of filtering the full table per row.
        cities_by_state = {
            abbreviation: cities.values
            for abbreviation, cities in all_cities_df.groupby('abbreviation')['city']
        }

        def pick_city(abbreviation):
            """Draw one city belonging to the given state abbreviation."""
            candidates = cities_by_state.get(abbreviation)
            if candidates is None:
                raise ValueError(f"No cities available for state {abbreviation!r}")
            return rng.choice(candidates, 1)[0]

        new_df['city'] = new_df['state'].apply(pick_city)
    else:
        new_df['city'] = rng.choice(all_cities_df['city'].values, n, replace=True)

    return new_df

# Preview ten synthetic rows whose city is drawn from the row's own state.
sample_rows = generate_random_rows(10, city_state_relation=True)
print(sample_rows)

    fname   lname state    zip               city
0  mxpcat  xhmyeg    IL  83572             Urbana
1  ajlesi  hyixgi    KY  85257  Lexington-Fayette
2  hyqpkn  jnkfhq    HI  32624           Honolulu
3  klcczq  cietvm    WI  94184          La Crosse
4  qfhmkn  ibqgvz    UT  74930              Provo
5  mhljnk  anyvdz    TN  65402          Knoxville
6  gwvhvq  vzwtys    AK  37002          Anchorage
7  qobaxp  upxdli    UT  68626        West Jordan
8  muehts  conicm    FL  16212         Greenacres
9  phioxd  grzqcj    OH  37112        Springfield


In [5]:
# Pad the address table with 1000 synthetic rows (city drawn independently of state).
# NOTE: re-running this cell appends another 1000 rows to `df`.
random_df = generate_random_rows(n=1000)
print(random_df.shape)
# concat aligns on column labels, so the differing column order is harmless;
# afterwards every column is cast to str.
df = pd.concat([df, random_df], ignore_index=True).astype(str)


(1000, 5)


In [6]:
# Eyeball ten randomly chosen rows of the combined table.
df.sample(n=10)

Unnamed: 0,zip,city,state,fname,lname
62,58972,West Covina,SD,nvqrxh,qwprfx
143,24284,Findlay,FL,ovfawg,ecrnpr
327,28798,Chandler,WA,mslonf,togzcy
266,72061,Stockton,TN,dxaxbb,jqczmj
378,84210,Chandler,OK,ekkurv,bfyeqb
554,62144,Salem,NM,qzdemr,fbotnb
896,94116,Mansfield,NJ,ksassf,ckuykg
955,48352,Newport Beach,ID,niyjmb,cizdey
265,43236,Shreveport,IN,hkxwdw,oaqxiu
757,14098,Rochester Hills,NC,cgwfnt,ewsgbc


In [7]:
# Experiment configuration: hypervector dimensionality and VSA model name.
exp_dim = 10000
exp_vsa_type = "BSC"

# Database instance built for that configuration.
hdDB = HDDB(dim=exp_dim, vsa_type=exp_vsa_type)
# Similarity helper with the VSA type pre-bound as its first positional argument.
sim_func = partial(similarity_func_partial, exp_vsa_type)

In [8]:
# Register the DataFrame's column labels with the database before any rows are added.
# NOTE(review): presumably this creates one entry per column in `hdDB.columns` — confirm in HDDB.
hdDB.set_columns(df.columns)

In [9]:
# Insert every DataFrame row into the database, keyed by its index label.
for row_id, record in df.iterrows():
    hdDB.add_row(row_id, record.values)

In [18]:
# Reference slides:
# https://docs.google.com/presentation/d/176_bmMfthBgrO6lBczBlgU4PAC59qf8rWUHNmKoVtLo/edit#slide=id.g226b508b046_0_20

# Print the cutoff returned by the database for a range of argument values.
for cutoff_arg in (0.3, 0.5, 0.6, 0.9, 0.95, 0.99):
    print(hdDB.similiraity_cutoff(cutoff_arg))

0.689930661430611
0.6875
0.6863257047481339
0.6815598495115457
0.6798758993876929
0.67671708901005


In [11]:
# Query: which rows have state == 'NJ'?
state_column = hdDB.columns['state']
vec_NJ = state_column['NJ']
# The query is the column's atomic vector bound with the value vector.
query_vec_NJ = torchhd.bind(state_column.atomic_vector, vec_NJ)
results = hdDB.most_similar_rows(query_vec_NJ, 200)
print(results)
print()
# `results` holds (row index, similarity) pairs; look the rows up by position
# and lift the row cap so the whole selection is printed.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
with pd.option_context('display.max_rows', None):
    print(matched_rows)

[(205, 0.6974999904632568), (764, 0.6948000192642212), (875, 0.6922000050544739), (896, 0.6920999884605408), (127, 0.6908000111579895), (579, 0.6906999945640564), (383, 0.6901999711990356), (918, 0.6891000270843506), (814, 0.6884999871253967), (36, 0.6880999803543091), (196, 0.6868000030517578), (297, 0.6868000030517578), (712, 0.6866999864578247), (980, 0.6863999962806702), (169, 0.6855999827384949), (601, 0.6837000250816345), (831, 0.6836000084877014), (2, 0.6836000084877014), (240, 0.6834999918937683), (834, 0.6819999814033508), (6, 0.6819999814033508), (0, 0.6819000244140625), (134, 0.6819000244140625), (246, 0.6798999905586243), (490, 0.6791999936103821), (757, 0.5170999765396118), (674, 0.5148000121116638), (172, 0.5145000219345093), (412, 0.5134000182151794), (128, 0.5133000016212463), (830, 0.5131000280380249), (56, 0.5131000280380249), (845, 0.5127999782562256), (584, 0.5126000046730042), (840, 0.512499988079071), (969, 0.512499988079071), (912, 0.5120000243186951), (870, 0.51

In [12]:
# Query: which rows have state == 'NJ'? (printed with pandas' default row truncation)
state_column = hdDB.columns['state']
vec_state_NJ = state_column['NJ']
# The query is the column's atomic vector bound with the value vector.
query_vec_state_NJ = torchhd.bind(state_column.atomic_vector, vec_state_NJ)
results = hdDB.most_similar_rows(query_vec_state_NJ, 200)
print(results)
# `results` holds (row index, similarity) pairs; show the matching rows.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
print(matched_rows)

[(205, 0.6974999904632568), (764, 0.6948000192642212), (875, 0.6922000050544739), (896, 0.6920999884605408), (127, 0.6908000111579895), (579, 0.6906999945640564), (383, 0.6901999711990356), (918, 0.6891000270843506), (814, 0.6884999871253967), (36, 0.6880999803543091), (196, 0.6868000030517578), (297, 0.6868000030517578), (712, 0.6866999864578247), (980, 0.6863999962806702), (169, 0.6855999827384949), (601, 0.6837000250816345), (831, 0.6836000084877014), (2, 0.6836000084877014), (240, 0.6834999918937683), (834, 0.6819999814033508), (6, 0.6819999814033508), (0, 0.6819000244140625), (134, 0.6819000244140625), (246, 0.6798999905586243), (490, 0.6791999936103821), (757, 0.5170999765396118), (674, 0.5148000121116638), (172, 0.5145000219345093), (412, 0.5134000182151794), (128, 0.5133000016212463), (830, 0.5131000280380249), (56, 0.5131000280380249), (845, 0.5127999782562256), (584, 0.5126000046730042), (840, 0.512499988079071), (969, 0.512499988079071), (912, 0.5120000243186951), (870, 0.51

In [13]:
# Query: which rows have zip == '91234'?
# NOTE(review): the variable names say 85301 but the key looked up is '91234';
# the names are kept unchanged in case other cells rely on them.
zip_column = hdDB.columns['zip']
vec_zip_85301 = zip_column['91234']
# The query is the column's atomic vector bound with the value vector.
query_vec_zip_85301 = torchhd.bind(zip_column.atomic_vector, vec_zip_85301)
results = hdDB.most_similar_rows(query_vec_zip_85301, 200)
print(results)
# `results` holds (row index, similarity) pairs; show the matching rows.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
print(matched_rows)

[(3, 0.6917999982833862), (834, 0.5163000226020813), (400, 0.5148000121116638), (883, 0.5139999985694885), (50, 0.5133000016212463), (875, 0.5133000016212463), (551, 0.5131000280380249), (164, 0.5130000114440918), (1001, 0.512499988079071), (209, 0.5120000243186951), (679, 0.5120000243186951), (76, 0.5120000243186951), (304, 0.511900007724762), (241, 0.5117999911308289), (307, 0.5116000175476074), (984, 0.5115000009536743), (958, 0.5115000009536743), (563, 0.5110999941825867), (406, 0.5109999775886536), (899, 0.5109999775886536), (183, 0.5109999775886536), (718, 0.5108000040054321), (501, 0.5105999708175659), (423, 0.5105000138282776), (218, 0.510200023651123), (228, 0.5101000070571899), (206, 0.5101000070571899), (483, 0.5099999904632568), (915, 0.5098999738693237), (584, 0.5098999738693237), (127, 0.5098999738693237), (654, 0.5098000168800354), (906, 0.5098000168800354), (302, 0.5098000168800354), (176, 0.5098000168800354), (385, 0.5098000168800354), (938, 0.5097000002861023), (376, 

In [14]:
# Query: which rows have fname == 'John'?
fname_column = hdDB.columns['fname']
vec_fname_john = fname_column['John']
# The query is the column's atomic vector bound with the value vector.
query_vec_fname_john = torchhd.bind(fname_column.atomic_vector, vec_fname_john)
results = hdDB.most_similar_rows(query_vec_fname_john, 200)
print(results)
# `results` holds (row index, similarity) pairs; show the matching rows.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
print(matched_rows)

[(0, 0.6908000111579895), (2, 0.6897000074386597), (4, 0.6869000196456909), (254, 0.5169000029563904), (71, 0.5141000151634216), (588, 0.5141000151634216), (675, 0.5131999850273132), (896, 0.5130000114440918), (620, 0.5127000212669373), (958, 0.5123000144958496), (390, 0.5121999979019165), (885, 0.5116000175476074), (525, 0.5112000107765198), (292, 0.5110999941825867), (817, 0.5110999941825867), (849, 0.5109999775886536), (528, 0.510699987411499), (665, 0.5105999708175659), (228, 0.5105000138282776), (789, 0.5105000138282776), (987, 0.5103999972343445), (796, 0.5099999904632568), (329, 0.5098999738693237), (65, 0.5098999738693237), (103, 0.5098999738693237), (110, 0.5098999738693237), (30, 0.5098000168800354), (317, 0.5097000002861023), (862, 0.5097000002861023), (130, 0.5095999836921692), (700, 0.5095000267028809), (842, 0.5095000267028809), (31, 0.5092999935150146), (284, 0.5091999769210815), (1003, 0.5091000199317932), (575, 0.5091000199317932), (100, 0.5091000199317932), (238, 0.50

In [15]:
# Query: which rows have lname == 'Blankman'?
# NOTE(review): the variable names say "repici" but the key looked up is
# 'Blankman'. `query_vec_lastname_repici` is reused by the Jack Repici cell
# further down, so confirm which last name is actually intended.
lname_column = hdDB.columns['lname']
vec_lastname_repici = lname_column['Blankman']
# The query is the column's atomic vector bound with the value vector.
query_vec_lastname_repici = torchhd.bind(lname_column.atomic_vector, vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec_lastname_repici, 10)
print(results)
# `results` holds (row index, similarity) pairs; show the matching rows.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
print(matched_rows)

[(4, 0.6881999969482422), (589, 0.5163000226020813), (363, 0.5149000287055969), (308, 0.5146999955177307), (687, 0.51419997215271), (798, 0.5139999985694885), (983, 0.513700008392334), (944, 0.5131999850273132), (193, 0.5131999850273132), (774, 0.5127999782562256)]
       zip              city state   fname     lname
4      298       Sioux Falls    SD    John  Blankman
589  88246           Hanford    VA  hutogm    zjlxhf
363  60139          Goodyear    AK  qihkea    sajhes
308  85980              Yuma    NH  gbqtrr    cixxuz
687  18474  Farmington Hills    WI  gwsmdh    tfwgvh
798  74365     Miami Gardens    PA  wfjuwx    qsiehe
983  32825    Port St. Lucie    WI  dvdfuq    pkqciz
944  16748      Ormond Beach    ND  mlzvxu    ekhkwb
193  55047       Baton Rouge    VA  ikpuhs    drmjgj
774  39363        Scottsdale    OH  jytfvm    cmokqb


In [16]:
# Stored vector for row 6 (Jack Repici in the address table).
jack_repici_vec = hdDB[6]

# Unbind the state column's atomic vector to estimate the state Jack Repici
# lives in, then compare that estimate against two candidate states.
state_role = hdDB.columns['state'].atomic_vector
jack_repici_state_vec = torchhd.bind(jack_repici_vec, torchhd.inverse(state_role))
for state_code in ('NJ', 'PA'):
    print(sim_func(jack_repici_state_vec, hdDB.columns['state'][state_code]))

# Combined query: the recovered state (re-bound to the state column) bundled
# with the last-name query built in an earlier cell.
query_vec_jack_repici_state = torchhd.bind(state_role, jack_repici_state_vec)
query_vec = torchhd.bundle(query_vec_jack_repici_state, query_vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec, 10)
print(results)
# `results` holds (row index, similarity) pairs; show the matching rows.
matched_rows = df.iloc[[row_id for row_id, _score in results]]
print(matched_rows)

BSCTensor(0.6820)
BSCTensor(0.5011)
[(6, 0.7552000284194946), (2, 0.6656000018119812), (0, 0.611299991607666), (4, 0.5907999873161316), (980, 0.5428000092506409), (918, 0.5425000190734863), (875, 0.5375999808311462), (36, 0.5371000170707703), (169, 0.5364000201225281), (383, 0.5361999869346619)]
       zip         city state   fname     lname
6     8075    Riverside    NJ    Jack    Repici
2     8075    Riverside    NJ    John    Repici
0     8075    Riverside    NJ    John       Doe
4      298  Sioux Falls    SD    John  Blankman
980  63187    Annapolis    NJ  obfnxq    emhqwh
918  81564       Joplin    NJ  kmihgj    lkmnlv
875  89187    Littleton    NJ  yghval    nskwnl
36   94775  Sioux Falls    NJ  odccur    hukwza
169  79782    Lancaster    NJ  ytjujj    mwravs
383  88908     Columbia    NJ  ykslly    wntxrw


In [17]:
import math

def theoretical_similarity(amount_bundled):
    """Return 1/2 + C(n-1, (n-1)/2) / 2**n for the given bundle size.

    ``n`` is ``amount_bundled`` rounded up to the next odd number, because
    the formula needs an odd number of elements.

    Parameters
    ----------
    amount_bundled : int
        Number of bundled elements (non-negative).

    Returns
    -------
    float
        The expected similarity, e.g. 0.75 for 3 and 0.6875 for 5.

    Raises
    ------
    ValueError
        If ``amount_bundled`` is negative.
    """
    if amount_bundled < 0:
        raise ValueError("amount_bundled must be non-negative")
    # +1 because we need an odd amount of elements
    odd_count = amount_bundled + 1 if amount_bundled % 2 == 0 else amount_bundled
    central_binomial = math.comb(odd_count - 1, (odd_count - 1) // 2)
    # Exact integer true division: multiplying the huge binomial by a float
    # (.5**n) raises OverflowError once the count exceeds roughly a thousand.
    return 1 / 2 + central_binomial / 2 ** odd_count


# Expected similarity for bundles of 3, 5 and 7 elements.
for bundle_size in (3, 5, 7):
    print(theoretical_similarity(bundle_size))

0.75
0.6875
0.65625
