In [20]:
import torchhd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from HDDB import HDDB
from functools import partial

In [21]:
import sys
sys.path.append('../')
from shared_code.helpers import similarity_func_partial

### Loading the dataset

In [22]:
df = pd.read_csv('data/addresses.csv')
# strip whitespace from all values
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# df = df.drop(columns=['address', 'zip'])
df = df.drop(columns=['address'])
print(df.dtypes)
# change order of columns
df = df[['zip', 'city', 'state' , 'fname', 'lname']]
df

fname    object
lname    object
city     object
state    object
zip       int64
dtype: object


Unnamed: 0,zip,city,state,fname,lname
0,8075,Riverside,NJ,John,Doe
1,9119,Philadelphia,PA,Jack,McGinnis
2,8075,Riverside,NJ,John,Repici
3,91234,Sioux Falls,SD,Stephen,Tyler
4,298,Sioux Falls,SD,John,Blankman
5,123,Denver,CO,Joan,Anne
6,8075,Riverside,NJ,Jack,Repici
7,9119,Philadelphia,PA,Lilly,Repici


### Extending the dataset

In [23]:
all_states_df = pd.read_csv('data/states.csv')
all_states_df.columns = all_states_df.columns.str.strip().str.lower()
print(len(all_states_df['abbreviation'].unique()))
all_cities_df = pd.read_csv('data/us-cities-top-1k.csv')
all_cities_df.columns = all_cities_df.columns.str.strip().str.lower()


all_cities_df['abbreviation'] = all_cities_df['state'].apply(lambda x: all_states_df[all_states_df['state'] == x]['abbreviation'].values[0])
print(len(all_cities_df['abbreviation'].unique()))
all_cities_df.head()
print(all_cities_df[all_cities_df['abbreviation'] == 'UT'])

51
51
                 city state  population        lat         lon abbreviation
31              Logan  Utah       48913  41.736980 -111.833836           UT
193      South Jordan  Utah       59366  40.562170 -111.929658           UT
285         Bountiful  Utah       43023  40.889390 -111.880771           UT
381             Provo  Utah      116288  40.233844 -111.658534           UT
434      Spanish Fork  Utah       36956  40.114955 -111.654923           UT
440      Taylorsville  Utah       60519  40.667725 -111.938826           UT
466          Riverton  Utah       40921  40.521893 -111.939102           UT
500              Lehi  Utah       54382  40.391617 -111.850766           UT
533             Ogden  Utah       84249  41.223000 -111.973830           UT
568            Layton  Utah       70790  41.060222 -111.971053           UT
603            Murray  Utah       48612  40.666892 -111.887991           UT
630             Sandy  Utah       90231  40.564978 -111.838973           UT
673  W

In [24]:
def generate_random_rows(n=1000, city_state_relation=False):
    
    def random_names(n, name_len=10):
        return [''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), name_len, replace=True)) for i in range(n)]
    
    new_df = pd.DataFrame()
    new_df['fname'] = random_names(n,6)
    new_df['lname'] = random_names(n,6)
    new_df['state'] = np.random.choice(all_states_df['abbreviation'].values, n, replace=True)
    new_df['zip'] = np.random.randint(10000, 99999, n)
    if city_state_relation:
        new_df['city'] = new_df['state'].apply(lambda x: np.random.choice(all_cities_df[all_cities_df['abbreviation'] == x]['city'].values, 1)[0])
    else:
        new_df['city'] = np.random.choice(all_cities_df['city'].values, n, replace=True)

    return new_df

print(generate_random_rows(10, city_state_relation=True))

    fname   lname state    zip            city
0  jwcesb  smcxgh    IA  52612  Council Bluffs
1  axowcc  snkkns    HI  57650        Honolulu
2  urmuil  xmzlws    NY  42092         Yonkers
3  gbryzh  xtvmjb    LA  46925     New Orleans
4  qkpyib  mtvhib    AL  21841         Madison
5  twiflz  qdzlfp    MO  30283      St. Joseph
6  mybamm  pmlngo    AK  41481       Anchorage
7  kuofvx  zxcydn    LA  33050     New Orleans
8  bsqruf  mlydxf    NJ  90740         Trenton
9  oqirgp  omxuta    CA  79231          Corona


In [25]:
# add random rows to the dataframe
random_df = generate_random_rows(1000)
print(random_df.shape)
df = pd.concat([df, random_df], ignore_index=True)
#set all column types to object
df = df.astype(str)


(1000, 5)


In [26]:
df.sample(10)

Unnamed: 0,zip,city,state,fname,lname
864,36422,San Gabriel,CT,jcbrpq,pdfbaq
419,44471,La Mesa,HI,cqzyrs,jlekfd
910,47965,Middletown,RI,npgjdb,hdlnrj
867,61442,Lenexa,UT,tzuxib,cdgusj
603,53667,Knoxville,CA,bxrtag,mqyrsf
824,49969,Yucaipa,WA,sneclg,sylnom
704,74584,Lubbock,SC,uxyuwx,uqdagl
534,64997,Kearny,MA,vxkskj,anptma
158,80015,Euless,CA,ixcdua,jnlsyw
841,30601,Worcester,ME,evheft,gctakn


In [27]:
exp_dim = 10_000
exp_vsa_type = "BSC"
sim_func = partial(similarity_func_partial, exp_vsa_type)
hdDB = HDDB(dim=exp_dim, vsa_type=exp_vsa_type)

In [28]:
hdDB.set_columns(df.columns)

In [29]:
for index, row in df.iterrows():
    values = row.values
    hdDB.add_row(index, values)

In [30]:
#https://docs.google.com/presentation/d/176_bmMfthBgrO6lBczBlgU4PAC59qf8rWUHNmKoVtLo/edit#slide=id.g226b508b046_0_20

print(hdDB.similiraity_cutoff(0.3))
print(hdDB.similiraity_cutoff(0.5))
print(hdDB.similiraity_cutoff(0.6))
print(hdDB.similiraity_cutoff(0.9))
print(hdDB.similiraity_cutoff(0.95))
print(hdDB.similiraity_cutoff(0.99))

0.689930661430611
0.6875
0.6863257047481339
0.6815598495115457
0.6798758993876929
0.67671708901005


In [31]:
vec_NJ = hdDB.columns['state']['NJ']
query_vec_NJ = torchhd.bind(hdDB.columns['state'].atomic_vector, vec_NJ)
results = hdDB.most_similar_rows(query_vec_NJ, 200)
print(results)
print()
with pd.option_context('display.max_rows', None,):
  print(df.iloc[[t[0] for t in results]])

[(148, 0.6974999904632568), (921, 0.6933000087738037), (883, 0.6909999847412109), (369, 0.6906999945640564), (885, 0.6901000142097473), (545, 0.6898000240325928), (617, 0.6898000240325928), (888, 0.6895999908447266), (9, 0.6887000203132629), (233, 0.6886000037193298), (370, 0.6881999969482422), (271, 0.6873000264167786), (6, 0.6872000098228455), (410, 0.6870999932289124), (455, 0.6850000023841858), (740, 0.6844000220298767), (357, 0.6832000017166138), (634, 0.682200014591217), (2, 0.6816999912261963), (869, 0.6805999875068665), (0, 0.6769000291824341), (142, 0.5145999789237976), (442, 0.5145000219345093), (614, 0.5144000053405762), (931, 0.5142999887466431), (671, 0.5141000151634216), (27, 0.5138999819755554), (256, 0.513700008392334), (184, 0.5131999850273132), (989, 0.5123999714851379), (1003, 0.5123999714851379), (906, 0.5121999979019165), (183, 0.5120999813079834), (334, 0.511900007724762), (156, 0.511900007724762), (400, 0.5112000107765198), (68, 0.5110999941825867), (516, 0.51059

In [32]:
vec_state_NJ = hdDB.columns['state']['NJ']
query_vec_state_NJ = torchhd.bind(hdDB.columns['state'].atomic_vector, vec_state_NJ)
results = hdDB.most_similar_rows(query_vec_state_NJ, 200)
print(results)
print(df.iloc[[t[0] for t in results]])

[(148, 0.6974999904632568), (921, 0.6933000087738037), (883, 0.6909999847412109), (369, 0.6906999945640564), (885, 0.6901000142097473), (545, 0.6898000240325928), (617, 0.6898000240325928), (888, 0.6895999908447266), (9, 0.6887000203132629), (233, 0.6886000037193298), (370, 0.6881999969482422), (271, 0.6873000264167786), (6, 0.6872000098228455), (410, 0.6870999932289124), (455, 0.6850000023841858), (740, 0.6844000220298767), (357, 0.6832000017166138), (634, 0.682200014591217), (2, 0.6816999912261963), (869, 0.6805999875068665), (0, 0.6769000291824341), (142, 0.5145999789237976), (442, 0.5145000219345093), (614, 0.5144000053405762), (931, 0.5142999887466431), (671, 0.5141000151634216), (27, 0.5138999819755554), (256, 0.513700008392334), (184, 0.5131999850273132), (989, 0.5123999714851379), (1003, 0.5123999714851379), (906, 0.5121999979019165), (183, 0.5120999813079834), (334, 0.511900007724762), (156, 0.511900007724762), (400, 0.5112000107765198), (68, 0.5110999941825867), (516, 0.51059

In [33]:
vec_zip_85301 = hdDB.columns['zip']['91234']
query_vec_zip_85301 = torchhd.bind(hdDB.columns['zip'].atomic_vector, vec_zip_85301)
results = hdDB.most_similar_rows(query_vec_zip_85301, 200)
print(results)
print(df.iloc[[t[0] for t in results]])

[(3, 0.6923999786376953), (505, 0.5156999826431274), (777, 0.5156000256538391), (573, 0.5145999789237976), (16, 0.5144000053405762), (342, 0.5133000016212463), (130, 0.5131999850273132), (560, 0.5130000114440918), (68, 0.5128999948501587), (719, 0.5128999948501587), (576, 0.5127999782562256), (174, 0.5126000046730042), (945, 0.5120000243186951), (104, 0.5116999745368958), (563, 0.5115000009536743), (725, 0.5113999843597412), (12, 0.5112000107765198), (452, 0.5112000107765198), (468, 0.5109000205993652), (479, 0.5102999806404114), (478, 0.5099999904632568), (772, 0.5098000168800354), (749, 0.5097000002861023), (114, 0.5095000267028809), (436, 0.5092999935150146), (47, 0.5090000033378601), (512, 0.508899986743927), (743, 0.508899986743927), (519, 0.5087000131607056), (442, 0.5087000131607056), (870, 0.5087000131607056), (586, 0.5085999965667725), (447, 0.5084999799728394), (632, 0.5084999799728394), (695, 0.5083000063896179), (120, 0.5081999897956848), (769, 0.5081999897956848), (40, 0.5

In [34]:
vec_fname_john = hdDB.columns['fname']['John']
query_vec_fname_john = torchhd.bind(hdDB.columns['fname'].atomic_vector, vec_fname_john)
results = hdDB.most_similar_rows(query_vec_fname_john, 200)
print(results)
print(df.iloc[[t[0] for t in results]])

[(0, 0.6908000111579895), (2, 0.6901999711990356), (4, 0.6845999956130981), (932, 0.5177000164985657), (787, 0.5130000114440918), (589, 0.5127000212669373), (688, 0.512499988079071), (274, 0.5121999979019165), (721, 0.5120999813079834), (36, 0.5113000273704529), (214, 0.5110999941825867), (767, 0.5109000205993652), (378, 0.5109000205993652), (592, 0.5109000205993652), (99, 0.5108000040054321), (685, 0.5108000040054321), (926, 0.5108000040054321), (606, 0.510699987411499), (79, 0.510699987411499), (591, 0.5105999708175659), (105, 0.5105999708175659), (798, 0.5105999708175659), (442, 0.5098000168800354), (175, 0.5097000002861023), (920, 0.5094000101089478), (605, 0.5092999935150146), (165, 0.5092999935150146), (475, 0.5091999769210815), (979, 0.5091999769210815), (540, 0.5091999769210815), (290, 0.5091000199317932), (273, 0.5091000199317932), (275, 0.5091000199317932), (240, 0.5090000033378601), (263, 0.5090000033378601), (855, 0.5090000033378601), (539, 0.5090000033378601), (704, 0.5088

In [35]:
vec_lastname_repici = hdDB.columns['lname']['Blankman']
query_vec_lastname_repici = torchhd.bind(hdDB.columns['lname'].atomic_vector, vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec_lastname_repici, 10)
print(results)
print(df.iloc[[t[0] for t in results]])

[(4, 0.6851000189781189), (577, 0.5163000226020813), (139, 0.5152000188827515), (653, 0.5142999887466431), (905, 0.5142999887466431), (427, 0.5138999819755554), (614, 0.5135999917984009), (108, 0.5131999850273132), (754, 0.5128999948501587), (362, 0.5127999782562256)]
       zip          city state   fname     lname
4      298   Sioux Falls    SD    John  Blankman
577  41086     Shoreline    LA  qelvlp    hpyvkv
139  12471   West Covina    IA  zcbfrl    axqier
653  86193         Allen    WY  zspleo    lcoqec
905  42291        Tigard    OK  pvydyk    ailhqm
427  96835        Tacoma    DE  jszklx    ondntr
614  11481  Lake Charles    MT  gewabl    irncdb
108  85626       Garland    MA  jdacum    oesdqg
754  25705   West Covina    VA  zkkjse    gtnmhk
362  97506          Gary    OK  rqguoj    ftdsbs


In [36]:
jack_repici_vec = hdDB[6]
# t = torchhd.inverse(hdDB.columns['state'].atomic_vector)

# unbinding to obtain the state that Jack Repici lives in
jack_repici_state_vec = torchhd.bind(jack_repici_vec, torchhd.inverse(hdDB.columns['state'].atomic_vector))
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['NJ']))
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['PA']))

query_vec_jack_repici_state = torchhd.bind(hdDB.columns['state'].atomic_vector, jack_repici_state_vec)
query_vec = torchhd.bundle(query_vec_jack_repici_state, query_vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec, 10)
print(results)
print(df.iloc[[t[0] for t in results]])

BSCTensor(0.6872)
BSCTensor(0.4999)
[(6, 0.7505999803543091), (2, 0.6485000252723694), (0, 0.6039000153541565), (4, 0.5856999754905701), (370, 0.550599992275238), (410, 0.5449000000953674), (617, 0.5429999828338623), (740, 0.5429999828338623), (921, 0.541700005531311), (888, 0.5414000153541565)]
       zip                        city state   fname     lname
6     8075                   Riverside    NJ    Jack    Repici
2     8075                   Riverside    NJ    John    Repici
0     8075                   Riverside    NJ    John       Doe
4      298                 Sioux Falls    SD    John  Blankman
370  68228  San Buenaventura (Ventura)    NJ  irnyjt    diqjas
410  89791                 Bakersfield    NJ  gsfhhl    zngvel
617  44240                  Charleston    NJ  fxiblb    bzzamu
740  18764                Murfreesboro    NJ  jnraxw    ihowhc
921  51178                    Savannah    NJ  ugkmyr    cvmhoj
888  58561                    Beaumont    NJ  bixgfa    suxage


In [37]:
import math

def theoretical_similarity(amount_bundled):
    amount_bundled = amount_bundled + 1 if amount_bundled % 2 == 0 else amount_bundled # +1 because we need an odd amount of elements
    expected_similarity = 1/2 + (math.comb(amount_bundled-1, (amount_bundled-1)//2) * .5**amount_bundled)
    return expected_similarity


print(theoretical_similarity(3))
print(theoretical_similarity(5))  
print(theoretical_similarity(7))

0.75
0.6875
0.65625
