In [3]:
import torchhd, torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from HDDB import HDDB
from utils import similarity_func_partial
from functools import partial

### Loading the dataset

In [4]:
# Load the address book, trim surrounding whitespace from every text column,
# then drop the street-address and zip columns.
df = (
    pd.read_csv('data/addresses.csv')
    .apply(lambda column: column.str.strip() if column.dtype == "object" else column)
    .drop(columns=['address', 'zip'])
)
df

Unnamed: 0,fname,lname,city,state
0,John,Doe,Riverside,NJ
1,Jack,McGinnis,Philadelphia,PA
2,John,Repici,Riverside,NJ
3,Stephen,Tyler,SomeTown,SD
4,John,Blankman,SomeTown,SD
5,Joan,Anne,Desert City,CO
6,Jack,Repici,Riverside,NJ
7,Lilly,Repici,Philadelphia,PA


### Extending the dataset

In [56]:
# Reference tables used to synthesise extra rows: US states (full name + abbreviation)
# and the top-1k US cities. Column headers are normalised to stripped lower-case.
all_states_df = pd.read_csv('data/states.csv')
all_states_df.columns = all_states_df.columns.str.strip().str.lower()
print(len(all_states_df['abbreviation'].unique()))
all_cities_df = pd.read_csv('data/us-cities-top-1k.csv')
all_cities_df.columns = all_cities_df.columns.str.strip().str.lower()

# Attach the state abbreviation to every city with a single lookup table instead of
# re-filtering all_states_df once per city row. drop_duplicates keeps the first entry
# per state name, matching the previous "first match wins" behaviour.
state_to_abbreviation = all_states_df.drop_duplicates('state').set_index('state')['abbreviation']
all_cities_df['abbreviation'] = all_cities_df['state'].map(state_to_abbreviation)

# Fail loudly (and name the culprits) if a city refers to a state we cannot abbreviate.
unmapped_states = all_cities_df.loc[all_cities_df['abbreviation'].isna(), 'state'].unique()
if len(unmapped_states) > 0:
    raise ValueError(f"No abbreviation found for state(s): {list(unmapped_states)}")

print(len(all_cities_df['abbreviation'].unique()))
print(all_cities_df[all_cities_df['abbreviation'] == 'UT'])

51
51
                 city state  population        lat         lon abbreviation
31              Logan  Utah       48913  41.736980 -111.833836           UT
193      South Jordan  Utah       59366  40.562170 -111.929658           UT
285         Bountiful  Utah       43023  40.889390 -111.880771           UT
381             Provo  Utah      116288  40.233844 -111.658534           UT
434      Spanish Fork  Utah       36956  40.114955 -111.654923           UT
440      Taylorsville  Utah       60519  40.667725 -111.938826           UT
466          Riverton  Utah       40921  40.521893 -111.939102           UT
500              Lehi  Utah       54382  40.391617 -111.850766           UT
533             Ogden  Utah       84249  41.223000 -111.973830           UT
568            Layton  Utah       70790  41.060222 -111.971053           UT
603            Murray  Utah       48612  40.666892 -111.887991           UT
630             Sandy  Utah       90231  40.564978 -111.838973           UT
673  W

In [54]:
def generate_random_rows(n=1000, city_state_relation=False, seed=None):
    """Build a dataframe of ``n`` synthetic address-book rows.

    Reads the notebook-level frames ``all_states_df`` (column ``'abbreviation'``)
    and ``all_cities_df`` (columns ``'city'`` and ``'abbreviation'``).

    Parameters
    ----------
    n : int
        Number of rows to generate.
    city_state_relation : bool
        If True, each row's city is drawn only from the cities whose abbreviation
        equals the row's state. If False, city and state are drawn independently.
    seed : int or None
        If given, a dedicated ``np.random.default_rng(seed)`` is used so the result
        is reproducible. If None (default), the global ``np.random`` state is used,
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``fname``, ``lname``, ``state``, ``city`` (in that order); the names
        are random 10-letter lower-case strings.

    Raises
    ------
    ValueError
        If ``city_state_relation`` is True and a sampled state has no city in
        ``all_cities_df``.
    """
    # The np.random module and a Generator both expose .choice(a, size, replace=...).
    rng = np.random if seed is None else np.random.default_rng(seed)
    alphabet = list('abcdefghijklmnopqrstuvwxyz')

    def random_names(count, name_len=10):
        # One (count, name_len) draw of letters, joined row by row into names.
        letters = rng.choice(alphabet, (count, name_len), replace=True)
        return [''.join(row) for row in letters]

    fnames = random_names(n)
    lnames = random_names(n)
    states = rng.choice(all_states_df['abbreviation'].to_numpy(), n, replace=True)

    if city_state_relation:
        # Group the cities by state once instead of filtering the frame per row.
        cities_by_state = {
            abbreviation: group.to_numpy()
            for abbreviation, group in all_cities_df.groupby('abbreviation')['city']
        }
        states_without_city = sorted(set(states) - set(cities_by_state))
        if states_without_city:
            raise ValueError(f"No city available for state(s): {states_without_city}")
        cities = [rng.choice(cities_by_state[state]) for state in states]
    else:
        cities = rng.choice(all_cities_df['city'].to_numpy(), n, replace=True)

    return pd.DataFrame({'fname': fnames, 'lname': lnames, 'state': states, 'city': cities})

# Quick look at the generator: with city_state_relation=True each city is drawn
# only from the cities listed for that row's state.
print(generate_random_rows(10, city_state_relation=True))

        fname       lname state          city
0  zzxemhbzde  ccasuwtaai    UT         Ogden
1  rqxnsmbruu  phbtnpfhvu    CO       Greeley
2  toeealkklj  tcarytqkrk    MN      Moorhead
3  zxjlrjuapr  edhyueuhkb    HI      Honolulu
4  xegxtkgrvb  mbjzzfyhgf    AZ    Oro Valley
5  hncikmjjgs  aaaagkokka    NE  Grand Island
6  pxubmuvnds  mlsrjtrvaq    HI      Honolulu
7  toyjwablnf  horwmovope    RI      Cranston
8  evzesdotxu  nnennplhng    NJ       Hoboken
9  vsabanjjda  vjgkgdvznp    LA  Bossier City


In [57]:
# add random rows to the dataframe
# (city_state_relation defaults to False, so city and state are sampled independently here)
random_df = generate_random_rows(1000)
print(random_df.shape)
# NOTE(review): `df` is rebound to itself plus the new rows, so this cell is not
# idempotent -- every re-run appends another 1000 rows. One run on the 8 loaded rows
# would give labels 0..1007; the saved sample shows labels well above that, which
# suggests the cell was executed more than once. Restart & Run All before trusting row ids.
df = pd.concat([df, random_df], ignore_index=True)
df.sample(10)

(1000, 4)


Unnamed: 0,fname,lname,city,state
1834,kkyoucnspz,hjgrcweyjb,Palo Alto,GA
1997,ynpkkjhnzj,lqxsxfiqew,Portage,SD
586,alwpegxdlm,myogmctlak,Fort Smith,VA
70,kcwwdtxplp,jvhktwqrrj,Lexington,ME
1394,shltgibuoq,nopypltlwo,Lakewood,IA
1237,gqsycnqwur,oofzgpmahz,Apple Valley,MA
529,fzxioeinkv,hscdmxqgch,Naples,MI
301,jmvpurloth,kkkmreibnp,St. Paul,NH
1529,hbvclgbrgp,olirnhkuwm,Weston,WI
1044,qrgmmexyzd,uqvgvrijwn,Fort Lauderdale,TN


In [58]:
# Experiment configuration for the hyperdimensional database.
exp_dim = 10_000  # handed to HDDB as `dim`
exp_vsa_type = "MAP"  # handed to HDDB as `vsa_type` and to the similarity helper
# Pre-bind the VSA type as the first positional argument so later cells can call sim_func(a, b).
sim_func = partial(similarity_func_partial, exp_vsa_type)
hdDB = HDDB(dim=exp_dim, vsa_type=exp_vsa_type)

In [59]:
# Hand the dataframe's column labels to the database before any row is added.
# Later cells look columns up by these labels, e.g. hdDB.columns['state'].
hdDB.set_columns(df.columns)

In [60]:
# Insert every dataframe row into the database, passing its index label and raw values.
# `df` was concatenated with ignore_index=True, so the labels equal the row positions;
# presumably that is what lets later cells feed the returned row ids to df.iloc.
for index, row in df.iterrows():
    values = row.values
    hdDB.add_row(index, values)

In [61]:
# Query: which rows have state == 'NJ'?
state_column = hdDB.columns['state']
vec_NJ = state_column['NJ']
# The query is the column's atomic vector bound with the value's vector.
query_vec_NJ = torchhd.bind(state_column.atomic_vector, vec_NJ)
results = hdDB.most_similar_rows(query_vec_NJ, 5)
print(results)
# The first element of every hit is the row id; show those rows from the dataframe.
matched_row_ids = [hit[0] for hit in results]
print(df.iloc[matched_row_ids])

[(1089, 0.4910036325454712), (1035, 0.485140323638916), (0, 0.4845851957798004), (1295, 0.48352059721946716), (1570, 0.4829839766025543)]
           fname       lname       city state
1089  kcwekzzgpr  jcxmhbvkdp  San Ramon    NJ
1035  atcldaofbo  dcweyozuel  Homestead    NJ
0           John         Doe  Riverside    NJ
1295  xstrsgxwii  bmsbhldmdg   Franklin    NJ
1570  wgqpblvyhz  tosmxforba   Palmdale    NJ


In [62]:
# Query: which rows have lname == 'Repici'?
lname_column = hdDB.columns['lname']
vec_lastname_repici = lname_column['Repici']
# The query is the column's atomic vector bound with the value's vector.
query_vec_lastname_repici = torchhd.bind(lname_column.atomic_vector, vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec_lastname_repici, 5)
print(results)
# The first element of every hit is the row id; show those rows from the dataframe.
matched_row_ids = [hit[0] for hit in results]
print(df.iloc[matched_row_ids])

[(2, 0.47767457365989685), (6, 0.47504958510398865), (7, 0.47221189737319946), (428, 0.03663639351725578), (1687, 0.030521005392074585)]
           fname       lname          city state
2           John      Repici     Riverside    NJ
6           Jack      Repici     Riverside    NJ
7          Lilly      Repici  Philadelphia    PA
428   sbacvhduqq  nfdjsdgsfl    Wilmington    DE
1687  nktcxedsls  ybhrirjmmq  Gaithersburg    DE


In [64]:
# Goal: find people who share Jack Repici's state AND his last name, without
# naming the state explicitly. Row 6 of the address table is Jack Repici.
jack_repici_vec = hdDB[6]

# unbinding to obtain the state that Jack Repici lives in:
# binding the row vector with the inverse of the state column's atomic vector
# should leave a noisy copy of his state value.
jack_repici_state_vec = torchhd.bind(jack_repici_vec, torchhd.inverse(hdDB.columns['state'].atomic_vector))
# Sanity check: the unbound vector should be clearly closer to 'NJ' than to 'PA'.
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['NJ']))
print(sim_func(jack_repici_state_vec, hdDB.columns['state']['PA']))

# Re-bind the recovered state with the state column to get a "state == <Jack's state>" query.
# NOTE(review): the unbound vector is used as-is, without first snapping it to the
# nearest stored state value. If MAP binding is self-inverse here, binding the atomic
# vector back on may simply reproduce jack_repici_vec -- TODO confirm; if so, this
# query also carries Jack's other fields, not just his state.
query_vec_jack_repici_state = torchhd.bind(hdDB.columns['state'].atomic_vector, jack_repici_state_vec)
# Bundle the two sub-queries (relies on query_vec_lastname_repici from the last-name query cell).
query_vec = torchhd.bundle(query_vec_jack_repici_state, query_vec_lastname_repici)
results = hdDB.most_similar_rows(query_vec, 10)
print(results)
# The first element of every hit is the row id; show those rows from the dataframe.
print(df.iloc[[t[0] for t in results]])


MAPTensor(0.4776)
MAPTensor(-0.0091)
[(6, 0.8214219808578491), (2, 0.6686680912971497), (7, 0.4192749559879303), (0, 0.22414912283420563), (460, 0.1413332223892212), (1295, 0.1337350457906723), (1410, 0.13279324769973755), (1953, 0.13200430572032928), (1228, 0.13127240538597107), (1235, 0.1279529184103012)]
           fname       lname                  city state
6           Jack      Repici             Riverside    NJ
2           John      Repici             Riverside    NJ
7          Lilly      Repici          Philadelphia    PA
0           John         Doe             Riverside    NJ
460   suzxegpacp  crtcveqkza              Bellevue    NJ
1295  xstrsgxwii  bmsbhldmdg              Franklin    NJ
1410  wfeouwiqft  idmtfhzpwy  Athens-Clarke County    NJ
1953  empmvjgixw  bzdxzvnliq                Euless    NJ
1228  sccfrryfbq  smpwrczurv               Portage    NJ
1235  shyiyhoutb  xqahinollj                Lorain    NJ
