In [29]:
from functools import partial
import numpy as np
import plotly.express as px
from tqdm import tqdm
import nltk
import torch, torchhd
nltk.download('punkt')

seed = 123
np.random.seed(seed)
torch.manual_seed(seed)

from utils import Wordsim353Loader, WikiTextDataset

[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
import sys
sys.path.append('..')
from shared_code.helpers import similarity_func_partial, top_k_vectors

In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(torch.get_default_dtype())

torch.float32


### The preprocessing step needs to be done only once for each dataset

In [32]:
# wiki_dataset = "wikitext-2-v1"
wiki_dataset = "wikitext-103-v1"
data_path = './data/' + wiki_dataset + '/train-00000-of-00001.parquet'
# data_path = './data/' + wiki_dataset + '/test-00000-of-00001.parquet'
# data_path = './data/' + wiki_dataset + '/validation-00000-of-00001.parquet'
out_dir = './data/preprocessed/' + wiki_dataset

# preprocessing step
# wiki_text_dataset = WikiTextDataset(data_path, out_dir)

### Load the preprocessed data

In [33]:
file_amount = 10000
min_word_count = 3
max_word_freq = 1.0
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os

files = [file for file in os.listdir(out_dir) if file.endswith('.txt')]
input_files = [out_dir + '/' + file for file in files]
file_amount = min(file_amount, len(input_files))

file_idxs = np.random.choice(len(input_files), file_amount, replace=False)
input_files = [input_files[i] for i in file_idxs]
print(len(input_files))
print(input_files[:10])

# load the preprocessed data in a term-document matrix using scikit-learn
vectorizer = TfidfVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)#, use_idf=False)
# vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq, binary=True)
# vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)
doc_term_matrix = vectorizer.fit_transform(input_files)

10000
['./data/preprocessed/wikitext-103-v1/Special_Air_Service.txt', './data/preprocessed/wikitext-103-v1/Russian_monitor_Veschun.txt', './data/preprocessed/wikitext-103-v1/M_@-@_59_(_Michigan_highway_).txt', './data/preprocessed/wikitext-103-v1/Battle_of_Kranji.txt', './data/preprocessed/wikitext-103-v1/Judge_,_Jury_,_Executioner.txt', './data/preprocessed/wikitext-103-v1/Everyone_Nose_(_All_the_Girls_Standing_in_the_Line_for_the_Bathroom_).txt', './data/preprocessed/wikitext-103-v1/Reginald_Miles.txt', './data/preprocessed/wikitext-103-v1/The_Legend_of_Zelda_:_Ocarina_of_Time.txt', './data/preprocessed/wikitext-103-v1/Fair_catch_kick.txt', './data/preprocessed/wikitext-103-v1/Kamp_Krusty.txt']


In [34]:
print(doc_term_matrix.shape)

(10000, 91613)


In [35]:
print(f'Number of documents: {doc_term_matrix.shape[0]}')
print(f'Vocab: {vectorizer.get_feature_names_out()[:10]}')
print(f'Number of elements in the vocabulary: {len(vectorizer.vocabulary_)}')
print(f'Lenght of the first document: {doc_term_matrix[0].sum()}')


Number of documents: 10000
Vocab: ['<num>' 'aa' 'aaa' 'aaaa' 'aaat' 'aab' 'aac' 'aachen' 'aadt' 'aaf']
Number of elements in the vocabulary: 91613
Lenght of the first document: 903


### Plot word-document frequencies

In [36]:
# using plotly, plot a histogram of the number of docuemnts each word appears in
# set binarize to True to count the number of documents each word appears in
word_appearances = doc_term_matrix.astype(bool).sum(axis=0)
fig = px.histogram(x=word_appearances.A1, title='Document frequency of our vocabulary')#, nbins=4000)
fig.update_xaxes(title='Number of documents the words appear in')
fig.update_yaxes(title='Number of words')
# increase font size
fig.update_layout(font=dict(size=20))
fig.update_xaxes(range=[0, 500])
fig.show()

### Onto HD Vectors

In [37]:
DIMENSIONS = 5000
vocab_size = len(vectorizer.vocabulary_)
vsa_type= 'MAP'
print("Using {} device".format(device))
sim_func = partial(similarity_func_partial, vsa_type)
dtype = torch.float32

Using cuda device


In [38]:
# How many HVs can fit in a 16GB gpu using dimensionality DIMENSIONS and int16 type components
num_fit = int(16e9 / (DIMENSIONS * 16))
print(f'Number of HVs that fit in memory: {num_fit}')

Number of HVs that fit in memory: 200000


### Creating the document embeddings to learn from the data

In [39]:
# label_embeddings = torchhd.random(vocab_size, DIMENSIONS, device=device, vsa=vsa_type)
document_embeddings = torchhd.random(doc_term_matrix.shape[0], DIMENSIONS, device=device, vsa=vsa_type, dtype=dtype)
learned_embeddings = torchhd.empty(vocab_size, DIMENSIONS, device=device, vsa=vsa_type, dtype=dtype)

## Learning the word embeddings

### Using scaling by word count in document

In [40]:
epochs = 10
scale = 10

timings = []

for epoch in range(epochs):
  start = torch.cuda.Event(enable_timing=True)
  end = torch.cuda.Event(enable_timing=True)
  start.record()
  print(f'Epoch {epoch}')
  for idx, doc_vec in enumerate(tqdm(doc_term_matrix)):
    # get the indices of the non-zero elements in the vector
    indices = doc_vec.nonzero()[1]

    # get the non-zero element counts in the vector
    element_values = doc_vec[0, indices].toarray()[0]
    # scale the values to the range [0, scale]
    # element_values = (element_values / element_values.max()) * scale
    # element_values = np.ceil(element_values)
    
    # get the atomic vector for the document
    doc_embedding = document_embeddings[idx].unsqueeze(0)
    elv = torch.tensor(element_values, dtype=dtype, device=device).unsqueeze(1)#.to(device)#.float()
    # add the doc embedding to the indices of the learned embeddings, scaled by the non-zero element values
    learned_embeddings[indices] += torch.matmul(elv, doc_embedding)
  
  end.record()
  torch.cuda.synchronize()
  timings.append(start.elapsed_time(end))
  print(f'Epoch {epoch} took {timings[-1]} ms')

print(f'Average time per epoch: {np.mean(timings)} ms')
print(f'Standard deviation of time per epoch: {np.std(timings)} ms')


### No scaling, just use word occurence

In [41]:
# epochs = 10
# timings = []
# for epoch in range(epochs):
#   start = torch.cuda.Event(enable_timing=True)
#   end = torch.cuda.Event(enable_timing=True)
#   start.record()
#   print(f'Epoch {epoch}')
#   for idx, doc_vec in enumerate(tqdm(doc_term_matrix)):
#     # get the indices of the non-zero elements in the vector
#     indices = doc_vec.nonzero()[1]
    
#     # get the atomic vector for the document
#     doc_embedding = document_embeddings[idx]#.to(device).unsqueeze(0)

#     # add the doc embedding to the indices of the learned embeddings
#     learned_embeddings[indices] += doc_embedding

#   end.record()
#   torch.cuda.synchronize()
#   timings.append(start.elapsed_time(end))
#   print(f'Time taken: {timings[-1]}')

# print(f'Mean time taken: {np.mean(timings)}')
# print(f'Std time taken: {np.std(timings)}')

Epoch 0


10000it [00:07, 1321.27it/s]


Time taken: 7538.5947265625
Epoch 1


10000it [00:07, 1342.31it/s]


Time taken: 7452.55322265625
Epoch 2


10000it [00:07, 1363.96it/s]


Time taken: 7336.408203125
Epoch 3


10000it [00:07, 1359.01it/s]


Time taken: 7360.634765625
Epoch 4


10000it [00:07, 1364.26it/s]


Time taken: 7334.23095703125
Epoch 5


10000it [00:07, 1359.44it/s]


Time taken: 7358.310546875
Epoch 6


10000it [00:07, 1354.21it/s]


Time taken: 7386.14990234375
Epoch 7


10000it [00:07, 1342.29it/s]


Time taken: 7452.49462890625
Epoch 8


10000it [00:07, 1340.51it/s]


Time taken: 7464.0341796875
Epoch 9


10000it [00:07, 1337.36it/s]

Time taken: 7480.158203125
Mean time taken: 7416.35693359375
Std time taken: 66.65222236672774





### Normalisation

In [42]:
print(learned_embeddings.shape)
# learned_embeddings = torchhd.hard_quantize(learned_embeddings) 

for idx in range(len(learned_embeddings)):
    # learned_embeddings[idx] = learned_embeddings[idx].clipping(1)
    learned_embeddings[idx] = torchhd.hard_quantize(learned_embeddings[idx])
print(learned_embeddings[:3])
# # learned_embeddings = torchhd.hard_quantize(learned_embeddings) 
# # hard quantize equally picks either 1 or -1 for all the vectors, we would like to do this randomly (but uniformly) for each vector 
# random_decision_vec = torch.randint(0, 2, (learned_embeddings.shape[0],1), device=device)
# positive = torch.tensor(1, device=device)
# negative = torch.tensor(-1, device=device)
# for idx in range(len(learned_embeddings)):
#     # learned_embeddings[idx] = learned_embeddings[idx].clipping(1)
#     learned_embeddings[idx] = torch.where(learned_embeddings[idx] > 0, positive, negative) if random_decision_vec[idx] == 1 else torch.where(learned_embeddings[idx] >= 0, positive, negative)
# print(learned_embeddings[:3])

torch.Size([91613, 5000])
MAPTensor([[-1.,  1.,  1.,  ...,  1.,  1., -1.],
           [-1., -1.,  1.,  ...,  1.,  1., -1.],
           [-1.,  1.,  1.,  ..., -1.,  1.,  1.]], device='cuda:0')


In [43]:
def plot_k_most_similar_words(reference_word, memory, k=10, learning_method='VSA'):
    vocabulary = vectorizer.vocabulary_

    word_idx = vocabulary[reference_word]
    word_vector = memory[word_idx]
    # get the k most similar words to the reference word
    similarities, similar_vector_idxs = top_k_vectors(word_vector, memory, k+1, vsa_type)
    # remove the reference word from the result
    similarities = similarities[1:]
    similar_vector_idxs = similar_vector_idxs[1:]
    
    similar_words = [vectorizer.get_feature_names_out()[idx] for idx in similar_vector_idxs]
    # bar plot using plotly express
    fig = px.bar(x=similar_words, y=similarities.cpu().numpy(), title=f"Most similar words to {reference_word}, {learning_method}-based learning", labels={'x': 'Word', 'y': 'Similarity'})
    fig.update_xaxes(tickangle=-45)
    fig.update_layout(font=dict(size=20))
    fig.show()

### Nearest Neighbours to validate the embeddings

In [44]:
reference_word = "movie"

plot_k_most_similar_words(reference_word, learned_embeddings, k=15)

### plotting document co-occurence to validate embeddings capture co-occurence relationships

In [45]:
# we will plot the amount of documents that the top-k words co-occur in with the reference word
def plot_k_most_similar_words_cooccurrence(reference_word, memory, k=10):
    vocabulary = vectorizer.vocabulary_

    word_idx = vocabulary[reference_word]
    word_vector = memory[word_idx]
    # get the k most similar words to the reference word
    similarities, similar_vector_idxs = top_k_vectors(word_vector, memory, k+1, vsa_type)
    # remove the reference word from the result
    similarities = similarities[1:]
    similar_vector_idxs = similar_vector_idxs[1:].cpu().numpy()
    
    similar_words = [vectorizer.get_feature_names_out()[idx] for idx in similar_vector_idxs]
    # get the co-occurrence matrix
    cooccurrence_matrix = doc_term_matrix[:, similar_vector_idxs].astype(bool)
    # sum the co-occurrence matrix to get the amount of documents the reference word co-occurs in with the top-k words
    cooccurrence_sum = cooccurrence_matrix.sum(axis=0)
    cooccurrence_sum = cooccurrence_sum.A1
    # bar plot using plotly express
    fig = px.bar(x=similar_words, y=cooccurrence_sum, title=f"Co-occurrence of {reference_word} with the top-{k} words", labels={'x': 'Word', 'y': 'Co-occurrence'})
    fig.update_xaxes(tickangle=-45)
    fig.update_layout(font=dict(size=20))
    fig.show()

In [46]:
# plot_k_most_similar_words_cooccurrence(reference_word, learned_embeddings, k=15)

### similarity of random vocabulary word to other words in the vocabulary

In [47]:
reference_vector = learned_embeddings[vectorizer.vocabulary_[reference_word]]
similarities = sim_func(reference_vector, learned_embeddings)
similarities_cpu = similarities.cpu().numpy()
similarities_sorted = np.sort(similarities_cpu)[::-1]
# count amount of similarities above .3
similarities_above_threshold = similarities_cpu[similarities_cpu > .3]
print(f'Number of similarities above .3: {len(similarities_above_threshold)-1}')

Number of similarities above .3: 8


In [48]:
# plot similarity distribution using plotly express
fig = px.histogram(x=similarities.cpu().numpy(), title=f"Similarity distribution of word {reference_word} to all other words in vocabulary", labels={'x': 'Similarity', 'y': 'Frequency'}, nbins=1000)
fig.update_xaxes(range=[-.1, .3])
fig.update_layout(font=dict(size=20))
fig.show()

### Wordsim353 validation

In [49]:
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")
print(f"Number of word pairs: {len(wordsim353_loader)}")

wordsim_vsa_learned_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = learned_embeddings[word1_idx]
    word2_vector = learned_embeddings[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_vsa_learned_similarities.append((word1, word2, similarity.item(), float(score)))

print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_vsa_learned_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_vsa_learned_similarities], [score for _, _, _, score in wordsim_vsa_learned_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

Number of word pairs: 252


100%|██████████| 252/252 [00:00<00:00, 1236.49it/s]

First 10 similarities:
computer - keyboard      0.092 	 7.62
planet - galaxy          0.182 	 8.11
canyon - landscape       0.054 	 7.53
day - summer             0.298 	 3.94
day - dawn               0.144 	 7.53
country - citizen        0.116 	 7.31
planet - people          0.139 	 5.75
environment - ecology    0.096 	 8.81
money - bank             0.167 	 8.5
computer - software      0.297 	 8.5
Correlation between learned similarities and human scores: 0.28417211392322433





In [50]:
# wiki_text_dataset.get_word_frequency(reference_word, document_frequency=True)
print(f'Term frequency of {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].sum()}')
print(f'Amount of documents it appears in: {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].count_nonzero()}')

Term frequency of movie: 1138
Amount of documents it appears in: movie: 1138


### Avg similarity vs frequency in documents

In [51]:
# import pandas as pd

# avg_similarities = []
# random_idxs = np.random.choice(range(len(vectorizer.vocabulary_)), 500, replace=False)
# df_avg_similarities = pd.DataFrame(columns=["word", "doc_freq", "avg_similarity"])

# for word_idx in tqdm(random_idxs):
#   word = vectorizer.get_feature_names_out()[word_idx]
#   word_vector = learned_embeddings[word_idx]

#   word_doc_freq = doc_term_matrix[:, word_idx].count_nonzero()
#   # remove word vecor from learned embeddings
#   # similarities = sim_func(word_vector, torch.cat((learned_embeddings[:word_idx], learned_embeddings[word_idx+1:])))
#   similarities = sim_func(word_vector, learned_embeddings)
#   avg_similarity = similarities.mean().item()
#   avg_similarities.append((word_doc_freq, avg_similarity, word))

# # Convert avg_similarities into a DataFrame
# df_avg_similarities = pd.DataFrame(avg_similarities, columns=['Word Document Frequency', 'Average Similarity', 'Word'])


In [52]:
# # scatter plot using plotly express
# fig = px.scatter(df_avg_similarities, x='Word Document Frequency', y='Average Similarity', title=f"Average similarity (over vocabulary) of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'}, hover_data=['Word'])
# fig.show()

In [53]:
# target_word = "movie"
# print document frequency of target word
# print(f"Document frequency of {target_word}: {doc_term_matrix[:, vectorizer.vocabulary_[target_word]].count_nonzero()}")

### How much information is contained in the doc-term matrix?

In [54]:
# clear cuda
torch.cuda.empty_cache()

In [55]:
doc_term_embeddings = torchhd.ensure_vsa_tensor(doc_term_matrix.T.todense())

In [56]:
# plot_k_most_similar_words(reference_word, doc_term_embeddings, k=15)

## Wordsim353 validation

In [57]:
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")

wordsim_doc_term_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = doc_term_embeddings[word1_idx]
    word2_vector = doc_term_embeddings[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_doc_term_similarities.append((word1, word2, similarity.item(), float(score)))


print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_doc_term_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_doc_term_similarities], [score for _, _, _, score in wordsim_doc_term_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_doc_term_similarities], [sim for _, _, sim, _ in wordsim_vsa_learned_similarities])
print(f"Correlation between doc-term similarities and learned similarities: {corr[0, 1]}")

100%|██████████| 252/252 [00:18<00:00, 13.74it/s]

First 10 similarities:
computer - keyboard      0.122 	 7.62
planet - galaxy          0.262 	 8.11
canyon - landscape       0.086 	 7.53
day - summer             0.458 	 3.94
day - dawn               0.21 	 7.53
country - citizen        0.193 	 7.31
planet - people          0.19 	 5.75
environment - ecology    0.141 	 8.81
money - bank             0.263 	 8.5
computer - software      0.409 	 8.5
Correlation between learned similarities and human scores: 0.2837970023505008
Correlation between doc-term similarities and learned similarities: 0.9819310154150354





In [58]:
# # pre select doc-term embeddings with a similarity above 0.1 with the target word
# target_vec = doc_term_embeddings[vectorizer.vocabulary_[reference_word]]
# similarities = sim_func(target_vec, doc_term_embeddings)
# doc_term_embeddings = doc_term_embeddings[similarities > 0.1]

# plot_k_most_similar_words(reference_word, doc_term_embeddings, k=15, learning_method='None')

: 