In [17]:
from functools import partial
import numpy as np
import plotly.express as px
from tqdm import tqdm
# import nltk
import torch, torchhd
# nltk.download('punkt')

seed = 123
np.random.seed(seed)
torch.manual_seed(seed)

from utils import Wordsim353Loader, WikiTextDataset

In [18]:
import sys
sys.path.append('..')
from shared_code.helpers import similarity_func_partial, top_k_vectors

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
print(torch.get_default_dtype())

Using cuda device
torch.float32


### The preprocessing step needs to be done only once for each dataset

In [20]:
# wiki_dataset = "wikitext-2-v1"
wiki_dataset = "wikitext-103-v1"
data_path = './data/' + wiki_dataset + '/train-00000-of-00001.parquet'
# data_path = './data/' + wiki_dataset + '/test-00000-of-00001.parquet'
# data_path = './data/' + wiki_dataset + '/validation-00000-of-00001.parquet'
out_dir = './data/preprocessed/' + wiki_dataset

# preprocessing step
# wiki_text_dataset = WikiTextDataset(data_path, out_dir)

### Load the preprocessed data

In [21]:
file_amount = 10000
min_word_count = 3
max_word_freq = 1.0
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os

files = [file for file in os.listdir(out_dir) if file.endswith('.txt')]
input_files = [out_dir + '/' + file for file in files]
file_amount = min(file_amount, len(input_files))

file_idxs = np.random.choice(len(input_files), file_amount, replace=False)
input_files = [input_files[i] for i in file_idxs]
print(len(input_files))
print(input_files[:10])

# load the preprocessed data in a term-document matrix using scikit-learn
vectorizer = TfidfVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)#, use_idf=False)
# vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq, binary=True)
# vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)
doc_term_matrix = vectorizer.fit_transform(input_files)

10000
['./data/preprocessed/wikitext-103-v1/Special_Air_Service.txt', './data/preprocessed/wikitext-103-v1/Russian_monitor_Veschun.txt', './data/preprocessed/wikitext-103-v1/M_@-@_59_(_Michigan_highway_).txt', './data/preprocessed/wikitext-103-v1/Battle_of_Kranji.txt', './data/preprocessed/wikitext-103-v1/Judge_,_Jury_,_Executioner.txt', './data/preprocessed/wikitext-103-v1/Everyone_Nose_(_All_the_Girls_Standing_in_the_Line_for_the_Bathroom_).txt', './data/preprocessed/wikitext-103-v1/Reginald_Miles.txt', './data/preprocessed/wikitext-103-v1/The_Legend_of_Zelda_:_Ocarina_of_Time.txt', './data/preprocessed/wikitext-103-v1/Fair_catch_kick.txt', './data/preprocessed/wikitext-103-v1/Kamp_Krusty.txt']


In [22]:
print(doc_term_matrix.shape)

(10000, 91613)


In [23]:
print(f'Number of documents: {doc_term_matrix.shape[0]}')
print(f'Vocab: {vectorizer.get_feature_names_out()[:10]}')
print(f'Number of elements in the vocabulary: {len(vectorizer.vocabulary_)}')
print(f'Lenght of the first document: {doc_term_matrix[0].sum()}')


Number of documents: 10000
Vocab: ['<num>' 'aa' 'aaa' 'aaaa' 'aaat' 'aab' 'aac' 'aachen' 'aadt' 'aaf']
Number of elements in the vocabulary: 91613
Lenght of the first document: 11.292901131964294


### Plot word-document frequencies

In [24]:
word_appearances = doc_term_matrix.astype(bool).sum(axis=0)
fig = px.histogram(x=word_appearances.A1, title='Document frequency of our vocabulary')#, nbins=4000)
fig.update_xaxes(title='Number of documents it appears in')
fig.update_yaxes(title='Number of words')
# increase font size
fig.update_layout(font=dict(size=20))
fig.update_xaxes(range=[0, 500])
fig.show()

In [25]:
vocab_size = len(vectorizer.vocabulary_)
vsa_type= 'MAP'
sim_func = partial(similarity_func_partial, vsa_type)

In [26]:
def plot_k_most_similar_words(reference_word, memory, k=10, learning_method='VSA'):
    vocabulary = vectorizer.vocabulary_

    word_idx = vocabulary[reference_word]
    word_vector = memory[word_idx]
    # get the k most similar words to the reference word
    similarities, similar_vector_idxs = top_k_vectors(word_vector, memory, k+1, vsa_type)
    # remove the reference word from the result
    similarities = similarities[1:]
    similar_vector_idxs = similar_vector_idxs[1:]
    
    similar_words = [vectorizer.get_feature_names_out()[idx] for idx in similar_vector_idxs]
    # bar plot using plotly express
    fig = px.bar(x=similar_words, y=similarities.cpu().numpy(), title=f"Most similar words to {reference_word}, {learning_method}-based learning", labels={'x': 'Word', 'y': 'Similarity'})
    fig.update_xaxes(tickangle=-45)
    fig.update_layout(font=dict(size=20))
    fig.show()

In [27]:
reference_word = "movie"

In [28]:
# wiki_text_dataset.get_word_frequency(reference_word, document_frequency=True)
print(f'Term frequency of {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].sum()}')
print(f'Amount of documents it appears in: {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].count_nonzero()}')

Term frequency of movie: 19.16424004915941
Amount of documents it appears in: movie: 1138


## LSA

### Learning word embeddings using LSA

In [29]:
dim_reduced = 2000

# doc_term_tensor = torch.tensor(doc_term_matrix.todense(), device=device)
doc_term_tensor = torch.sparse_coo_tensor(doc_term_matrix.nonzero(), doc_term_matrix.data, device=device, dtype=torch.float32)
# print(torch.linalg.matrix_rank(doc_term_tensor))
# svd = torch.pca_lowrank(doc_term_tensor, q=dim_reduced)
svd = torch.svd_lowrank(doc_term_tensor, q=dim_reduced)
# svd = torch.linalg.svd(doc_term_tensor, full_matrices=False)
learned_embeddings_lsa = svd[2] @ torch.diag(svd[1])
print(learned_embeddings_lsa.shape)
print(learned_embeddings_lsa[:3])
# learned_embeddings_lsa = torchhd.ensure_vsa_tensor(doc_term_matrix.T.todense())

torch.Size([91613, 2000])
tensor([[-4.0800e+01, -3.7538e-01, -7.7689e-01,  ..., -3.4626e-04,
         -1.5970e-05,  6.9931e-04],
        [-5.7145e-02, -1.3357e-02, -2.7402e-02,  ..., -2.2587e-03,
          4.4494e-05, -1.8148e-03],
        [-1.6603e-02,  3.5750e-03, -4.2938e-03,  ...,  3.6918e-03,
         -4.5636e-03,  4.2453e-03]], device='cuda:0')


## Timing

In [30]:
timings = []
epochs = 10

for epoch in range(epochs):
  start = torch.cuda.Event(enable_timing=True)
  end = torch.cuda.Event(enable_timing=True)
  start.record()
  svd = torch.svd_lowrank(doc_term_tensor, q=dim_reduced)
  learned_embeddings_lsa = svd[0] @ torch.diag(svd[1])
  end.record()
  torch.cuda.synchronize()
  timings.append(start.elapsed_time(end))
  print(f'Epoch {epoch+1}/{epochs} took {timings[-1]} ms')

print(f'Mean time: {np.mean(timings)} ms')
print(f'Std time: {np.std(timings)} ms')

Epoch 1/10 took 6345.1728515625 ms
Epoch 2/10 took 5965.302734375 ms
Epoch 3/10 took 5202.76171875 ms
Epoch 4/10 took 5168.947265625 ms
Epoch 5/10 took 5231.50439453125 ms
Epoch 6/10 took 5234.66455078125 ms
Epoch 7/10 took 5234.41748046875 ms
Epoch 8/10 took 5298.951171875 ms
Epoch 9/10 took 6245.138671875 ms
Epoch 10/10 took 6196.681640625 ms
Mean time: 5612.354248046875 ms
Std time: 479.2264566275218 ms


In [31]:
# from sklearn.decomposition import TruncatedSVD

# # tfidf_vectorizer = TfidfVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count)
# # doc_term_matrix = tfidf_vectorizer.fit_transform(input_files)

# svd = TruncatedSVD(random_state=seed, n_components=1000, n_iter=1)
# lsa = svd.fit(doc_term_matrix)

# learned_embeddings_lsa = torch.tensor(lsa.components_, device=device).T
# print(learned_embeddings_lsa.shape)

# # learned_embeddings_lsa = torchhd.ensure_vsa_tensor(doc_term_matrix.T.todense())

## Wordsim353 validation

In [32]:
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")
print(f"Number of word pairs: {len(wordsim353_loader)}")

wordsim_lsa_learned_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = learned_embeddings_lsa[word1_idx]
    word2_vector = learned_embeddings_lsa[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_lsa_learned_similarities.append((word1, word2, similarity.item(), float(score)))


print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_lsa_learned_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_lsa_learned_similarities], [score for _, _, _, score in wordsim_lsa_learned_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

Number of word pairs: 252


  0%|          | 0/252 [00:00<?, ?it/s]


IndexError: index 16326 is out of bounds for dimension 0 with size 10000

In [None]:
doc_term_embeddings = torchhd.ensure_vsa_tensor(doc_term_matrix.T.todense())

wordsim_doc_term_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = doc_term_embeddings[word1_idx]
    word2_vector = doc_term_embeddings[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_doc_term_similarities.append((word1, word2, similarity.item(), float(score)))

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_doc_term_similarities], [sim for _, _, sim, _ in wordsim_lsa_learned_similarities])
print(f"Correlation between doc-term similarities and learned similarities: {corr[0, 1]}")

In [None]:
plot_k_most_similar_words(reference_word, learned_embeddings_lsa, k=15, learning_method='LSA')

In [None]:
# plot_k_most_similar_words_cooccurrence(reference_word, learned_embeddings_lsa, k=15)

### similarity of random vocabulary word to other words in the vocabulary

In [None]:
reference_vector = learned_embeddings_lsa[vectorizer.vocabulary_[reference_word]]
similarities = sim_func(reference_vector, learned_embeddings_lsa)
similarities_cpu = similarities.cpu().numpy()

In [None]:
# plot similarity distribution using plotly express
fig = px.histogram(x=similarities.cpu().numpy(), title=f"Similarity distribution of word {reference_word} to all other words in vocabulary", labels={'x': 'Similarity', 'y': 'Frequency'})
fig.update_xaxes(range=[-.1, .4])
fig.update_layout(font=dict(size=20))
fig.show()

In [None]:
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")
print(f"Number of word pairs: {len(wordsim353_loader)}")

dim_reductions = [x for x in range(50, 6000, 100)]

correlations = []

svd = torch.svd_lowrank(doc_term_tensor, q=dim_reductions[-1], niter=1)

for dim in tqdm(dim_reductions):
    torch.cuda.empty_cache()
    learned_embeddings_lsa = svd[0][:, :dim] @ torch.diag(svd[1][:dim])
    wordsim_lsa_learned_similarities = []

    for word1, word2, score in wordsim353_loader:
        if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
            continue
        word1_idx = vectorizer.vocabulary_[word1]
        word2_idx = vectorizer.vocabulary_[word2]
        word1_vector = learned_embeddings_lsa[word1_idx]
        word2_vector = learned_embeddings_lsa[word2_idx]
        similarity = sim_func(word1_vector, word2_vector)
        wordsim_lsa_learned_similarities.append((word1, word2, similarity.item(), float(score)))

    corr = np.corrcoef([sim for _, _, sim, _ in wordsim_lsa_learned_similarities], [score for _, _, _, score in wordsim_lsa_learned_similarities])
    correlations.append(corr[0, 1])

Number of word pairs: 252


In [None]:
fig = px.line(x=dim_reductions, y=correlations, title='Correlation between learned similarities and human scores for different reduced LSA dimensionalities', labels={'x': 'Dimensionality', 'y': 'Correlation'})
fig.update_layout(font=dict(size=20))
fig.show()