In [47]:
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import nltk
import torch, torchhd
import torchtext.datasets as datasets
from torch.utils.data import DataLoader
nltk.download('punkt')

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

from utils import WikiTextDataset, Wordsim353Loader

[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
import sys
sys.path.append('..')
from shared_code.helpers import similarity_func_partial, top_k_vectors

In [49]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### This preprocessing step needs to be done only once for each dataset

In [50]:
# wiki_dataset = "wikitext-2-v1"
wiki_dataset = "wikitext-103-v1"
data_path = './data/' + wiki_dataset + '/train-00000-of-00001.parquet'
out_dir = './data/preprocessed/' + wiki_dataset

# wiki_text_dataset = WikiTextDataset(data_path, out_dir)

### Load the preprocessed data

In [51]:
file_amount = 1000
min_word_count = 5
max_word_freq = 1.0
# load the preprocessed data in a term-document matrix using scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os

files = [file for file in os.listdir(out_dir) if file.endswith('.txt')]
input_files = [out_dir + '/' + file for file in files]
file_amount = min(file_amount, len(input_files))

file_idxs = np.random.choice(len(input_files), file_amount, replace=False)
input_files = [input_files[i] for i in file_idxs]
print(len(input_files))
print(input_files[:10])

# vectorizer = TfidfVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)#, use_idf=False)
vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count, max_df=max_word_freq)#, binary=True)
doc_term_matrix = vectorizer.fit_transform(input_files)
# print(doc_term_matrix[0])

1000
['./data/preprocessed/wikitext-103-v1/Scottish_Labour_Party_leadership_election_,_2014.txt', './data/preprocessed/wikitext-103-v1/Australia_Day.txt', './data/preprocessed/wikitext-103-v1/Zoo_Station_(_song_).txt', './data/preprocessed/wikitext-103-v1/Malcolm_Hardee.txt', './data/preprocessed/wikitext-103-v1/Alberto_Henschel.txt', './data/preprocessed/wikitext-103-v1/Scott_Brown.txt', './data/preprocessed/wikitext-103-v1/My_Life_with_Master.txt', './data/preprocessed/wikitext-103-v1/Paul_Kelly_(_Australian_musician_).txt', './data/preprocessed/wikitext-103-v1/TNA_World_Tag_Team_Championship.txt', './data/preprocessed/wikitext-103-v1/2014_Orkney_earthquake.txt']


In [52]:
print(doc_term_matrix.shape)

(1000, 18688)


In [53]:
# print(f'Number of documents: {len(wiki_text_dataset)}')
# print(f'Title of first document: {wiki_text_dataset[0].title}')
# print(f'First 10 words of first document: {wiki_text_dataset[0].text[:10]}')
# print(f'Length of first document: {len(wiki_text_dataset[0].text)}')
# print(f'Number of elements in vocabulary of first document: {len(wiki_text_dataset[0].vocabulary)}')
# print(f'Number of elements in the vocabulary: {len(wiki_text_dataset.vocabulary)}')

print(f'Number of documents: {doc_term_matrix.shape[0]}')
print(f'Vocab: {vectorizer.get_feature_names_out()[:10]}')
print(f'Number of elements in the vocabulary: {len(vectorizer.vocabulary_)}')
print(f'Lenght of the first document: {doc_term_matrix[0].sum()}')


Number of documents: 1000
Vocab: ['<num>' 'aa' 'aaa' 'aadt' 'aaron' 'ab' 'abandon' 'abandoned' 'abandoning'
 'abandonment']
Number of elements in the vocabulary: 18688
Lenght of the first document: 3575


### Onto HD Vectors

In [54]:
DIMENSIONS = 10_000
vocab_size = len(vectorizer.vocabulary_)
vsa_type= 'MAP'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))
sim_func = partial(similarity_func_partial, vsa_type)

Using cuda device


### Creating the document embeddings to learn from the data

In [55]:
# label_embeddings = torchhd.random(vocab_size, DIMENSIONS, device=device, vsa=vsa_type)
document_embeddings = torchhd.random(doc_term_matrix.shape[0], DIMENSIONS, device=device, vsa=vsa_type)
learned_embeddings = torchhd.empty(vocab_size, DIMENSIONS, device=device, vsa=vsa_type)

### Learning the word embeddings

In [56]:
epochs = 1
scale = 30
# scale = 1
for epoch in range(epochs):
  print(f'Epoch {epoch}')
  for idx, doc_vec in enumerate(tqdm(doc_term_matrix)):
    # get the indices of the non-zero elements in the vector
    indices = doc_vec.nonzero()[1]

    # get the non-zero element counts in the vector
    element_values = doc_vec[0, indices].toarray()[0]
    # scale the values to the range [0, scale]
    element_values = (element_values / element_values.max()) * scale
    element_values = np.ceil(element_values)
    
    # get the atomic vector for the document
    doc_embedding = document_embeddings[idx].unsqueeze(0)
    elv = torch.tensor(element_values).unsqueeze(1).to(device).float()
    # add the doc embedding to the indices of the learned embeddings, scaled by the non-zero element values
    learned_embeddings[indices] += torch.matmul(elv, doc_embedding)



Epoch 0


1000it [00:01, 724.07it/s]


In [57]:
# epochs = 1
# for epoch in range(epochs):
#   print(f'Epoch {epoch}')
#   for idx, doc_vec in enumerate(tqdm(doc_term_matrix)):
#     # get the indices of the non-zero elements in the vector
#     indices = doc_vec.nonzero()[1]

#     # get the non-zero element counts in the vector
#     element_values = doc_vec[0, indices].toarray()[0]
    
#     # get the atomic vector for the document
#     doc_embedding = document_embeddings[idx]

#     # add the doc embedding to the indices of the learned embeddings
#     learned_embeddings[indices] += doc_embedding



### Normalisation

In [58]:
print(learned_embeddings[0])
# learned_embeddings = torchhd.hard_quantize(learned_embeddings) 
# hard quantize equally picks either 1 or -1 for all the vectors, we would like to do this randomly for each vector 
random_decision_vec = torch.randint(0, 2, learned_embeddings.shape[0], device=device)
positive = torch.tensor(1, device=device)
negative = torch.tensor(-1, device=device)
for idx in range(len(learned_embeddings)):
    # learned_embeddings[idx] = learned_embeddings[idx].clipping(1)
    learned_embeddings[idx] = torch.where(learned_embeddings[idx] > 0, positive, negative) if random_decision_vec[idx] == 1 else torch.where(learned_embeddings[idx] >= 0, positive, negative)
print(learned_embeddings[:3])

MAPTensor([[ 7.4100e+02,  8.9300e+02,  9.2300e+02,  7.7300e+02,  5.2700e+02,
             3.0300e+02,  1.9000e+01,  8.4900e+02, -2.8300e+02, -4.6300e+02,
             9.5000e+01,  6.3300e+02, -1.2490e+03,  3.4500e+02,  1.7700e+02,
            -1.8870e+03,  1.6770e+03, -1.6630e+03, -1.0790e+03,  6.1500e+02,
             5.8500e+02,  7.4700e+02, -1.0330e+03,  9.0100e+02, -3.3300e+02,
             4.3100e+02,  2.6700e+02, -1.0390e+03, -4.5500e+02,  5.5700e+02,
             6.1000e+01, -1.5370e+03,  2.7100e+02, -1.2100e+02, -7.8300e+02,
             4.7300e+02, -4.6100e+02, -1.1690e+03, -7.9900e+02, -4.3900e+02,
             1.6990e+03,  8.4900e+02,  3.2300e+02,  6.4900e+02,  2.3700e+02,
            -1.3300e+02,  1.0930e+03,  7.7100e+02,  1.5730e+03,  1.4470e+03,
             4.5700e+02,  3.0100e+02,  5.8300e+02, -1.8110e+03,  1.3910e+03,
            -1.7470e+03,  8.0900e+02, -5.9300e+02,  2.9100e+02,  1.3550e+03,
            -1.6970e+03,  1.2370e+03,  1.2450e+03,  1.8490e+03, -1.9100e+02,

In [59]:
# # are there empty embeddings?
# for idx in range(len(learned_embeddings)):
#     if torch.all(learned_embeddings[idx] == 0):
#         print(f'Empty embedding at index {idx}')

In [60]:
def plot_k_most_similar_words(reference_word, memory, k=10):
    vocabulary = vectorizer.vocabulary_

    word_idx = vocabulary[reference_word]
    word_vector = memory[word_idx]
    # get the k most similar words to the reference word
    similarities, similar_vector_idxs = top_k_vectors(word_vector, memory, k+1, vsa_type)
    # remove the reference word from the result
    similarities = similarities[1:]
    similar_vector_idxs = similar_vector_idxs[1:]
    
    similar_words = [vectorizer.get_feature_names_out()[idx] for idx in similar_vector_idxs]
    # bar plot using plotly express
    fig = px.bar(x=similar_words, y=similarities.cpu().numpy(), title=f"Most similar words to {reference_word}", labels={'x': 'Word', 'y': 'Similarity'})
    fig.update_xaxes(tickangle=-45)
    fig.show()

In [61]:
reference_word = "movie"

plot_k_most_similar_words(reference_word, learned_embeddings, k=15)

### similarity of random vocabulary word to other words in the vocabulary

In [62]:
reference_vector = learned_embeddings[vectorizer.vocabulary_[reference_word]]
similarities = sim_func(reference_vector, learned_embeddings)
similarities_cpu = similarities.cpu().numpy()
similarities_sorted = np.sort(similarities_cpu)[::-1]
# count amount of similarities above .3
similarities_above_threshold = similarities_cpu[similarities_cpu > .3]
print(f'Number of similarities above .3: {len(similarities_above_threshold)-1}')

Number of similarities above .3: 29


In [63]:
# plot similarity distribution using plotly express
fig = px.histogram(x=similarities.cpu().numpy(), title=f"Similarity distribution of word {reference_word} to all other words in vocabulary", labels={'x': 'Similarity', 'y': 'Frequency'})
fig.update_xaxes(range=[-.1, .5])
fig.show()

### Wordsim353 validation

In [67]:
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")
print(f"Number of word pairs: {len(wordsim353_loader)}")

wordsim_vsa_learned_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = learned_embeddings[word1_idx]
    word2_vector = learned_embeddings[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_vsa_learned_similarities.append((word1, word2, similarity.item(), float(score)))

print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_vsa_learned_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_vsa_learned_similarities], [score for _, _, _, score in wordsim_vsa_learned_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

Number of word pairs: 252


100%|██████████| 252/252 [00:00<00:00, 1147.06it/s]

First 10 similarities:
computer - keyboard      0.095 	 7.62
planet - galaxy          0.109 	 8.11
canyon - landscape       0.259 	 7.53
day - summer             0.216 	 3.94
day - dawn               0.216 	 7.53
country - citizen        0.061 	 7.31
planet - people          0.138 	 5.75
environment - ecology    0.135 	 8.81
money - bank             0.181 	 8.5
computer - software      0.095 	 8.5
Correlation between learned similarities and human scores: 0.19311432150889613





In [65]:
# wiki_text_dataset.get_word_frequency(reference_word, document_frequency=True)
print(f'Term frequency of {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].sum()}')
print(f'Amount of documents it appears in: {reference_word}: {doc_term_matrix[:, vectorizer.vocabulary_[reference_word]].count_nonzero()}')

Term frequency of movie: 393
Amount of documents it appears in: movie: 111


### Avg similarity vs frequency in documents

In [66]:
import pandas as pd

avg_similarities = []
random_idxs = np.random.choice(range(len(vectorizer.vocabulary_)), 500, replace=False)
df_avg_similarities = pd.DataFrame(columns=["word", "doc_freq", "avg_similarity"])

for word_idx in tqdm(random_idxs):
  word = vectorizer.get_feature_names_out()[word_idx]
  word_vector = learned_embeddings[word_idx]

  word_doc_freq = doc_term_matrix[:, word_idx].count_nonzero()
  # remove word vecor from learned embeddings
  # similarities = sim_func(word_vector, torch.cat((learned_embeddings[:word_idx], learned_embeddings[word_idx+1:])))
  similarities = sim_func(word_vector, learned_embeddings)
  avg_similarity = similarities.mean().item()
  avg_similarities.append((word_doc_freq, avg_similarity, word))

# Convert avg_similarities into a DataFrame
df_avg_similarities = pd.DataFrame(avg_similarities, columns=['Word Document Frequency', 'Average Similarity', 'Word'])


  4%|▍         | 20/500 [00:00<00:15, 31.12it/s]

 71%|███████   | 355/500 [00:08<00:03, 39.48it/s]


KeyboardInterrupt: 

In [None]:
# scatter plot using plotly express
fig = px.scatter(df_avg_similarities, x='Word Document Frequency', y='Average Similarity', title=f"Average similarity (over vocabulary) of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'}, hover_data=['Word'])
fig.show()

In [None]:
target_word = "test"
# print document frequency of target word
print(f"Document frequency of {target_word}: {doc_term_matrix[:, vectorizer.vocabulary_[target_word]].count_nonzero()}")

Document frequency of test: 115


## LSA

### Learning word embeddings using LSA

In [None]:
from sklearn.decomposition import TruncatedSVD

# tfidf_vectorizer = TfidfVectorizer(token_pattern=r'[\w<>]+', input='filename', min_df=min_word_count)
# doc_term_matrix = tfidf_vectorizer.fit_transform(input_files)

svd = TruncatedSVD(n_components=300, random_state=42)
lsa = svd.fit(doc_term_matrix)

learned_embeddings_lsa = torch.tensor(lsa.components_, device=device).T
print(learned_embeddings_lsa.shape)

torch.Size([18688, 300])


## Wordsim353 validation

In [None]:
wordsim_lsa_learned_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in vectorizer.vocabulary_ or word2 not in vectorizer.vocabulary_:
        continue
    word1_idx = vectorizer.vocabulary_[word1]
    word2_idx = vectorizer.vocabulary_[word2]
    word1_vector = learned_embeddings_lsa[word1_idx]
    word2_vector = learned_embeddings_lsa[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_lsa_learned_similarities.append((word1, word2, similarity.item(), float(score)))

print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_lsa_learned_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_lsa_learned_similarities], [score for _, _, _, score in wordsim_lsa_learned_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

100%|██████████| 252/252 [00:00<00:00, 850.12it/s]

First 10 similarities:
computer - keyboard      0.087 	 7.62
planet - galaxy          0.124 	 8.11
canyon - landscape       0.038 	 7.53
day - summer             0.005 	 3.94
day - dawn               0.032 	 7.53
country - citizen        -0.051 	 7.31
planet - people          -0.052 	 5.75
environment - ecology    -0.048 	 8.81
money - bank             0.045 	 8.5
computer - software      0.149 	 8.5
Correlation between learned similarities and human scores: 0.26964130059017455





### Avg similarity vs frequency in documents


In [None]:
# import pandas as pd

# avg_similarities = []
# random_idxs = np.random.choice(range(len(vectorizer.vocabulary_)), 1000, replace=False)
# df_avg_similarities = pd.DataFrame(columns=["word", "doc_freq", "avg_similarity"])

# for word_idx in tqdm(random_idxs):
#   word = vectorizer.get_feature_names_out()[word_idx]
#   word_vector = learned_embeddings_lsa[word_idx]

#   word_doc_freq = doc_term_matrix[:, word_idx].count_nonzero()
#   similarities = sim_func(word_vector, learned_embeddings_lsa)
#   avg_similarity = similarities.mean().item()
#   avg_similarities.append((word_doc_freq, avg_similarity, word))

# # Convert avg_similarities into a DataFrame
# df_avg_similarities = pd.DataFrame(avg_similarities, columns=['Word Document Frequency', 'Average Similarity', 'Word'])


In [None]:
# # scatter plot using plotly express
# # fig = px.scatter(x=[x for x, _ in avg_similarities], y=[y for _, y in avg_similarities], title=f"Average similarity of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'})
# fig = px.scatter(df_avg_similarities, x='Word Document Frequency', y='Average Similarity', title=f"Average similarity of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'}, hover_data=['Word'])
# fig.show()