In [36]:
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import nltk
import torch, torchhd
import torchtext.datasets as datasets
from torch.utils.data import DataLoader
nltk.download('punkt')

from utils import WikiTextDatasetPrev

[nltk_data] Downloading package punkt to /home/senn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
import sys
sys.path.append('..')
from shared_code.helpers import similarity_func_partial, top_k_vectors

In [38]:
device = "cuda" if torch.cuda.is_available() else "cpu"

## This preprocessing step needs to be done only once for each dataset

In [39]:
wiki_dataset = "wikitext-2-v1"
# wiki_dataset = "wikitext-103-v1"
data_path = './data/' + wiki_dataset + '/train-00000-of-00001.parquet'
out_dir = './data/preprocessed/' + wiki_dataset

wiki_text_dataset = WikiTextDatasetPrev(data_path, out_dir)

36718it [01:04, 565.32it/s]


### Load the preprocessed data

In [40]:
# load the preprocessed data in a term-document matrix using scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
count_vectorizer = CountVectorizer(token_pattern=r'[\w<>]+', input='filename')
# use file iterator over files in out_dir to avoid loading all data into memory
# input files are all txt files in out_dir
files = [file for file in os.listdir(out_dir) if file.endswith('.txt')]

print(len(files))

input_files = [out_dir + '/' + file for file in files]
print(input_files)
doc_term_matrix = count_vectorizer.fit_transform(input_files)

603
['./data/preprocessed/wikitext-2-v1/Back_to_Tennessee_(_song_).txt', './data/preprocessed/wikitext-2-v1/Rob_Howard.txt', './data/preprocessed/wikitext-2-v1/2011_–_12_Columbus_Blue_Jackets_season.txt', './data/preprocessed/wikitext-2-v1/Xenon.txt', './data/preprocessed/wikitext-2-v1/Texas_A_&_M_Singing_Cadets.txt', './data/preprocessed/wikitext-2-v1/Grammy_Award_for_Best_Concept_Music_Video.txt', './data/preprocessed/wikitext-2-v1/Jenova_Chen.txt', './data/preprocessed/wikitext-2-v1/Hoyt_Wilhelm.txt', './data/preprocessed/wikitext-2-v1/Chapter_1_(_House_of_Cards_).txt', './data/preprocessed/wikitext-2-v1/WASP_@-@_44.txt', './data/preprocessed/wikitext-2-v1/<unk>___<unk>_,_Pts.txt', './data/preprocessed/wikitext-2-v1/Isabella_Beeton.txt', './data/preprocessed/wikitext-2-v1/Hurricane_Lorenzo_(_2007_).txt', './data/preprocessed/wikitext-2-v1/Yoko_Shimomura.txt', './data/preprocessed/wikitext-2-v1/Stuart_McCall.txt', './data/preprocessed/wikitext-2-v1/Not_Quite_Hollywood_:_The_Wild_,_Un

In [41]:
print(doc_term_matrix.shape)

(603, 27068)


In [42]:
# print(f'Number of documents: {len(wiki_text_dataset)}')
# print(f'Title of first document: {wiki_text_dataset[0].title}')
# print(f'First 10 words of first document: {wiki_text_dataset[0].text[:10]}')
# print(f'Length of first document: {len(wiki_text_dataset[0].text)}')
# print(f'Number of elements in vocabulary of first document: {len(wiki_text_dataset[0].vocabulary)}')
print(f'Number of elements in the vocabulary: {len(wiki_text_dataset.vocabulary)}')

# print(f'Number of documents: {doc_term_matrix.shape[0]}')
# print(f'Vocab: {count_vectorizer.get_feature_names_out()[:10]}')
# print(f'Number of elements in the vocabulary: {len(count_vectorizer.vocabulary_)}')
# print(f'Lenght of the first document: {doc_term_matrix[0].sum()}')


Number of elements in the vocabulary: 27103


### Onto HD Vectors

In [43]:
DIMENSIONS = 10_000
vocab_size = len(wiki_text_dataset.vocabulary)
vsa_type= 'MAP'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))
sim_func = partial(similarity_func_partial, vsa_type)

Using cuda device


### Creating the document embeddings to learn from the data

In [64]:
# label_embeddings = torchhd.random(vocab_size, DIMENSIONS, device=device, vsa=vsa_type)
document_embeddings = torchhd.random(len(wiki_text_dataset), DIMENSIONS, device=device, vsa=vsa_type)
learned_embeddings = torchhd.empty(vocab_size, DIMENSIONS, device=device, vsa=vsa_type)

In [65]:
for doc_idx, document in enumerate(tqdm(wiki_text_dataset)):
    document_embedding = document_embeddings[doc_idx]
    for word in document.vocabulary.idx2word:
        word_idx = wiki_text_dataset.vocabulary.word2idx[word] # wrong
        # word_embedding = label_embeddings[word_idx]
        learned_embeddings[word_idx] = learned_embeddings[word_idx].bundle(document_embedding) # Bundle is not in-place

100%|██████████| 609/609 [00:34<00:00, 17.42it/s]


In [66]:
# normalize the learned embeddings
# for idx in range(len(learned_embeddings)):
#     learned_embeddings[idx] = learned_embeddings[idx].clipping(1)
learned_embeddings = torchhd.hard_quantize(learned_embeddings)
# print(learned_embeddings)

In [67]:
# are there empty embeddings?
for idx in range(len(learned_embeddings)):
    if torch.all(learned_embeddings[idx] == 0):
        print(f'Empty embedding at index {idx}')

In [68]:
def plot_k_most_similar_words(reference_word, memory, k=10, vocabulary=wiki_text_dataset.vocabulary):
    word_idx = vocabulary.word2idx[reference_word]
    word_vector = memory[word_idx]
    # get the k most similar words to the reference word, remove the reference word from the result
    similarities, similar_vector_idxs = top_k_vectors(word_vector, memory, k+1, vsa_type)
    # remove the reference word from the result
    similarities = similarities[1:]
    similar_vector_idxs = similar_vector_idxs[1:]
    
    similar_words = [vocabulary.idx2word[idx] for idx in similar_vector_idxs]
    # bar plot using plotly express
    fig = px.bar(x=similar_words, y=similarities.cpu().numpy(), title=f"Most similar words to {reference_word}", labels={'x': 'Word', 'y': 'Similarity'})
    fig.update_xaxes(tickangle=-45)
    fig.show()

In [69]:
reference_word = "movie"

plot_k_most_similar_words(reference_word, learned_embeddings, k=15)

### similarity of random vocabulary word to other words in the vocabulary

In [70]:
reference_vector = learned_embeddings[wiki_text_dataset.vocabulary.word2idx[reference_word]]
similarities = sim_func(reference_vector, learned_embeddings)
similarities_cpu = similarities.cpu().numpy()
similarities_sorted = np.sort(similarities_cpu)[::-1]
# count amount of similarities above .3
similarities_above_threshold = similarities_cpu[similarities_cpu > .3]
print(f'Number of similarities above .3: {len(similarities_above_threshold)-1}')

Number of similarities above .3: 3


In [71]:
# plot similarity distribution using plotly express
fig = px.histogram(x=similarities.cpu().numpy(), title=f"Similarity distribution of word {reference_word} to all other words in vocabulary", labels={'x': 'Similarity', 'y': 'Frequency'})
fig.update_xaxes(range=[-.1, .5])
fig.show()

### Wordsim353 validation

In [72]:
from utils import Wordsim353Loader
    
wordsim353_loader = Wordsim353Loader("../../data_external/wordsim353/wordsim_relatedness_goldstandard.txt")
print(f"Number of word pairs: {len(wordsim353_loader)}")
print(f"First word pair: {wordsim353_loader[0]}")

wordsim_learned_similarities = []

for word1, word2, score in tqdm(wordsim353_loader):
    if word1 not in wiki_text_dataset.vocabulary.word2idx or word2 not in wiki_text_dataset.vocabulary.word2idx:
        continue
    word1_idx = wiki_text_dataset.vocabulary.word2idx[word1]
    word2_idx = wiki_text_dataset.vocabulary.word2idx[word2]
    word1_vector = learned_embeddings[word1_idx]
    word2_vector = learned_embeddings[word2_idx]
    similarity = sim_func(word1_vector, word2_vector)
    wordsim_learned_similarities.append((word1, word2, similarity.item(), float(score)))

print(f"First 10 similarities:")
for word1, word2, sim, score in wordsim_learned_similarities[:10]:
    tab_size = word1 + word2
    tab_size = 20 - len(tab_size)
    print(f"{word1} - {word2} {tab_size * ' '} {round(sim, 3)} \t {round(score, 3)}")

corr = np.corrcoef([sim for _, _, sim, _ in wordsim_learned_similarities], [score for _, _, _, score in wordsim_learned_similarities])
print(f"Correlation between learned similarities and human scores: {corr[0, 1]}")

Number of word pairs: 252
First word pair: ['computer', 'keyboard', '7.62']


100%|██████████| 252/252 [00:00<00:00, 1626.94it/s]

First 10 similarities:
computer - keyboard      0.034 	 7.62
planet - galaxy          0.108 	 8.11
canyon - landscape       0.148 	 7.53
day - summer             0.285 	 3.94
day - dawn               0.157 	 7.53
country - citizen        0.101 	 7.31
planet - people          0.084 	 5.75
environment - ecology    0.088 	 8.81
money - bank             0.143 	 8.5
computer - software      0.32 	 8.5
Correlation between learned similarities and human scores: 0.25971405480116705





In [73]:
wiki_text_dataset.get_word_frequency(reference_word, document_frequency=True)

63

### Avg similarity vs frequency in documents

In [74]:
import pandas as pd

avg_similarities = []
random_idxs = np.random.choice(range(len(wiki_text_dataset.vocabulary)), size=1000, replace=False)
df_avg_similarities = pd.DataFrame(columns=["word", "doc_freq", "avg_similarity"])

for word_idx in tqdm(random_idxs):
  word = wiki_text_dataset.vocabulary.idx2word[word_idx]
  word_vector = learned_embeddings[word_idx]

  word_doc_freq = wiki_text_dataset.get_word_frequency(word, document_frequency=True)
  similarities = sim_func(word_vector, learned_embeddings)
  avg_similarity = similarities.mean().item()
  avg_similarities.append((word_doc_freq, avg_similarity, word))

# Convert avg_similarities into a DataFrame
df_avg_similarities = pd.DataFrame(avg_similarities, columns=['Word Document Frequency', 'Average Similarity', 'Word'])


100%|██████████| 1000/1000 [00:36<00:00, 27.45it/s]


In [75]:
# scatter plot using plotly express
# fig = px.scatter(x=[x for x, _ in avg_similarities], y=[y for _, y in avg_similarities], title=f"Average similarity of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'})
fig = px.scatter(df_avg_similarities, x='Word Document Frequency', y='Average Similarity', title=f"Average similarity of 1000 vocabulary words vs the word document frequency", labels={'x': 'Word document frequency', 'y': 'Average similarity to all other learned embeddings'}, hover_data=['Word'])
fig.show()

In [76]:
target_word = 'coordinates'
# print document frequency of target word
print(f"Document frequency of word {target_word}: {wiki_text_dataset.get_word_frequency(target_word, document_frequency=True)}")

Document frequency of word coordinates: 4


## LSA

In [77]:
# vocab_idx_mapping = {word: idx for idx, word in enumerate(wiki_text_dataset.vocabulary.idx2word)}
# also match < and >
count_vectorizer = CountVectorizer(vocabulary=wiki_text_dataset.vocabulary.idx2word, token_pattern=r'[\w<>]+')
doc_term_matrix = count_vectorizer.fit_transform([' '.join(doc.text) for doc in wiki_text_dataset])

In [78]:
print(wiki_text_dataset.vocabulary.idx2word[:10])
# 3 first 10 keys of the vocabulary
print(count_vectorizer.get_feature_names_out()[:10])
print(doc_term_matrix.shape)
# print(doc_term_matrix[:10, :10].todense())

['senjō', 'valkyria', '<num>', 'chronicles', 'japanese', 'lit', 'battlefield', 'commonly', 'referred', 'iii']
['senjō' 'valkyria' '<num>' 'chronicles' 'japanese' 'lit' 'battlefield'
 'commonly' 'referred' 'iii']
(609, 27103)


In [79]:
# get the word frequency of the reference word
reference_word = "times"
reference_word_idx = wiki_text_dataset.vocabulary.word2idx[reference_word]
word_doc_freq = doc_term_matrix[:, reference_word_idx].sum()
print(f"Term frequency of the word {reference_word}: {word_doc_freq}")

first_doc_length = len(wiki_text_dataset[0].text)
print(f"Length of first document: {first_doc_length}")
print(f"Number of words in the first document: {len(wiki_text_dataset[0].vocabulary)}")
print(f"Number of words according to the count vectorizer: {doc_term_matrix[-1].sum()}")
print(f"Number of documents the word appears in according to the count vectorizer, use the non-zero elements of the column: {doc_term_matrix[-1].count_nonzero()}")

Term frequency of the word times: 721
Length of first document: 1842
Number of words in the first document: 862
Number of words according to the count vectorizer: 556
Number of documents the word appears in according to the count vectorizer, use the non-zero elements of the column: 232


### Learning word embeddings using LSA

In [80]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=42)
lsa = svd.fit(doc_term_matrix)

learned_embeddings_lsa = torch.tensor(lsa.components_, device=device)
print(learned_embeddings_lsa.shape)

torch.Size([50, 27103])
