In [6]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import sentencepiece as spm
import gensim
from gensim.models import Word2Vec
import json

In [26]:
# Build a small working corpus: keep lines start_index..end_index (both
# inclusive, 0-based) of corpus.txt and write them to subset_corpus.txt.
start_index = 0  # Index of the first sentence you want to include
end_index = 10000  # Index of the last sentence you want to include (inclusive)

# Stream the file instead of calling readlines(): the full corpus has tens of
# millions of lines, and only the selected window has to stay in memory.
# The whole file is still traversed once so the total line count can be
# reported. NOTE: the full list of lines (`all_lines`) is intentionally no
# longer materialised.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale.
num_lines = 0
subset_lines = []
with open("corpus.txt", "r", encoding="utf-8") as original_file:
    for line_number, line in enumerate(original_file):
        if start_index <= line_number <= end_index:
            # Lines keep their trailing newline, exactly as readlines() did.
            subset_lines.append(line)
        num_lines = line_number + 1

print(f"Total number of lines in the corpus.txt file: {num_lines}")

# Write the subset of lines to a new file (same encoding as the input).
with open("subset_corpus.txt", "w", encoding="utf-8") as subset_file:
    subset_file.writelines(subset_lines)


Total number of lines in the corpus.txt file: 22893436


In [29]:
# Train a SentencePiece model on the subset corpus. The trainer receives a
# single space-separated flag string; it is assembled here from one flag per
# line so each setting is easy to spot and tweak.
trainer_flags = " ".join(
    [
        "--input=subset_corpus.txt",
        "--model_prefix=small_m",
        "--vocab_size=3927",
        "--normalization_rule_name=nfkc_cf",
    ]
)
spm.SentencePieceTrainer.train(trainer_flags)

# Load the model file produced by the training step above. The load call is
# kept as the last expression so the cell displays its return value.
sp = spm.SentencePieceProcessor()
sp.load('small_m.model')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=subset_corpus.txt --model_prefix=small_m --vocab_size=3927 --normalization_rule_name=nfkc_cf
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: subset_corpus.txt
  input_format: 
  model_prefix: small_m
  model_type: UNIGRAM
  vocab_size: 3927
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_i

True

In [30]:
# Tokenize every line of the subset corpus. The corpus contains both the
# queries and the passages, so everything is tokenized in one go.
# Both dictionaries are keyed by the raw line exactly as stored in
# subset_lines; if the same line occurs more than once, it appears as a
# single key holding the result of its last occurrence.

# line -> list of SentencePiece pieces (string tokens)
pieces_per_line = sp.encode_as_pieces(subset_lines)
tokenized_sentences = dict(zip(subset_lines, pieces_per_line))

# line -> list of SentencePiece ids (integer tokens)
ids_per_line = sp.encode_as_ids(subset_lines)
tokenized_sentences_with_ids = dict(zip(subset_lines, ids_per_line))

In [31]:
# Collect every distinct piece that occurs in the tokenized corpus.
# The pieces are sorted so that `vocabulary` has the same order on every run:
# iterating a set of strings yields an arbitrary order that changes between
# kernel restarts, which makes any positional use of this list unreproducible.
vocabulary = sorted(
    {
        token
        for token_sent in tokenized_sentences.values()
        for token in token_sent
    }
)
print('vocabulary size: ', len(vocabulary))

vocabulary size:  3765


# Embeddings

In [32]:
# Train a Word2Vec model on the piece-level tokenization of the corpus.
# Each training "sentence" is the list of SentencePiece pieces of one line.
training_sentences = list(tokenized_sentences.values())
model = Word2Vec(
    training_sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every piece, however rare
    workers=4
    #sg=1, # 1 for Skip-Gram, default 0 for CBOW
    # Skip-Gram is more computationally costly; for big data CBOW might be as good!
)
# Save the trained model
model.save("small_word2vec.model")
embeddings = model.wv.vectors
embeds = model.wv
print(f"The length of the embeddings with wv.vectors is {len(embeddings)}")
print(f"The length of the embedding with wv is {len(embeds)}")

# Row i of `embeddings` belongs to the token embeds.index_to_key[i]; that is
# gensim's own ordering and has nothing to do with the order of any separately
# built vocabulary list. Pair tokens and rows through index_to_key so that
# every token is mapped to its own vector.
assert len(embeds.index_to_key) == len(embeddings)

# Create a dictionary mapping each token to its embedding
token_embeddings = dict(zip(embeds.index_to_key, embeddings))

# Same vectors keyed by their row index in `embeddings`.
# NOTE(review): these keys are gensim row indices, not SentencePiece ids —
# confirm which one downstream code expects.
token_embeddings_with_ids = {
    row_index: embeddings[row_index] for row_index in range(len(embeddings))
}

The length of the embeddings with wv.vectors is 3765
The length of the embedding with wv is 3765


In [38]:
# Restore the Word2Vec model that was saved to disk earlier.
model = Word2Vec.load("small_word2vec.model")

# Look up the vector of a single piece. The leading "▁" is part of the piece
# itself, so it must be included in the lookup key.
word = "▁happy"
embedding = model.wv.get_vector(word)

print(f"Embedding for '{word}': {embedding}")

Embedding for '▁happy': [-0.01102994 -0.19077699  0.46447647 -1.0125498  -0.374872   -0.630869
  0.6027745   0.9606834  -1.2407521  -1.4744995  -0.32542947 -0.9579944
  0.8536063   0.94020474 -0.14423439  0.21909064 -0.00725378 -0.5639201
 -0.61398417 -1.0377836   0.08764975  0.17330503  0.38137314 -0.3942622
  1.1649728  -0.1826953  -0.1610005  -0.43996832 -0.3622171   0.21537629
  1.2118291   0.87262595  0.46588743 -0.4666382   0.5205513   0.74417275
 -0.1511431  -0.41923487 -0.2680784  -0.5324718   0.5550724  -0.81215733
 -0.06027303  0.23636402 -0.02789747 -0.5857216   0.755192   -0.4271193
 -0.32232478  0.3431555   0.24147639  0.06752449 -0.43039113  0.05127047
  0.26393446 -0.29193097 -0.47953898  0.35544926 -0.9251122   0.08964888
  0.20621695  0.04197603  0.7394184  -0.18384862 -1.2065048  -0.53734195
 -0.679461   -0.21074373 -0.96567345  1.277809   -0.1685814   0.36402482
  0.6832084   0.5689435  -0.05702838  0.33325988  0.55867225 -0.20658119
 -0.45986596  0.6572193   0.22160

In [39]:
# Ask the model for the pieces closest to `word`; each result is a
# (piece, similarity score) pair.
similar_words = model.wv.most_similar(word)

# Report the neighbours, one per line, in the order the model returned them.
print(f"Similar words to '{word}':")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")

Similar words to '▁happy':
▁excited: 0.919988751411438
▁sad: 0.8788611888885498
▁so: 0.8725952506065369
▁much: 0.8707915544509888
▁scared: 0.8674418330192566
▁felt: 0.8576525449752808
▁relieved: 0.8546566963195801
▁proud: 0.8518763184547424
▁very: 0.8418503403663635
▁surprised: 0.8366981148719788
