In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import sentencepiece as spm
import gensim
from gensim.models import Word2Vec
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Range of sentences (0-based line numbers in corpus.txt) to copy into the
# smaller training corpus. Both bounds are inclusive and are expected to be
# non-negative.
start_index = 0  # Index of the first sentence you want to include
end_index = 10000  # Index of the last sentence you want to include

# Stream the corpus one line at a time instead of materialising every line
# with readlines(): only the requested window is kept in memory, while the
# total line count is still obtained in the same pass.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding.
subset_lines = []
num_lines = 0
with open("corpus.txt", "r", encoding="utf-8") as original_file:
    for line_number, line in enumerate(original_file):
        if start_index <= line_number <= end_index:
            subset_lines.append(line)  # the trailing newline is kept
        num_lines = line_number + 1

print(f"Total number of lines in the corpus.txt file: {num_lines}")

# Write the selected lines, unchanged, to the file used for tokenizer training.
with open("subset_corpus.txt", "w", encoding="utf-8") as subset_file:
    subset_file.writelines(subset_lines)


Total number of lines in the corpus.txt file: 22893436


In [3]:
# Train a SentencePiece model on the subset corpus. The flags are listed one
# per line and joined into the single command string the trainer receives.
trainer_flags = ' '.join([
    '--input=subset_corpus.txt',
    '--model_prefix=small_m',
    '--vocab_size=3927',
    '--normalization_rule_name=nfkc_cf',
])
spm.SentencePieceTrainer.train(trainer_flags)

# Load the freshly trained model; the call is left as the cell's last
# expression so its result is displayed.
sp = spm.SentencePieceProcessor()
sp.load('small_m.model')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=subset_corpus.txt --model_prefix=small_m --vocab_size=3927 --normalization_rule_name=nfkc_cf
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: subset_corpus.txt
  input_format: 
  model_prefix: small_m
  model_type: UNIGRAM
  vocab_size: 3927
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_i

True

In [4]:
# Map each sentence to its SentencePiece pieces.
# Because the sentence text is the dictionary key, a sentence that occurs more
# than once in subset_lines ends up with a single entry.
piece_sequences = sp.encode_as_pieces(subset_lines)
tokenized_sentences = dict(zip(subset_lines, piece_sequences))

# Same mapping, but with the pieces expressed as integer ids.
id_sequences = sp.encode_as_ids(subset_lines)
tokenized_sentences_with_ids = dict(zip(subset_lines, id_sequences))

In [5]:
# Collect every distinct piece that occurs in the tokenized sentences.
# The pieces are sorted so the list order (and therefore any index derived
# from it with enumerate) is identical on every kernel run; iterating a set of
# strings directly yields an order that can change between interpreter
# sessions because of string hash randomization.
vocabulary = sorted(
    {
        token
        for token_sent in tokenized_sentences.values()
        for token in token_sent
    }
)
print('vocabulary size: ', len(vocabulary))

vocabulary size:  3765


# Embeddings

In [6]:
# Train a Word2Vec model on the SentencePiece token sequences.
model = Word2Vec(
    tokenized_sentences.values(),
    vector_size=100,
    window=5,
    min_count=1,  # no frequency cut-off: tokens seen only once are kept
    workers=4,
    # sg=1,  # 1 for Skip-Gram; the default 0 selects CBOW
    # Skip-Gram is more computationally costly; for big data CBOW might be as good.
)
# Save the trained model
model.save("small_word2vec.model")
embeddings = model.wv.vectors
embeds = model.wv
print(f"The length of the embeddings with wv.vectors is {len(embeddings)}")
print(f"The length of the embedding with wv is {len(embeds)}")

# Create a dictionary mapping each token to its embedding.
# Vectors are looked up by token rather than by position: the rows of
# model.wv.vectors follow gensim's internal key order, which is unrelated to
# the order of `vocabulary`, so pairing embeddings[i] with vocabulary[i] would
# attach the wrong vector to most tokens.
token_embeddings = {token: model.wv[token] for token in vocabulary}

# Same embeddings keyed by each token's position in `vocabulary`.
# NOTE(review): these ids are positions in `vocabulary`, not SentencePiece
# ids - confirm which id space downstream code expects.
token_embeddings_with_ids = {
    i: model.wv[token] for i, token in enumerate(vocabulary)
}

The length of the embeddings with wv.vectors is 3765
The length of the embedding with wv is 3765


In [7]:
# Read the Word2Vec model back from the file it was saved to.
model = Word2Vec.load("small_word2vec.model")

# Piece to inspect, and its embedding vector from the model's keyed vectors.
word = "▁happy"
word_vectors = model.wv
embedding = word_vectors[word]

print(f"Embedding for '{word}': {embedding}")

Embedding for '▁happy': [-0.18930872 -0.3658714   0.23972197 -0.6109619  -0.8768496  -0.13622326
  0.10373501 -0.13753134 -0.80557567 -0.6762138   0.07669332 -0.06175397
  0.39594507  0.63344616 -0.33380193  0.58113176 -0.20738696 -0.39435822
 -0.422448   -0.47013718  0.04413772  0.41993368  0.11135497 -0.4304177
  1.039249    0.1567777  -0.17144758 -0.07162035 -0.1553094  -0.24645993
  0.41265914  0.9227355   0.3565301  -0.55990106  0.51909155  0.83321744
 -0.31594372 -0.44052303 -0.26385888 -0.41355833  0.74404526 -1.0657189
 -0.03619651  0.0628233  -0.20251249 -0.64448905  0.56800115 -0.3275801
 -0.48824784  0.473413   -0.01188486  0.15580404  0.05779416  0.06676568
  0.49295136  0.0134197  -0.1255605   0.66052115 -0.42865375  0.14555936
  0.2059935  -0.21149616  0.6078993  -0.04945381 -1.441568   -0.39993483
 -0.23215418  0.28482088 -0.99521863  1.6616209   0.14355545  0.12992221
  0.9227397   0.33335307  0.09674026  0.41543236  0.48518592 -0.35784376
 -0.6252508   1.0908644  -0.28

In [8]:
# Ask the model for the pieces most similar to `word`; each result is a
# (piece, similarity score) pair.
similar_words = model.wv.most_similar(word)

# Header line, then one "piece: score" line per result.
print(f"Similar words to '{word}':")
for neighbor, score in similar_words:
    print(f"{neighbor}: {score}")

Similar words to '▁happy':
▁excited: 0.9461204409599304
▁sad: 0.8972897529602051
▁curious: 0.8888179659843445
▁proud: 0.873695969581604
▁scared: 0.8688650727272034
▁ashamed: 0.8582441210746765
▁much: 0.8475256562232971
▁very: 0.8392248749732971
▁so: 0.8357988595962524
▁lonely: 0.8294639587402344
