### Get embeddings
We retrain word embeddings using the CBOW (continuous bag-of-words) architecture from word2vec.

First, tokenize the combined text8 and MS MARCO corpus.

In [23]:
from tokenizer import preprocess, create_lookup_tables
import pickle

# Read the combined text8 + MS MARCO corpus. The encoding is pinned so the
# decoded text does not depend on the platform's default locale.
with open('./data/temp/combined_text8_msmarco.txt', 'r', encoding='utf-8') as f:
    combined_text = f.read()

# Tokenize the text and persist the token list so later cells can reload it
# without repeating the preprocessing step.
combined_tokens = preprocess(combined_text)
with open('./data/temp/combined_corpus.pkl', 'wb') as f:
    pickle.dump(combined_tokens, f)

# Build the word <-> id lookup tables from the tokenized corpus.
vocab_to_int, int_to_vocab = create_lookup_tables(combined_tokens)

# Persist both lookup tables next to the corpus, one pickle file per table.
for table_path, table in (
    ('./data/temp/combined_vocab_to_int.pkl', vocab_to_int),
    ('./data/temp/combined_int_to_vocab.pkl', int_to_vocab),
):
    with open(table_path, 'wb') as f:
        pickle.dump(table, f)


Number of words in the text before filtering:  74281642
Number of words in the text after filtering:  73326867
Number of words in the vocabulary:  146489


Now that we have our new vocabulary from the combined text8 wiki data and ms-marco data, let's generate the embeddings using the CBOW architecture.

In [24]:
#print(corpus[:100])

# Generate the training data from the corpus
# The training data looks like a list of tuples,
# where each tuple contains a list of context words and the target word (not the IDs)


def generate_training_data(corpus, window=2):
    """Build CBOW (context, target) pairs with a symmetric sliding window.

    Every position that has ``window`` tokens on both sides becomes one
    training example: the ``window`` tokens before it followed by the
    ``window`` tokens after it form the context, and the token itself is
    the target. Tokens are kept as-is (words, not ids).

    Args:
        corpus: Sequence of tokens that supports slicing and ``+``
            concatenation of slices (e.g. a list of words).
        window: Number of context tokens taken on each side of the target.
            The default of 2 gives a 5-token sliding window.

    Returns:
        A list of ``(context_words, target_word)`` tuples. The list is empty
        when the corpus has fewer than ``2 * window + 1`` tokens.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    # Start `window` tokens in and stop `window` tokens before the end so
    # every target has a full context on both sides.
    return [
        (corpus[i - window:i] + corpus[i + 1:i + window + 1], corpus[i])
        for i in range(window, len(corpus) - window)
    ]

In [None]:
# Reload the tokenized corpus written by the tokenization cell and turn it
# into CBOW (context, target) pairs.
# Only unpickle files this notebook produced: pickle.load can execute
# arbitrary code from an untrusted file.
with open('./data/temp/combined_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

training_data = generate_training_data(corpus)
print("CBOW training data generated")



In [7]:
# Sanity check: display the last 30 (context_words, target_word) tuples so the
# sliding-window output of generate_training_data can be inspected by eye.
training_data[-30:]


[(['esp', 'magnetic', 'or', 'electric'], 'poles'),
 (['magnetic', 'poles', 'electric', 'charge'], 'or'),
 (['poles', 'or', 'charge', '<PERIOD>'], 'electric'),
 (['or', 'electric', '<PERIOD>', '3'], 'charge'),
 (['electric', 'charge', '3', '<PERIOD>'], '<PERIOD>'),
 (['charge', '<PERIOD>', '<PERIOD>', '<LEFT_PAREN>'], '3'),
 (['<PERIOD>', '3', '<LEFT_PAREN>', 'general'], '<PERIOD>'),
 (['3', '<PERIOD>', 'general', 'physics'], '<LEFT_PAREN>'),
 (['<PERIOD>', '<LEFT_PAREN>', 'physics', '<RIGHT_PAREN>'], 'general'),
 (['<LEFT_PAREN>', 'general', '<RIGHT_PAREN>', 'the'], 'physics'),
 (['general', 'physics', 'the', 'particular'], '<RIGHT_PAREN>'),
 (['physics', '<RIGHT_PAREN>', 'particular', 'state'], 'the'),
 (['<RIGHT_PAREN>', 'the', 'state', 'of'], 'particular'),
 (['the', 'particular', 'of', 'a'], 'state'),
 (['particular', 'state', 'a', 'part'], 'of'),
 (['state', 'of', 'part', 'of'], 'a'),
 (['of', 'a', 'of', 'a'], 'part'),
 (['a', 'part', 'a', 'body'], 'of'),
 (['part', 'of', 'body', 

### Train the CBOW model (word2vec) to obtain the embeddings

In [10]:
import datetime
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
import wandb
from torch.utils.data import DataLoader, Dataset, random_split



In [11]:
## Define the data input structure and the model
class CBOWDataset(Dataset):
    """Dataset of CBOW examples that converts words to ids on access.

    Each stored example is a ``(context_words, target_word)`` tuple of
    strings; ``word_to_id`` maps every word to its integer id.
    """

    def __init__(self, data: list[tuple[list[str], str]], word_to_id: dict[str, int]):
        self.word_to_id = word_to_id
        self.data = data

    def __len__(self):
        """Return the number of stored (context, target) examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as ``(context_ids, target_id)`` long tensors.

        ``context_ids`` is a 1-D tensor with one id per context word and
        ``target_id`` is a 0-D tensor. A word missing from ``word_to_id``
        raises ``KeyError``.
        """
        context_words, target_word = self.data[idx]
        lookup = self.word_to_id
        ids = [lookup[token] for token in context_words]
        context_ids = torch.tensor(ids, dtype=torch.long)
        target_id = torch.tensor(lookup[target_word], dtype=torch.long)
        return context_ids, target_id

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary scores."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the linear projection back to the vocabulary.

        Args:
            vocab_size: Number of rows in the embedding table and number of output scores.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each row of ``inputs``.

        ``inputs`` holds context word ids; their embeddings are averaged over
        dim 1, projected to vocabulary size and passed through ``log_softmax``
        along dim 1 (assumes a 2-D ``(batch, context)`` id tensor).
        """
        context_mean = self.embedding(inputs).mean(dim=1)
        logits = self.linear(context_mean)
        return F.log_softmax(logits, dim=1)


In [12]:
# Seed PyTorch's RNG so runs are repeatable.
torch.manual_seed(42)

# Prefer the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # Report which GPU is in use and how much memory it has.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Enable the cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

# Print the device that will be used.
print(f"Using device: {device}")


GPU: NVIDIA RTX A4000
GPU Memory: 15.82 GB
Using device: cuda


In [18]:
# Number of entries in the word -> id lookup table.
# NOTE(review): the tokenizer output earlier in this notebook reports 146489
# vocabulary words while this prints 146490 -- confirm which value (and which
# id range) the embedding table must cover.
print(len(vocab_to_int))

146490


In [19]:
# Timestamp used as the wandb run name and in checkpoint file names.
timestamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')

# nn.Embedding only accepts indices in [0, num_embeddings). Size the vocabulary
# from the largest id actually present in the lookup table instead of a
# hard-coded number, so an id can never fall outside the embedding table
# (an out-of-range id surfaces as a CUDA device-side assert on the GPU).
assert min(vocab_to_int.values()) >= 0, "word ids must be non-negative"
VOCAB_SIZE = max(vocab_to_int.values()) + 1

# Initialize wandb with the run configuration; this dict is the single source
# of truth for the hyper-parameters used by the following cells.
wandb.init(
    entity="evelyntants-personal",
    project="word2vec_embeddings",
    name=f"{timestamp}",
    config={
        # Model parameters
        "embedding_dim": 200,
        "vocab_size": VOCAB_SIZE,

        # Training parameters
        "batch_size": 1024,
        "learning_rate": 0.003,
        "num_epochs": 5,
        "train_split": 0.7,

        # Optimizer parameters
        "weight_decay": 1e-5,

        # DataLoader parameters
        "num_workers": 4,

        # Architecture details
        "model_type": "CBOW",
        "context_size": 4  # 2 words before + 2 words after
    }
)

# Convenience aliases for the config values.
EMBEDDING_DIM = wandb.config.embedding_dim
BATCH_SIZE = wandb.config.batch_size
LEARNING_RATE = wandb.config.learning_rate
NUM_EPOCHS = wandb.config.num_epochs
TRAIN_SPLIT = wandb.config.train_split

# Split into train/test using the logged ratio, so the split that is
# performed always matches the one recorded in the run config.
dataset = CBOWDataset(training_data, vocab_to_int)
train_size = int(TRAIN_SPLIT * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

0,1
batch_loss,▆█▇▇▅▅▅▅▅▅▄▄▅▃▃▅▂▄▂▄▂▃▂▄▃▁▂▂▃▃▂▃▂▁▁▂▁▂▂▃

0,1
batch_loss,6.47462


In [20]:

# Settings shared by both loaders: batch size and worker count come from the
# wandb config; pinned memory is enabled for faster host-to-GPU transfer.
loader_kwargs = dict(
    batch_size=wandb.config.batch_size,
    pin_memory=True,
    num_workers=wandb.config.num_workers,
)

# Training batches are reshuffled every epoch; test batches keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)



In [21]:
# Build the CBOW model from the logged hyper-parameters and move it to the
# selected device.
model = CBOW(
    vocab_size=wandb.config.vocab_size,
    embedding_dim=wandb.config.embedding_dim,
).to(device)

# CBOW.forward already returns log-probabilities (log_softmax), so the matching
# loss is NLLLoss. CrossEntropyLoss would apply a second log_softmax over the
# whole vocabulary on every batch; that yields the same loss value but wastes
# compute and hides the fact that the model outputs are not raw logits.
criterion = nn.NLLLoss()

# Adam with L2 regularization (weight decay) taken from the run config.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=wandb.config.learning_rate,
    weight_decay=wandb.config.weight_decay,
)



#Add evaluation function
def evaluate(model, test_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``test_loader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating.

    Args:
        model: Module called on each context batch.
        test_loader: Iterable of ``(context, target)`` tensor batches that
            also supports ``len()``.
        criterion: Callable ``criterion(output, target)`` returning a scalar
            loss tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``test_loader`` is empty.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for context, target in test_loader:
            predictions = model(context.to(device))
            batch_losses.append(criterion(predictions, target.to(device)).item())
    return sum(batch_losses) / len(test_loader)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [22]:
# Training loop: for every epoch, train on train_loader, evaluate on
# test_loader, log metrics to wandb, and checkpoint the model and embeddings.

# Make sure the checkpoint directory exists up front, so a missing folder
# cannot make torch.save fail only after a full epoch of training.
MODEL_DIR = "../model"
os.makedirs(MODEL_DIR, exist_ok=True)

for epoch in range(wandb.config.num_epochs):
    model.train()
    total_train_loss = 0
    progress = tqdm.tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=False)
    for inputs, targets in progress:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it; each .item() call copies the
        # value back from the device.
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Update the progress bar and wandb with the current batch loss
        progress.set_postfix(loss=batch_loss)
        wandb.log({'batch_loss': batch_loss})

    # Average training loss over the batches of this epoch
    avg_train_loss = total_train_loss / len(train_loader)

    # Evaluate on the test set with the shared helper (switches the model to
    # eval mode; model.train() above restores training mode next epoch).
    avg_test_loss = evaluate(model, test_loader, criterion, device)

    # Log epoch metrics
    wandb.log({
        'epoch': epoch + 1,
        'train_loss': avg_train_loss,
        'test_loss': avg_test_loss,
    })

    # Print epoch summary
    print(f'Epoch {epoch+1}/{wandb.config.num_epochs}: Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

    # Save a model checkpoint after every epoch and upload it as a wandb artifact
    checkpoint_path = f"{MODEL_DIR}/cbow_epoch{epoch+1}_{timestamp}.pt"
    torch.save(model.state_dict(), checkpoint_path)
    model_artifact = wandb.Artifact('model-weights', type='model')
    model_artifact.add_file(checkpoint_path)
    wandb.log_artifact(model_artifact)

    # Save the embedding matrix separately (detached from autograd, on the CPU)
    embedding_weights = model.embedding.weight.detach().cpu()
    embedding_path = f"{MODEL_DIR}/embeddings_epoch{epoch+1}_{timestamp}.pt"
    torch.save(embedding_weights, embedding_path)
    embedding_artifact = wandb.Artifact('embeddings', type='embeddings')
    embedding_artifact.add_file(embedding_path)
    wandb.log_artifact(embedding_artifact)

# Finish Wandb
wandb.finish()

                                                  

RuntimeError: Caught RuntimeError in pin memory thread for device 0.
Original Traceback (most recent call last):
  File "/root/MLX_Week2/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 41, in do_one_step
    data = pin_memory(data, device)
  File "/root/MLX_Week2/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 98, in pin_memory
    clone[i] = pin_memory(item, device)
  File "/root/MLX_Week2/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 64, in pin_memory
    return data.pin_memory(device)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

