In [1]:
# Word2Vec Implementation

# Import necessary libraries

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:

# Define the Word2Vec model class

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target_word, context_word):
        target_embed = self.in_embed(target_word)
        context_embed = self.out_embed(context_word)
        return target_embed, context_embed

In [5]:
# Define the training function

def train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate):
    # Preprocess the corpus and build the vocabulary
    tokenized_corpus = word_tokenize(corpus.lower())
    vocabulary = list(set(tokenized_corpus))
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    idx_to_word = {i: word for i, word in enumerate(vocabulary)}
    vocab_size = len(vocabulary)
    # Create the target-context word pairs
    training_pairs = []
    for i in range(len(tokenized_corpus)):
        target_word = tokenized_corpus[i]
        for j in range(i - window_size, i + window_size + 1):
            if j != i and j >= 0 and j < len(tokenized_corpus):
                context_word = tokenized_corpus[j]
                training_pairs.append((word_to_idx[target_word], word_to_idx[context_word]))
    # Initialize the Word2Vec model
    model = Word2Vec(vocab_size, embedding_dim)
    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0.0
        for target_word, context_word in training_pairs:
            # Zero the gradients
            optimizer.zero_grad()
            # Forward pass
            target_embed, context_embed = model(torch.LongTensor([target_word]), torch.LongTensor([context_word]))
            # Compute the loss
            loss = criterion(target_embed, context_embed)
            # Backward pass
            loss.backward()
            # Update the model parameters
            optimizer.step()
            # Accumulate the loss
            total_loss += loss.item()
        # Print the average loss for the epoch
        avg_loss = total_loss / len(training_pairs)
        print(f"Epoch: {epoch+1}, Average Loss: {avg_loss}")
    # Return the trained Word2Vec model
    return model

In [13]:
# Define the main function

def main():
    # Set hyperparameters
    corpus = "I love to learn deep learning. It is fascinating!"
    window_size = 3
    embedding_dim = 10
    num_epochs = 50
    learning_rate = 0.001
    # Load and preprocess the corpus
    tokenized_corpus = word_tokenize(corpus.lower())
    vocabulary = list(set(tokenized_corpus))
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    idx_to_word = {i: word for i, word in enumerate(vocabulary)}
    # Train the Word2Vec model
    model = train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate)
    # Evaluate the trained model using word similarity or analogy tasks
    word_pairs = [("deep", "learning")]
    for word1, word2 in word_pairs:
        idx1 = word_to_idx[word1]
        idx2 = word_to_idx[word2]
        embed1 = model.in_embed(torch.LongTensor([idx1])).detach().numpy()
        embed2 = model.in_embed(torch.LongTensor([idx2])).detach().numpy()
        similarity = cosine_similarity(embed1, embed2)[0][0]
        print(f"Similarity between '{word1}' and '{word2}': {similarity}")
    # Print the learned word embeddings
    embeddings = []
    for i in range(len(vocabulary)):
        word = idx_to_word[i]
        embed = model.in_embed(torch.LongTensor([i])).detach().numpy()
        embeddings.append((word, embed))
    for word, embed in embeddings:
        print(f"Word: {word}, Embedding: {embed}")
    # Save the trained model
    torch.save(model.state_dict(), "word2vec_model.pth")
# Run the main function
if __name__ == "__main__":
    main()

Epoch: 1, Average Loss: 1.9363663682231196
Epoch: 2, Average Loss: 1.9312489242465407
Epoch: 3, Average Loss: 1.9261469609207578
Epoch: 4, Average Loss: 1.9210604887317728
Epoch: 5, Average Loss: 1.915989280850799
Epoch: 6, Average Loss: 1.9109334592466
Epoch: 7, Average Loss: 1.9058927683918565
Epoch: 8, Average Loss: 1.9008674025535583
Epoch: 9, Average Loss: 1.8958572138238836
Epoch: 10, Average Loss: 1.8908620057282624
Epoch: 11, Average Loss: 1.885881965359052
Epoch: 12, Average Loss: 1.880916859816622
Epoch: 13, Average Loss: 1.8759667349082452
Epoch: 14, Average Loss: 1.8710314846701093
Epoch: 15, Average Loss: 1.8661111091022138
Epoch: 16, Average Loss: 1.8612055910958185
Epoch: 17, Average Loss: 1.8563147689457293
Epoch: 18, Average Loss: 1.851438738129757
Epoch: 19, Average Loss: 1.8465772651963763
Epoch: 20, Average Loss: 1.8417304820484586
Epoch: 21, Average Loss: 1.8368982600945014
Epoch: 22, Average Loss: 1.832080508823748
Epoch: 23, Average Loss: 1.8272772243729345
Epoch