# Embeddings Demo using PyTorch

## Word2Vec

### Import Necessary Libraries

In [1]:
#input
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Prepare Corpus and **Vocabulary**

In [2]:
# Step 1: Prepare a simple corpus(sentence)
corpus = "He is the king . The king is royal. She is the royal queen . He is a prince, and she is a princess ."

# Step 2: Tokenize and build vocabulary
tokens = corpus.lower().split()  # Tokenize and lower-case words
vocab = set(tokens)  # Create a set of unique words
word2idx = {word: idx for idx, word in enumerate(vocab)}  # Create word to index mapping
idx2word = {idx: word for word, idx in word2idx.items()}  # Create index to word mapping
vocab_size = len(vocab)  # Calculate the size of vocabulary

In [7]:
print("Tokens: ", tokens)
print("Vocabulary: ", vocab)
print("Word to index mapping: ", word2idx)
print("Index to word mapping: ", idx2word)
print("Vocabulary size: ", vocab_size)
print(len(tokens))

Tokens:  ['he', 'is', 'the', 'king', '.', 'the', 'king', 'is', 'royal.', 'she', 'is', 'the', 'royal', 'queen', '.', 'he', 'is', 'a', 'prince,', 'and', 'she', 'is', 'a', 'princess', '.']
Vocabulary:  {'a', 'and', 'queen', 'prince,', 'she', '.', 'king', 'he', 'the', 'royal.', 'royal', 'is', 'princess'}
Word to index mapping:  {'a': 0, 'and': 1, 'queen': 2, 'prince,': 3, 'she': 4, '.': 5, 'king': 6, 'he': 7, 'the': 8, 'royal.': 9, 'royal': 10, 'is': 11, 'princess': 12}
Index to word mapping:  {0: 'a', 1: 'and', 2: 'queen', 3: 'prince,', 4: 'she', 5: '.', 6: 'king', 7: 'he', 8: 'the', 9: 'royal.', 10: 'royal', 11: 'is', 12: 'princess'}
Vocabulary size:  13
25


### Create Dataset

In [8]:
context_window = 2  # Define the size of context window
data = []  # Initialize empty list to hold data
# creates data that has 4 context words with 1 target word 
# Loop through each token and extract its context and target word
for i in range(context_window, len(tokens) - context_window):
    context = [tokens[i - t] for t in range(context_window, 0, -1)] + [tokens[i + t] for t in range(1, context_window + 1)]

    target = tokens[i] 
    data.append((context, target))  # Append the context and target word as a tuple to the data list

print(data)
print(len(data))

[(['he', 'is', 'king', '.'], 'the'), (['is', 'the', '.', 'the'], 'king'), (['the', 'king', 'the', 'king'], '.'), (['king', '.', 'king', 'is'], 'the'), (['.', 'the', 'is', 'royal.'], 'king'), (['the', 'king', 'royal.', 'she'], 'is'), (['king', 'is', 'she', 'is'], 'royal.'), (['is', 'royal.', 'is', 'the'], 'she'), (['royal.', 'she', 'the', 'royal'], 'is'), (['she', 'is', 'royal', 'queen'], 'the'), (['is', 'the', 'queen', '.'], 'royal'), (['the', 'royal', '.', 'he'], 'queen'), (['royal', 'queen', 'he', 'is'], '.'), (['queen', '.', 'is', 'a'], 'he'), (['.', 'he', 'a', 'prince,'], 'is'), (['he', 'is', 'prince,', 'and'], 'a'), (['is', 'a', 'and', 'she'], 'prince,'), (['a', 'prince,', 'she', 'is'], 'and'), (['prince,', 'and', 'is', 'a'], 'she'), (['and', 'she', 'a', 'princess'], 'is'), (['she', 'is', 'princess', '.'], 'a')]
21


### Define Skip-gram Model

In [9]:
# Step 4: Define the Skip-gram model architecture
#skipgram gives probabilities of closest contedxt words
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_embed = nn.Embedding(vocab_size, embed_dim)
        self.vocab_size = vocab_size
    #creates input and output embedding layers
        
    def forward(self, target, context):
        in_embeds = self.in_embed(target)
        out_embeds = self.out_embed(context)
        scores = torch.matmul(out_embeds, in_embeds.t())
        return scores.squeeze()
    #takes input words converts to corresponding word vectors, takes context words converts to vectors
    #squeeze gets rid of extra stuff

### Initialize Model and Train

In [10]:
# Step 5: Initialize the model, loss, and optimizer
embed_dim = 50
model = SkipGram(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


In [11]:
print(model)

SkipGram(
  (in_embed): Embedding(13, 50)
  (out_embed): Embedding(13, 50)
)


In [20]:
# Step 6: Training loop
#goes through dataset for epochs, computes loss, backpropagates, updates parameters of the tensors, prints loss
epochs = 10
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        print(context, target)
        context_idx = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word2idx[target]], dtype=torch.long)
        
        optimizer.zero_grad()
        scores = model(target_idx, context_idx)
        # change scores to be a float tensor

        context_idx = context_idx.float()
        loss = criterion(scores, context_idx)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data):.4f}')

['he', 'is', 'king', '.'] the
['is', 'the', '.', 'the'] king
['the', 'king', 'the', 'king'] .
['king', '.', 'king', 'is'] the
['.', 'the', 'is', 'royal.'] king
['the', 'king', 'royal.', 'she'] is
['king', 'is', 'she', 'is'] royal.
['is', 'royal.', 'is', 'the'] she
['royal.', 'she', 'the', 'royal'] is
['she', 'is', 'royal', 'queen'] the
['is', 'the', 'queen', '.'] royal
['the', 'royal', '.', 'he'] queen
['royal', 'queen', 'he', 'is'] .
['queen', '.', 'is', 'a'] he
['.', 'he', 'a', 'prince,'] is
['he', 'is', 'prince,', 'and'] a
['is', 'a', 'and', 'she'] prince,
['a', 'prince,', 'she', 'is'] and
['prince,', 'and', 'is', 'a'] she
['and', 'she', 'a', 'princess'] is
['she', 'is', 'princess', '.'] a
['he', 'is', 'king', '.'] the
['is', 'the', '.', 'the'] king
['the', 'king', 'the', 'king'] .
['king', '.', 'king', 'is'] the
['.', 'the', 'is', 'royal.'] king
['the', 'king', 'royal.', 'she'] is
['king', 'is', 'she', 'is'] royal.
['is', 'royal.', 'is', 'the'] she
['royal.', 'she', 'the', 'royal']

### Evaluate Model

In [25]:
# Step 7: Evaluation
def most_similar(word, word2idx, idx2word, embedding_matrix, topk=5):
    word_embedding = embedding_matrix[word2idx[word]]
    similarities = cosine_similarity([word_embedding], embedding_matrix)[0]
    print([tup for tup in zip(idx2word.values(), similarities)])
    similar_words = [(idx2word[idx], similarities[idx]) for idx in np.argsort(similarities, axis=-1)[-topk-1:-1][::-1]]
    return similar_words

In [31]:
embedding_matrix = model.in_embed.weight.data.numpy()
# print(embedding_matrix) # Debug line
most_similar_words = most_similar('king', word2idx, idx2word, embedding_matrix, topk=5)
print(most_similar_words)

[('a', 0.21617997), ('and', -0.34440866), ('queen', 0.20008719), ('prince,', -0.13608588), ('she', -0.038912423), ('.', 0.056709126), ('king', 1.0000001), ('he', -0.091720924), ('the', 0.19127563), ('royal.', -0.17081617), ('royal', -0.024660546), ('is', 0.09099622), ('princess', 0.0979481)]
[('a', 0.21617997), ('queen', 0.20008719), ('the', 0.19127563), ('princess', 0.0979481), ('is', 0.09099622)]


In [32]:
# Initialize a tensor to store the embeddings
embedding_matrix = torch.zeros((vocab_size, embed_dim))
# Fill the tensor with the Word2Vec embeddings
for i, word in enumerate(idx2word.values()):
    embedding_matrix[i] = torch.tensor(model[word])

# Create an nn.Embedding layer and load the pre-trained embeddings
embedding_layer = torch.nn.Embedding.from_pretrained(embedding_matrix) 

TypeError: 'SkipGram' object is not subscriptable

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

## Using word2vec for Text Classification

### Installing Required Libraries

In [None]:
# !pip install gensim torch

### Loading Pre-trained Word2Vec Embeddings

TODO: make sure to download the embeddings file from [kaggle](https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300) and place it in the same directory as this notebook

In [None]:
from gensim.models import KeyedVectors

# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format('path/to/GoogleNews-vectors-negative300.bin', binary=True)

### Converting Embeddings to PyTorch Tensors


In [None]:
vocab_size = len(model.index_to_key)
embedding_dim = model.vector_size

# Initialize a tensor to store the embeddings
embedding_matrix = torch.zeros((vocab_size, embedding_dim))

# Fill the tensor with the Word2Vec embeddings
for i, word in enumerate(model.index_to_key):
    embedding_matrix[i] = torch.tensor(model[word])

# Create an nn.Embedding layer and load the pre-trained embeddings
embedding_layer = torch.nn.Embedding.from_pretrained(embedding_matrix)

### Building a Simple Text Classification Model

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

### Training the Model

In [None]:
# Training code here

### Evaluating the Model

In [None]:
# Evaluation code here

## GloVe

### Loading Pre-trained GloVe Embeddings

In [None]:
from torchtext.vocab import GloVe
# Load GloVe vectors using torchtext
glove = GloVe(name='6B', dim=100)

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

### Converting Embeddings to PyTorch Tensors

In [None]:
# Fetch the indices for the words in your vocabulary
word_indices = [glove.stoi[word] for word in ['hello', 'world']]

# Create a tensor with the GloVe embeddings
embedding_tensor = torch.stack([glove.vectors[i] for i in word_indices])

NameError: name 'glove' is not defined

### Building a Text Classification Model


In [None]:
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_tensor)
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.fc2 = nn.Linear(128, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

### Training the Model

In [None]:
# Training code here

### Evaluating the Model

In [None]:
# Evaluation code here