In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

# Check TensorFlow version
print(f"TensorFlow version: {tf.__version__}")


TensorFlow version: 2.10.0


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [4]:
# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")


PyTorch version: 2.3.1+cpu


## Loading the Data

In [5]:
corpus = ["apple banana fruit", "banana apple fruit", "banana fruit apple",
          "dog cat animal", "cat animal dog", "cat dog animal"]

In [6]:
#1. tokenization by each word
corpus = [sent.split(" ") for sent in corpus]
corpus

[['apple', 'banana', 'fruit'],
 ['banana', 'apple', 'fruit'],
 ['banana', 'fruit', 'apple'],
 ['dog', 'cat', 'animal'],
 ['cat', 'animal', 'dog'],
 ['cat', 'dog', 'animal']]

In [7]:
#2. numeralization
flatten = lambda l: [item for sublist in l for item in sublist] #function to concat lists
vocabs = list(set(flatten(corpus))) #find and store unique words

In [8]:
vocabs

['fruit', 'animal', 'dog', 'banana', 'cat', 'apple']

In [9]:
word2index = {val:index for index, val in enumerate(vocabs)} #create dict with words and their indexes

In [10]:
word2index

{'fruit': 0, 'animal': 1, 'dog': 2, 'banana': 3, 'cat': 4, 'apple': 5}

In [11]:
#The "<UNK>" token stands for "unknown" and is used to handle words that are not in our vocabulary
vocabs.append('<UNK>')
word2index['<UNK>'] = 6

In [12]:
#creating reverse dictionary to map indices back to words
index2word = {word:index for index, word in word2index.items()}
index2word[5]

'apple'

## Preparing data for training

In [13]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

   
    for doc in corpus:
        
        for i in range(1, len(doc)-1):
            
            center = word2index[doc[i]]
            
            outside = (word2index[doc[i-1]], word2index[doc[i+1]])
           
            for each_out in outside:
                skipgrams.append([center, each_out])
                
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

In [14]:
x, y = random_batch(2, corpus)

In [15]:
x

array([[1],
       [1]])

In [16]:
y

array([[2],
       [4]])

## Model

In [17]:
len(vocabs)

7

In [18]:
embedding = nn.Embedding(7, 2)

In [19]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape

torch.Size([2, 1, 2])

In [20]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [21]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[0, 1, 2, 3, 4, 5, 6],
        [0, 1, 2, 3, 4, 5, 6]])

In [22]:
model = Skipgram(voc_size, 2)
model

Skipgram(
  (embedding_center): Embedding(7, 2)
  (embedding_outside): Embedding(7, 2)
)

In [23]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [24]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [25]:
loss

tensor(2.1866, grad_fn=<NegBackward0>)