In [17]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

# Check TensorFlow version
print(f"TensorFlow version: {tf.__version__}")


TensorFlow version: 2.10.0


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [4]:
# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")


PyTorch version: 2.3.1+cpu


## Loading the Data

In [5]:
corpus = ["apple banana fruit", "banana apple fruit", "banana fruit apple",
          "dog cat animal", "cat animal dog", "cat dog animal"]

In [6]:
#1. tokenization by each word
corpus = [sent.split(" ") for sent in corpus]
corpus

[['apple', 'banana', 'fruit'],
 ['banana', 'apple', 'fruit'],
 ['banana', 'fruit', 'apple'],
 ['dog', 'cat', 'animal'],
 ['cat', 'animal', 'dog'],
 ['cat', 'dog', 'animal']]

In [7]:
#2. numeralization
flatten = lambda l: [item for sublist in l for item in sublist] #function to concat lists
vocabs = list(set(flatten(corpus))) #find and store unique words

In [8]:
vocabs

['animal', 'fruit', 'dog', 'cat', 'banana', 'apple']

In [9]:
word2index = {val:index for index, val in enumerate(vocabs)} #create dict with words and their indexes

In [10]:
word2index

{'animal': 0, 'fruit': 1, 'dog': 2, 'cat': 3, 'banana': 4, 'apple': 5}

In [11]:
#The "<UNK>" token stands for "unknown" and is used to handle words that are not in our vocabulary
vocabs.append('<UNK>')
word2index['<UNK>'] = 6

In [12]:
#creating reverse dictionary to map indices back to words
index2word = {word:index for index, word in word2index.items()}
index2word[5]

'apple'

## Preparing data for training

In [32]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

   
    for doc in corpus:
        
        for i in range(1, len(doc)-1):
            
            center = word2index[doc[i]]
            
            outside = (word2index[doc[i-1]], word2index[doc[i+1]])
           
            for each_out in outside:
                skipgrams.append([center, each_out])
                
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

In [33]:
x, y = random_batch(2, corpus)

In [38]:
x

array([[1],
       [4]])

In [39]:
y

array([[5],
       [1]])

## Model

In [40]:
len(vocabs)

7

In [41]:
embedding = nn.Embedding(7, 2)

In [42]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape

torch.Size([2, 1, 2])