<a href="https://colab.research.google.com/github/SanjayBista1010/DeepLearning/blob/main/CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Toy corpus: three short, lowercase, whitespace-separated sentences.
# The vocabulary and the CBOW training pairs are both derived from this list.
sentences = [
    "i love cats",
    "i love dogs",
    "cats and dogs are friends"
]

In [3]:
# Flatten the corpus into a single token stream (whitespace tokenisation).
words = [token for sentence in sentences for token in sentence.split()]
print(words)

# The sorted unique tokens form the vocabulary; a word's position in that
# sorted list is its integer id.
vocab = sorted(set(words))
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))
vocab_size = len(vocab)
print(vocab_size)

['i', 'love', 'cats', 'i', 'love', 'dogs', 'cats', 'and', 'dogs', 'are', 'friends']
7


In [4]:
# Show the sorted vocabulary and the word -> index lookup.
# (The second label previously read "word2inx", a typo for the variable name.)
print("vocab:", vocab)
print("word2idx:", word2idx)

vocab: ['and', 'are', 'cats', 'dogs', 'friends', 'i', 'love']
word2inx: {'and': 0, 'are': 1, 'cats': 2, 'dogs': 3, 'friends': 4, 'i': 5, 'love': 6}


In [5]:
# Encode every sentence as the list of vocabulary ids of its tokens.
sequences = [
    [word2idx[token] for token in sentence.split()]
    for sentence in sentences
]
sequences

[[5, 6, 2], [5, 6, 3], [2, 0, 3, 1, 4]]

In [6]:
# Accumulators for the training pairs (two independent, initially empty lists).
contexts, targets = [], []
# Context radius: the extraction loop skips this many positions at each end
# of a sentence.
window = 1

In [7]:
# Build (context, target) pairs, keeping only positions that have `window`
# words available on both the left and the right.
#
# The accumulators are re-created here so that re-running this cell starts
# from scratch instead of appending duplicates (or failing with an
# AttributeError once `contexts` / `targets` have been rebound to tensors).
contexts, targets = [], []

for seq in sequences:
    for i in range(window, len(seq) - window):
        # `window` ids on each side of position i; for window == 1 this is
        # exactly [seq[i-1], seq[i+1]].
        context = seq[i - window:i] + seq[i + 1:i + window + 1]
        target = seq[i]
        contexts.append(context)
        targets.append(target)

In [8]:
# Display the collected context index lists (one entry per training sample).
contexts

[[5, 2], [5, 3], [2, 3], [0, 1], [3, 4]]

In [9]:
# Display the target word index collected for each context.
targets

[6, 6, 0, 3, 1]

In [10]:
# Rebind both collections as torch.long (int64) tensors, then print the
# contexts tensor followed by the targets tensor.
contexts = torch.tensor(contexts, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)
print(contexts, targets, sep="\n")

tensor([[5, 2],
        [5, 3],
        [2, 3],
        [0, 1],
        [3, 4]])
tensor([6, 6, 0, 3, 1])


In [11]:
class CBOWDataset(Dataset):
    """Map-style dataset pairing each context with its target word index.

    Args:
        contexts: Indexable, sized collection with one context per sample.
        targets: Indexable, sized collection with one target per sample,
            aligned position-by-position with ``contexts``.

    Raises:
        ValueError: If ``contexts`` and ``targets`` differ in length.
    """

    def __init__(self, contexts, targets):
        # Fail fast: since __len__ follows `targets`, a mismatch would
        # otherwise either ignore the surplus contexts silently or raise an
        # IndexError only when the missing context is first accessed.
        if len(contexts) != len(targets):
            raise ValueError(
                f"contexts and targets must have the same length, "
                f"got {len(contexts)} and {len(targets)}"
            )
        self.contexts = contexts
        self.targets = targets

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at position ``idx``."""
        return self.contexts[idx], self.targets[idx]

In [12]:
# Wrap the tensors in the dataset and serve them in shuffled mini-batches of two.
dataset = CBOWDataset(contexts=contexts, targets=targets)
loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=2,
)

In [13]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: mean context embedding -> vocabulary logits.

    Args:
        vocab_size: Number of embedding rows and of output logits.
        emb_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.fc = nn.Linear(in_features=emb_dim, out_features=vocab_size)

    def forward(self, x):
        """Return one row of vocabulary logits per context in ``x``.

        ``x`` holds word indices; the embeddings are averaged over dim 1
        (the context-position axis for the ``[batch, context_len]`` inputs
        used in this notebook) before the linear projection.
        """
        pooled = self.emb(x).mean(dim=1)
        return self.fc(pooled)

In [14]:
emb_dim = 10
# Seed PyTorch's global RNG before any weights are created so that a
# Restart & Run All reproduces the same initialisation. The DataLoader is
# built without its own generator, so its shuffle order is drawn from this
# same global RNG and becomes repeatable as well.
torch.manual_seed(42)
model = CBOW(vocab_size=vocab_size, emb_dim=emb_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [15]:
# Train for a fixed number of epochs; report the mean of the per-batch losses
# on the first epoch and on every 30th epoch.
epochs = 150
for epoch in range(1, epochs + 1):
    total_loss = 0.0
    for ctx, tgt in loader:
        optimizer.zero_grad()          # clear gradients left from the previous step
        loss = criterion(model(ctx), tgt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch == 1 or epoch % 30 == 0:
        avg_loss = total_loss / len(loader)
        print(f"Epoch {epoch:3d} - avg loss: {avg_loss:.4f}")

Epoch   1 - avg loss: 1.8932
Epoch  30 - avg loss: 0.1896
Epoch  60 - avg loss: 0.0368
Epoch  90 - avg loss: 0.0192
Epoch 120 - avg loss: 0.0104
Epoch 150 - avg loss: 0.0055


In [16]:
# Detach the embedding table from autograd and expose it as a NumPy array,
# then print the first five components of every word's vector.
embeddings = model.emb.weight.detach().numpy()
for word, idx in word2idx.items():
    print(word, embeddings[idx, :5])

and [ 3.3820736   0.5001095  -0.23621322 -1.3641393  -0.18905242]
are [ 1.0290234  -0.7453744   1.7554126   0.08215817 -0.9867991 ]
cats [-0.77449286  0.03931859  0.59852093  1.4365642   1.1196452 ]
dogs [-1.2772852  0.6949459 -0.7905663  1.4546074  2.192363 ]
friends [ 0.8086073   0.36950937 -2.0837982   1.0096343   0.11085237]
i [-0.4513455 -2.0338087 -1.3544607 -2.953002  -2.0117567]
love [-0.22716475 -1.6090178  -0.09160855  1.7066832   0.94668406]


In [17]:
def predict(context_words):
    """Return the vocabulary word the model scores highest for a context.

    Args:
        context_words: Iterable of words, each of which must be a key of the
            module-level ``word2idx`` mapping (a ``KeyError`` is raised
            otherwise).

    Returns:
        The word from ``idx2word`` whose logit is largest.
    """
    token_ids = [word2idx[word] for word in context_words]
    # Wrap in an outer list so the model receives a batch containing one context.
    batch = torch.tensor([token_ids], dtype=torch.long)
    with torch.no_grad():  # inference only; no gradient tracking needed
        scores = model(batch)
    best = torch.argmax(scores, dim=1).item()
    return idx2word[best]

In [18]:
# Spot-check the trained model on two hand-picked contexts.
examples = (
    ("Example prediction for context ['i', 'cats'] ->", ['i', 'cats']),
    ("Example prediction for context ['cats', 'are'] ->", ['cats', 'are']),
)
for label, example_context in examples:
    print(label, predict(example_context))

Example prediction for context ['i', 'cats'] -> love
Example prediction for context ['cats', 'are'] -> dogs
