# Get Word Embeddings

### Importing Libs

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Toy corpus: ten short movie-review sentences of varying length.
# The rest of the notebook tokenizes, indexes, pads, and embeds these.
sentences = [
    "This movie was amazing and inspiring",
    "I hated the film, it was boring",
    "The plot was good but the acting was terrible",
    "Great music and excellent direction",
    "Worst experience ever, do not recommend",
    "A masterpiece with stunning visuals",
    "Just wow!",
    "I Loved the story but the ending was too predictable",
    "Decent movie, but not my type",
    "Fantastic script and brilliant performances"
]

### Tokenize

In [3]:
# Lowercase every sentence, then split it on whitespace.
# Punctuation is not stripped, so tokens such as "film," and "wow!" keep it.
tokenized = list(map(lambda text: text.lower().split(), sentences))
tokenized

[['this', 'movie', 'was', 'amazing', 'and', 'inspiring'],
 ['i', 'hated', 'the', 'film,', 'it', 'was', 'boring'],
 ['the', 'plot', 'was', 'good', 'but', 'the', 'acting', 'was', 'terrible'],
 ['great', 'music', 'and', 'excellent', 'direction'],
 ['worst', 'experience', 'ever,', 'do', 'not', 'recommend'],
 ['a', 'masterpiece', 'with', 'stunning', 'visuals'],
 ['just', 'wow!'],
 ['i',
  'loved',
  'the',
  'story',
  'but',
  'the',
  'ending',
  'was',
  'too',
  'predictable'],
 ['decent', 'movie,', 'but', 'not', 'my', 'type'],
 ['fantastic', 'script', 'and', 'brilliant', 'performances']]

### Build Vocabulary

In [4]:
# Build the word -> id lookup table.
# Ids 0 and 1 are reserved for the padding and unknown-word tokens.
word2idx = {"<pad>": 0, "<unk>": 1}

# Each previously unseen word receives the next free id, which always equals
# the current size of the table because ids are assigned contiguously from 0.
for sentence in tokenized:
    for word in sentence:
        word2idx.setdefault(word, len(word2idx))

idx = len(word2idx)  # next unused id

print("Vocabulary size:", len(word2idx))
print("Sample vocabulary:", list(word2idx.items())[:10])


Vocabulary size: 49
Sample vocabulary: [('<pad>', 0), ('<unk>', 1), ('this', 2), ('movie', 3), ('was', 4), ('amazing', 5), ('and', 6), ('inspiring', 7), ('i', 8), ('hated', 9)]


### Map Words to IDs in Vocab Dict

In [5]:
# Replace every token with its vocabulary id; a token that is missing from
# the vocabulary falls back to the id of "<unk>".
numericalized = []
for sentence in tokenized:
    numericalized.append(
        [word2idx.get(word, word2idx["<unk>"]) for word in sentence]
    )

print("Tokenized sentence: ", tokenized[0])
print("Mapped sentence: ", numericalized[0])


Tokenized sentence:  ['this', 'movie', 'was', 'amazing', 'and', 'inspiring']
Mapped sentence:  [2, 3, 4, 5, 6, 7]


### Padding 

To make the length of all sentences equal

In [6]:
# Turn each list of ids into a 1-D integer tensor.
tensors = list(
    map(lambda ids: torch.tensor(ids, dtype=torch.long), numericalized)
)

# Right-pad the shorter sequences with the "<pad>" id and stack them into one
# batch-first tensor.
padded = pad_sequence(
    tensors,
    batch_first=True,
    padding_value=word2idx["<pad>"],
)

print("Padded shape:", padded.shape)  # (10 sentences, max_length)
padded


Padded shape: torch.Size([10, 10])


tensor([[ 2,  3,  4,  5,  6,  7,  0,  0,  0,  0],
        [ 8,  9, 10, 11, 12,  4, 13,  0,  0,  0],
        [10, 14,  4, 15, 16, 10, 17,  4, 18,  0],
        [19, 20,  6, 21, 22,  0,  0,  0,  0,  0],
        [23, 24, 25, 26, 27, 28,  0,  0,  0,  0],
        [29, 30, 31, 32, 33,  0,  0,  0,  0,  0],
        [34, 35,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 8, 36, 10, 37, 16, 10, 38,  4, 39, 40],
        [41, 42, 16, 27, 43, 44,  0,  0,  0,  0],
        [45, 46,  6, 47, 48,  0,  0,  0,  0,  0]])

### Embedding Layers

In [7]:
embedding_dim = 50

# Embedding weights are randomly initialised; seed the RNG so the values shown
# below are reproducible after Restart Kernel -> Run All.
torch.manual_seed(42)

emb = nn.Embedding(
    num_embeddings=len(word2idx),  # vocabulary size
    embedding_dim=embedding_dim,
    # Mark the "<pad>" row as padding: its vector is initialised to zeros and
    # receives no gradient, so padding positions carry no learned signal.
    padding_idx=word2idx["<pad>"],
)

# Look up one embedding vector for every token id in the padded batch.
embeddings = emb(padded)  # shape: (10, max_length, 50)

print("Embedding shape:", embeddings.shape)
embeddings[6]  # Shortest sentence: "Just wow!"


Embedding shape: torch.Size([10, 10, 50])


tensor([[-8.1857e-01,  2.9337e-02, -5.3407e-02, -1.1396e-01,  6.1634e-01,
         -5.7820e-01, -3.5250e-01,  1.3037e-01,  1.4291e+00,  2.2386e-01,
         -7.9323e-01, -3.4040e-01, -1.5354e+00, -7.3298e-02,  1.5117e-01,
          1.7054e+00,  1.4555e+00,  9.4428e-01, -9.1803e-01,  1.9198e+00,
          3.3070e-01,  6.7981e-01, -1.2322e+00,  6.9062e-01, -1.9629e+00,
         -7.7507e-01, -1.1899e+00,  1.1907e+00, -7.7802e-01, -9.4976e-01,
         -7.3032e-04,  4.6982e-01, -1.4232e-02,  8.2827e-01,  1.1129e+00,
          8.9355e-01,  3.4804e-01,  1.1375e-01,  5.0504e-01,  1.0646e+00,
         -9.8610e-01,  8.6534e-01, -9.8221e-01,  7.1091e-01, -2.4942e-01,
          6.1519e-01,  1.2143e+00, -1.4612e+00, -3.6125e-02,  2.1664e+00],
        [ 5.9329e-01, -1.6449e+00, -4.7879e-01, -1.3855e+00,  8.5603e-01,
         -5.7774e-01,  1.6986e+00,  5.9460e-01,  1.0321e+00,  1.1529e+00,
         -1.2122e-01, -6.3745e-01, -7.5871e-01, -1.7465e-01, -1.0648e-01,
          7.9560e-01, -1.9406e+00, -4