In [7]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

In [None]:
# --- Example sentences (10) ---
sentences = [
    "This movie was amazing and inspiring",
    "I hated the film, it was boring",
    "The acting was brilliant",
    "Terrible direction and poor editing",
    "Loved every minute of it",
    "The plot was weak and predictable",
    "Fantastic music and visuals",
    "Not worth watching",
    "Absolutely wonderful experience",
    "Disappointing performance overall"
]

In [11]:
# -------------------------
# 1) Tokenize (very simple tokenizer)
# -------------------------
tokenized = [s.lower().split() for s in sentences]
# Example: tokenized[0] == ["this","movie","was","amazing","and","inspiring"]
tokenized

[['this', 'movie', 'was', 'amazing', 'and', 'inspiring'],
 ['i', 'hated', 'the', 'film,', 'it', 'was', 'boring'],
 ['the', 'acting', 'was', 'brilliant'],
 ['terrible', 'direction', 'and', 'poor', 'editing'],
 ['loved', 'every', 'minute', 'of', 'it'],
 ['the', 'plot', 'was', 'weak', 'and', 'predictable'],
 ['fantastic', 'music', 'and', 'visuals'],
 ['not', 'worth', 'watching'],
 ['absolutely', 'wonderful', 'experience'],
 ['disappointing', 'performance', 'overall']]

In [13]:
# -------------------------
# 2) Build a vocabulary (simple)
# -------------------------
# special tokens
PAD = "<pad>"
UNK = "<unk>"

# collect tokens and build mapping
counter = Counter(tok for sent in tokenized for tok in sent)

In [15]:
# keep all tokens for this tiny example; in real tasks you may limit vocabulary size
vocab_tokens = [PAD, UNK] + sorted(counter.keys())
print(f"Vocab size: {len(vocab_tokens)} (including PAD and UNK)")
vocab_tokens

Vocab size: 39 (including PAD and UNK)


['<pad>',
 '<unk>',
 'absolutely',
 'acting',
 'amazing',
 'and',
 'boring',
 'brilliant',
 'direction',
 'disappointing',
 'editing',
 'every',
 'experience',
 'fantastic',
 'film,',
 'hated',
 'i',
 'inspiring',
 'it',
 'loved',
 'minute',
 'movie',
 'music',
 'not',
 'of',
 'overall',
 'performance',
 'plot',
 'poor',
 'predictable',
 'terrible',
 'the',
 'this',
 'visuals',
 'was',
 'watching',
 'weak',
 'wonderful',
 'worth']

In [16]:
# mappings
word2idx = {w: i for i, w in enumerate(vocab_tokens)}
idx2word = {i: w for w, i in word2idx.items()}

pad_idx = word2idx[PAD]
unk_idx = word2idx[UNK]

word2idx, idx2word, pad_idx, unk_idx

({'<pad>': 0,
  '<unk>': 1,
  'absolutely': 2,
  'acting': 3,
  'amazing': 4,
  'and': 5,
  'boring': 6,
  'brilliant': 7,
  'direction': 8,
  'disappointing': 9,
  'editing': 10,
  'every': 11,
  'experience': 12,
  'fantastic': 13,
  'film,': 14,
  'hated': 15,
  'i': 16,
  'inspiring': 17,
  'it': 18,
  'loved': 19,
  'minute': 20,
  'movie': 21,
  'music': 22,
  'not': 23,
  'of': 24,
  'overall': 25,
  'performance': 26,
  'plot': 27,
  'poor': 28,
  'predictable': 29,
  'terrible': 30,
  'the': 31,
  'this': 32,
  'visuals': 33,
  'was': 34,
  'watching': 35,
  'weak': 36,
  'wonderful': 37,
  'worth': 38},
 {0: '<pad>',
  1: '<unk>',
  2: 'absolutely',
  3: 'acting',
  4: 'amazing',
  5: 'and',
  6: 'boring',
  7: 'brilliant',
  8: 'direction',
  9: 'disappointing',
  10: 'editing',
  11: 'every',
  12: 'experience',
  13: 'fantastic',
  14: 'film,',
  15: 'hated',
  16: 'i',
  17: 'inspiring',
  18: 'it',
  19: 'loved',
  20: 'minute',
  21: 'movie',
  22: 'music',
  23: 'not',

In [19]:
# -------------------------
# 3) Numericalize (tokens -> indices) and pad into a batch
# -------------------------
encoded = []
for sent in tokenized:
    ids = [word2idx.get(tok, unk_idx) for tok in sent]
    encoded.append(torch.tensor(ids, dtype=torch.long))

# pad_sequence will create a (batch_size, max_len) tensor when batch_first=True
padded = pad_sequence(encoded, batch_first=True, padding_value=pad_idx)
# padded.shape -> (10, max_len)
print("Padded indices shape:", padded.shape)
print("Example padded row (first sentence):", padded[0].tolist())
padded

Padded indices shape: torch.Size([10, 7])
Example padded row (first sentence): [32, 21, 34, 4, 5, 17, 0]


tensor([[32, 21, 34,  4,  5, 17,  0],
        [16, 15, 31, 14, 18, 34,  6],
        [31,  3, 34,  7,  0,  0,  0],
        [30,  8,  5, 28, 10,  0,  0],
        [19, 11, 20, 24, 18,  0,  0],
        [31, 27, 34, 36,  5, 29,  0],
        [13, 22,  5, 33,  0,  0,  0],
        [23, 38, 35,  0,  0,  0,  0],
        [ 2, 37, 12,  0,  0,  0,  0],
        [ 9, 26, 25,  0,  0,  0,  0]])

In [21]:
# -------------------------
# 4) Embedding layer (word embeddings)
# -------------------------
embedding_dim = 50
emb = nn.Embedding(num_embeddings=len(vocab_tokens), embedding_dim=embedding_dim, padding_idx=pad_idx)

# get word embeddings: shape -> (batch_size, seq_len, embedding_dim)
word_embeds = emb(padded)
print("Word embeddings shape:", word_embeds.shape)  # e.g. (10, max_len, 50)


Word embeddings shape: torch.Size([10, 7, 50])


In [22]:
# -------------------------
# 5) Convert to sentence embeddings (mean pooling ignoring padding)
# -------------------------
# create mask for non-padding tokens: shape (batch_size, seq_len, 1)
mask = (padded != pad_idx).unsqueeze(-1).float()  # 1 where real token, 0 where PAD

# zero out padding embeddings and sum
sum_embeds = (word_embeds * mask).sum(dim=1)  # shape (batch_size, embedding_dim)
# count real tokens per sentence
lengths = mask.sum(dim=1)  # shape (batch_size, 1)
lengths = lengths.clamp(min=1)  # avoid division by zero
sent_embeds = sum_embeds / lengths  # mean over non-pad tokens
print("Sentence embeddings shape:", sent_embeds.shape)  # (10, embedding_dim)


Sentence embeddings shape: torch.Size([10, 50])


In [23]:
# -------------------------
# 6) Inspect results
# -------------------------
print("\n--- Example outputs ---")
print("Vocabulary sample:", list(word2idx.items())[:8])
print("First sentence tokens:", tokenized[0])
print("First sentence indices:", padded[0].tolist())
print("First sentence embedding vector (size):", sent_embeds[0].shape)
print("First 6 values of first sentence embedding:", sent_embeds[0][:6].tolist())


--- Example outputs ---
Vocabulary sample: [('<pad>', 0), ('<unk>', 1), ('absolutely', 2), ('acting', 3), ('amazing', 4), ('and', 5), ('boring', 6), ('brilliant', 7)]
First sentence tokens: ['this', 'movie', 'was', 'amazing', 'and', 'inspiring']
First sentence indices: [32, 21, 34, 4, 5, 17, 0]
First sentence embedding vector (size): torch.Size([50])
First 6 values of first sentence embedding: [0.033360760658979416, 0.3194434344768524, -1.1688367128372192, -1.0341030359268188, 0.09556436538696289, -0.027250101789832115]
