In [1]:
# Install specific compatible versions of gensim, numpy, and scipy
!pip install gensim==4.3.3 numpy==1.26.4 scipy==1.11.4



In [2]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300")  # download the model and return as object ready for use



# Example of a word as a vector

In [3]:
word_vectors = model

# Let us look how the vector embedding of a word looks like
print(word_vectors['computer'])  # Example: Accessing the vector for the word 'computer'

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [4]:
print(word_vectors['cat'].shape)

(300,)


# Similar words

# King + Woman - Man = ?

In [5]:
# Example of using most_similar
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087411999702454)]


# Let us check the similarity b/w a few pair of words

In [6]:
# Example of calculating similarity
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('king', 'queen'))
print(word_vectors.similarity('uncle', 'aunt'))
print(word_vectors.similarity('boy', 'girl'))
print(word_vectors.similarity('nephew', 'niece'))
print(word_vectors.similarity('paper', 'water'))

0.76640123
0.6510957
0.7643474
0.8543272
0.7594367
0.11408084


# Most similar words

In [7]:
print(word_vectors.most_similar("tower", topn=5))

[('towers', 0.8531750440597534), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.594687819480896), ('responded_Understood_Atlasjet', 0.5931612253189087)]


# Now let us see the vector similarity

In [8]:
import numpy as np
# Words to compare
word1 = 'man'
word2 = 'woman'

word3 = 'semiconductor'
word4 = 'earthworm'

word5 = 'nephew'
word6 = 'niece'

# Calculate the vector difference
vector_difference1 = model[word1] - model[word2]
vector_difference2 = model[word3] - model[word4]
vector_difference3 = model[word5] - model[word6]

# Calculate the magnitude of the vector difference
magnitude_of_difference1 = np.linalg.norm(vector_difference1)
magnitude_of_difference2 = np.linalg.norm(vector_difference2)
magnitude_of_difference3 = np.linalg.norm(vector_difference3)


# Print the magnitude of the difference
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word1, word2, magnitude_of_difference1))
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word3, word4, magnitude_of_difference2))
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word5, word6, magnitude_of_difference3))

The magnitude of the difference between 'man' and 'woman' is 1.73
The magnitude of the difference between 'semiconductor' and 'earthworm' is 5.67
The magnitude of the difference between 'nephew' and 'niece' is 1.96


## CREATING TOKEN EMBEDDINGS

Example: Token ID → Embedding Vector in PyTorch
We’ll simulate a simple case with:

A vocabulary of 10 tokens (IDs from 0 to 9)

Each token represented by a 4-dimensional vector

In [9]:
import torch
import torch.nn as nn


Step 2: Create an Embedding Layer



In [10]:
# Create an embedding layer for 10 tokens, each with 4 features
embedding = nn.Embedding(num_embeddings=10, embedding_dim=4)
#This creates a lookup table (a matrix of size 10 × 4) where each row is a vector for a token ID from 0 to 9.

Step 3: Define Token IDs (Input)

In [11]:
# Let's say we want embeddings for token IDs: 1, 2, and 4
token_ids = torch.tensor([1, 2, 4])


Step 4: Pass Token IDs Through the Embedding Layer

In [12]:
# Get the embedding vectors
embedded_vectors = embedding(token_ids)
# Now, embedded_vectors contains 3 vectors (one for each token), each of size 4.



 Step 5: Print the Result

In [13]:
print("Token IDs:", token_ids)
print("Embedding Vectors:\n", embedded_vectors)


Token IDs: tensor([1, 2, 4])
Embedding Vectors:
 tensor([[-1.0913, -0.0933,  0.2355,  1.1382],
        [ 0.1318,  0.7752,  0.7384, -0.3266],
        [ 1.0427,  0.2396, -1.6096, -2.1470]], grad_fn=<EmbeddingBackward0>)


In [14]:
# View All 10 Embeddings
print("All embeddings:\n", embedding.weight.data)


All embeddings:
 tensor([[-0.4529, -0.0054, -0.1694, -0.3142],
        [-1.0913, -0.0933,  0.2355,  1.1382],
        [ 0.1318,  0.7752,  0.7384, -0.3266],
        [ 2.6535,  0.1505, -0.2863, -0.2661],
        [ 1.0427,  0.2396, -1.6096, -2.1470],
        [ 0.6722,  0.1205,  1.2672, -1.1633],
        [-0.3426, -1.8629,  0.2311, -0.8454],
        [-0.9374, -0.0618,  1.6912, -1.8479],
        [-1.3406, -0.4582,  1.1096,  1.3724],
        [-0.3250, -1.1878,  1.5225, -0.7665]])


## Positional Encodding

In [35]:
# Custom Dataset with Sliding Window
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]

            if len(input_chunk) == max_length and len(target_chunk) == max_length:
                self.input_ids.append(torch.tensor(input_chunk))
                self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


In [36]:
# Create Dataloader Function
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=False, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    print(f"Dataset created with {len(dataset)} samples")
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last, num_workers=num_workers)


In [37]:
# Load Text & Create Dataloader
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

max_length = 4         # Sequence length
batch_size = 8         # Number of sequences per batch
stride = 4             # No overlap

dataloader = create_dataloader_v1(
    raw_text, batch_size=batch_size, max_length=max_length,
    stride=stride, shuffle=False, drop_last=False
)


Dataset created with 1286 samples


In [38]:
if len(dataloader.dataset) == 0:
    print(" Dataset is empty. Try smaller max_length or stride.")
else:
    data_iter = iter(dataloader)
    inputs, targets = next(data_iter)

    print(" Token IDs:\n", inputs)
    print(" Inputs shape:", inputs.shape)  # ➤ Shape: [8, 4]


 Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
 Inputs shape: torch.Size([8, 4])


In [39]:
# Token Embedding Layer
vocab_size = 50257       # GPT-2 vocab size
output_dim = 256         # Embedding dimension

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)

print("Token embeddings shape:", token_embeddings.shape)  # ➤ Shape: [8, 4, 256]


Token embeddings shape: torch.Size([8, 4, 256])


In [41]:
#  Positional Embedding Layer
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)

# Positions: [0, 1, 2, 3]
pos_ids = torch.arange(max_length)              # ➤ Shape: [4]
pos_embeddings = pos_embedding_layer(pos_ids)   # ➤ Shape: [4, 256]
print(" Positional embeddings shape:", pos_embeddings.shape)


 Positional embeddings shape: torch.Size([4, 256])


In [43]:
# Combine Token + Positional Embeddings
# Broadcast position embeddings across batch (8)
pos_embeddings = pos_embeddings.unsqueeze(0)          # ➤ Shape: [1, 4, 256]
input_embeddings = token_embeddings + pos_embeddings  # ➤ Shape: [8, 4, 256]

print(" Input embeddings shape (final):", input_embeddings.shape)


 Input embeddings shape (final): torch.Size([1, 8, 4, 256])


| Component                     | Shape         | Description                      |
| ----------------------------- | ------------- | -------------------------------- |
| `inputs`                      | `[8, 4]`      | Token IDs                        |
| `token_embeddings`            | `[8, 4, 256]` | Token ID → vector                |
| `pos_embeddings`              | `[4, 256]`    | One vector per position          |
| `pos_embeddings.unsqueeze(0)` | `[1, 4, 256]` | Broadcasted for batch            |
| `input_embeddings`            | `[8, 4, 256]` | Final input to Transformer block |
