In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the cells below encode and embed.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Define the vocabulary size: passed to one_hot as its vocabulary size and to
# the Embedding layer as its first argument in the cells below.
voc_size = 10000

In [4]:
# One-hot representation: turn each sentence into a list of integer word
# indices, one index per word.
# NOTE(review): per the Keras docs, one_hot wraps hashing_trick with Python's
# built-in hash, so distinct words can collide and the indices may differ
# between kernel sessions — confirm before relying on the numbers shown.
one_hot_rep = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
one_hot_rep

[[9917, 6216, 1029, 5327],
 [9917, 6216, 1029, 5769],
 [9917, 1165, 1029, 2026],
 [5848, 6567, 4873, 1763, 5701],
 [5848, 6567, 4873, 1763, 4466],
 [582, 9917, 1082, 1029, 7291],
 [8564, 8416, 6779, 1763]]

In [5]:
# word Embedding Representation
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import pad_sequences

In [6]:
# Bring every encoded sentence to the same length by left-padding
# (padding="pre") with zeros up to sent_length entries.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_rep,
    maxlen=sent_length,
    padding="pre",
)
print(embedded_docs)

[[   0    0    0    0 9917 6216 1029 5327]
 [   0    0    0    0 9917 6216 1029 5769]
 [   0    0    0    0 9917 1165 1029 2026]
 [   0    0    0 5848 6567 4873 1763 5701]
 [   0    0    0 5848 6567 4873 1763 4466]
 [   0    0    0  582 9917 1082 1029 7291]
 [   0    0    0    0 8564 8416 6779 1763]]


In [7]:
# Feature Representation
# A single Embedding layer maps each word index to a `dim`-sized vector.
from tensorflow.keras import Input

dim = 10
model = Sequential()
# Declare the input shape with an explicit Input instead of the deprecated
# `input_length` argument of Embedding. This also builds the model right away,
# so model.summary() can report output shapes and parameter counts.
model.add(Input(shape=(sent_length,)))
model.add(Embedding(voc_size, dim))
model.compile("adam","mse")



In [8]:
# Show the layer-by-layer summary of the Sequential model defined above.
model.summary()

In [9]:
# Run the padded index matrix through the model; since the model holds only an
# Embedding layer, the result is one `dim`-sized vector per token position.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435ms/step


array([[[-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.01965251,  0.03135104,  0.01402876,  0.01379016,
         -0.03591919, -0.01205577,  0.00141083, -0.02327669,
         -0.04379898, -0.04719068],
        [-0.02825204,  0.01457674,  0.00036437,  0.0441901 ,
          0.03212296, -0.03848406,  0.04848189, -0.0336314 ,
          0.02269825, -0.03881244],
        [ 0.0315528 , -0.03684436, -0.02680424,  0.0

In [10]:
# First padded sentence: leading zeros are padding, followed by its word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 9917, 6216, 1029, 5327], dtype=int32)

In [12]:
import numpy as np
# Predict for the first sentence only; a leading batch axis is added so the
# input has shape (1, sent_length) rather than (sent_length,).
model.predict(embedded_docs[0][np.newaxis, :])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


array([[[-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.03427273,  0.04586866,  0.00929971, -0.04792982,
          0.02398277,  0.02328685, -0.03871827,  0.02580975,
          0.04588839, -0.00742799],
        [-0.01965251,  0.03135104,  0.01402876,  0.01379016,
         -0.03591919, -0.01205577,  0.00141083, -0.02327669,
         -0.04379898, -0.04719068],
        [-0.02825204,  0.01457674,  0.00036437,  0.0441901 ,
          0.03212296, -0.03848406,  0.04848189, -0.0336314 ,
          0.02269825, -0.03881244],
        [ 0.0315528 , -0.03684436, -0.02680424,  0.0

# Using WORD2VEC 

In [None]:
import gensim.downloader as api
# NOTE(review): this rebinds `model`, the name the Keras cells above use for
# the Sequential model; re-running model.summary()/model.predict after this
# cell would hit the word2vec object instead. Consider a distinct name.
model = api.load("word2vec-google-news-300")

In [1]:
import torch
# Four example token ids, used below as row indices into the embedding table.
input_ids = torch.tensor([2,3,5,1])

In [3]:
# Small demonstration table: 6 token ids, 3 values per id.
vocab_size, output_dim = 6, 3

# Seed immediately before construction so the random initial weights repeat.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [4]:
# Look up input_ids in the table: each output row is the weight row at that id.
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# Using GPT-3 Vector dimensions

In [5]:
# Token-embedding table: one 256-value vector for each of 50257 token ids.
# Note that this rebinds vocab_size and output_dim from the small demo above.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [10]:
%pip install tiktoken

import tiktoken
from torch.utils.data import DataLoader, Dataset

Collecting tiktokenNote: you may need to restart the kernel to use updated packages.

  Using cached tiktoken-0.11.0-cp310-cp310-win_amd64.whl.metadata (6.9 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Using cached regex-2025.9.18-cp310-cp310-win_amd64.whl.metadata (41 kB)
Using cached tiktoken-0.11.0-cp310-cp310-win_amd64.whl (884 kB)
Using cached regex-2025.9.18-cp310-cp310-win_amd64.whl (276 kB)
Installing collected packages: regex, tiktoken

   ---------------------------------------- 0/2 [regex]
   ---------------------------------------- 2/2 [tiktoken]

Successfully installed regex-2025.9.18 tiktoken-0.11.0


In [7]:
def create_dataloaderV1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    """Build a DataLoader of (input, target) token windows over ``txt``.

    Args:
        txt: Raw text, tokenized with the tiktoken "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each input/target window.
        stride: Step, in tokens, between the starts of consecutive windows.
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        A DataLoader wrapping a GPTDatasetV1 built from ``txt``.
    """
    # Initializing the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Creating dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader. The keyword was previously misspelled `batch_sizev`,
    # which DataLoader rejects as an unexpected keyword argument.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    return dataloader

In [8]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once; windows of ``max_length`` tokens are then taken
    every ``stride`` tokens. Each target window is its input window shifted one
    token to the right.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        """Tokenize ``txt`` and pre-compute all input/target windows.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object with an ``encode(text, allowed_special=...)``
                method returning a list of token ids.
            max_length: Number of tokens in each window.
            stride: Step, in tokens, between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into overlapping sequences of
        # max_length. Stopping at len(token_ids) - max_length guarantees the
        # shifted target window is always full-length.
        for i in range(0,len(token_ids)-max_length,stride):
            input_chunk = token_ids[i:i + max_length]
            # Slice (not a tuple index): the inputs shifted right by one token.
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index],self.target_ids[index]