In [20]:
import torch
import tiktoken

In [21]:
# Load the "gpt2" encoding from tiktoken. Despite its name, `tokenize` is the
# encoder object (the notebook uses its .encode method and .n_vocab attribute),
# not a function.
tokenize = tiktoken.get_encoding("gpt2")
# Short sample sentence used throughout the notebook.
text = "Your journey starts with one step"
# Vocabulary size reported by the encoder object.
vocab_size = tokenize.n_vocab
# NOTE(review): a later cell passes max_size as the second argument of
# torch.nn.Embedding, i.e. it acts as the embedding width rather than a
# maximum length — the name is misleading; consider renaming.
max_size = 3

In [22]:
# Encode the sample sentence into token ids with the encoder object.
text_encode = tokenize.encode(text)
# The same token ids wrapped in a tensor.
input_id = torch.tensor(text_encode)
# Number of token ids produced for the sample sentence.
text_size = len(text_encode)

In [23]:
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensor pairs.

    The text is encoded once with ``tokenize.encode``. A window of
    ``max_length`` ids is cut at every ``stride``-th start position; the
    matching target is the same window shifted one position to the right.

    Args:
        txt: Text handed to ``tokenize.encode``.
        tokenize: Object exposing ``encode(txt)`` that returns a sliceable
            sequence of token ids.
        max_length: Number of ids in each input/target window.
        stride: Step between consecutive window start positions.
    """

    def __init__(self,txt,tokenize,max_length,stride):
        token_ids = tokenize.encode(txt)
        print("token length : ",len(token_ids))

        # Start positions stop max_length short of the end so that every
        # target window (one position further right) is still full-length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``index``."""
        sample = self.input_ids[index]
        label = self.target_ids[index]
        return sample, label

In [52]:
# NOTE(review): these two assignments repeat, with identical expressions, the
# values already set in the earlier setup cell; the duplication looks
# unintentional — confirm and keep a single definition.
vocab_size = tokenize.n_vocab
# Used below as the second argument of torch.nn.Embedding (the embedding
# width), so "max_size" is a misleading name.
max_size = 3
def dataloaderV1(txt,max_length,stride,batch_size,shuffle=False,drop_last=False,num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches for ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the loader then batches the resulting windows.

    Args:
        txt: Raw text to tokenize.
        max_length: Number of token ids per input/target window.
        stride: Step between consecutive window start positions.
        batch_size: Number of windows per batch.
        shuffle: Whether the loader reshuffles the windows every epoch.
            Defaults to False, which keeps the original sequential order.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
            Defaults to False, which keeps every window.
        num_workers: Number of worker processes used for loading. Defaults
            to 0 (load in the calling process).

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(input_ids, target_ids)``
        batches.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    # The keyword defaults match DataLoader's own, so existing four-argument
    # calls behave exactly as before.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Build a loader over the sample text with max_length=3, stride=1, batch_size=1.
dataload = dataloaderV1(text,3,1,1)
# NOTE(review): `iterator` is not used anywhere in the visible cells —
# confirm no other cell needs it before removing.
iterator = iter(dataload)

# Embedding table with vocab_size rows and max_size columns. No random seed is
# set in the visible cells, so the printed values will differ between fresh runs.
embedding = torch.nn.Embedding(vocab_size,max_size)

# Walk the loader once, printing each (input, target) id batch and the
# embedding vectors looked up for the input ids.
for batch in dataload:
    input_ids_batch, target_ids_batch = batch
    print("input_ids_batch, target_ids_batch" , input_ids_batch, " ", target_ids_batch)
    embedding_batch = embedding(input_ids_batch)
    print(embedding_batch)

token length :  6
input_ids_batch, target_ids_batch tensor([[7120, 7002, 4940]])   tensor([[7002, 4940,  351]])
tensor([[[ 1.2058,  1.0512,  0.3289],
         [ 0.1538, -1.6546, -0.3034],
         [-0.5631,  3.1535,  0.6226]]], grad_fn=<EmbeddingBackward0>)
input_ids_batch, target_ids_batch tensor([[7002, 4940,  351]])   tensor([[4940,  351,  530]])
tensor([[[ 0.1538, -1.6546, -0.3034],
         [-0.5631,  3.1535,  0.6226],
         [-0.4531,  1.1252, -0.1373]]], grad_fn=<EmbeddingBackward0>)
input_ids_batch, target_ids_batch tensor([[4940,  351,  530]])   tensor([[ 351,  530, 2239]])
tensor([[[-0.5631,  3.1535,  0.6226],
         [-0.4531,  1.1252, -0.1373],
         [ 0.3593,  0.1302, -0.5676]]], grad_fn=<EmbeddingBackward0>)


In [55]:
# Instead of embedding one batch at a time, embed every token id of the text
# in a single call; each row matches the vector printed for that token above.
token_id = torch.tensor(text_encode)
input_embedding = embedding(token_id)
print("input_embedding" , input_embedding)

input_embedding tensor([[ 1.2058,  1.0512,  0.3289],
        [ 0.1538, -1.6546, -0.3034],
        [-0.5631,  3.1535,  0.6226],
        [-0.4531,  1.1252, -0.1373],
        [ 0.3593,  0.1302, -0.5676],
        [ 0.9942,  1.2144, -0.7297]], grad_fn=<EmbeddingBackward0>)
