# Data Processing Pipeline

In [1]:
# Opening the file
import re
filename = 'the-verdict.txt'
# Decode explicitly instead of relying on the platform's default encoding, so
# the character count below is the same on every machine.
# NOTE(review): assumes the corpus file is UTF-8 (or plain ASCII) - confirm.
with open(filename, 'r', encoding='utf-8') as f:
    data = f.read()
# len(data) is the number of characters read from the file, not the length of
# the file name; the message text is kept as-is to match the recorded output.
print("The total length of the filename is:", len(data))

The total length of the filename is: 20479


In [2]:
# Try the splitting pattern on a short sample sentence.
text = "Hello, world. Is this-- a test?"
# The capturing group makes re.split() keep the delimiters (punctuation,
# "--" and whitespace runs) in the returned list.
match = re.split(r'([],.:;?_!"-()\']|--|\s+)', text)
if match:
    print("The match is:", match)
# Collapse duplicates; a set has no defined order, so the printed order may
# differ between runs.
match = [*set(match)]
print(match)
# Drop the empty-string and single-space artefacts left behind by the split.
for artefact in ('', ' '):
    match.remove(artefact)
print(match)

The match is: ['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']
['', 'test', ' ', 'this', 'Hello', 'Is', ',', '--', 'world', '.', '?', 'a']
['test', 'this', 'Hello', 'Is', ',', '--', 'world', '.', '?', 'a']


In [3]:
# Split the whole corpus into tokens, keeping the delimiters as tokens too.
result = re.split(r'([],.:;?_!"-()\']|--|\s+)', data)
# Keep each distinct non-blank token once, in sorted order. Filtering here
# replaces the chained list.remove() calls, which raise ValueError when an
# artefact ('', ' ' or '\n\n') is absent and which leave any other
# whitespace-only piece (e.g. '\n') in the list.
result = sorted({piece for piece in result if piece.strip()})
# Reserve one extra entry for out-of-vocabulary tokens.
result.append("<|unk|>")
print(len(result))

1131


In [4]:
# Map every integer position in `result` to the token stored at that position.
vocab = dict(enumerate(result))


In [5]:
class SimpleTokeniser():
    """Word-level tokenizer whose vocabulary is built from a text file.

    The corpus is split on punctuation, ``--`` and whitespace runs. Every
    distinct non-blank piece receives an integer id in sorted order, followed
    by the special tokens ``<|endoftext|>`` and ``<|unk|>``.

    Attributes:
        vocab: dict mapping token string -> integer id.
    """

    # Capturing group so that split() keeps the delimiters as tokens.
    # NOTE(review): inside the character class, `"-(` is parsed as a range
    # (U+0022..U+0028), so `#`, `$`, `%` and `&` also act as delimiters while
    # a lone `-` does not. The pattern is kept byte-identical so that token
    # ids do not shift.
    _SPLIT_PATTERN = re.compile(r'([],.:;?_!"-()\']|--|\s+)')

    # Tokens that decode() attaches to the preceding text without a space.
    # The last entry is the apostrophe matched by `\'` in the pattern; the
    # previous code listed a backslash ('\\') there instead.
    _NO_SPACE_BEFORE = frozenset(
        [']', ',', '.', ':', ';', '?', '_', '!', '"', '-', '(', ')', "'"]
    )

    def __init__(self, filename='the-verdict.txt'):
        """Build the vocabulary from the text stored in ``filename``.

        Args:
            filename: path of the corpus file; defaults to the file this
                notebook uses.

        Raises:
            OSError: if the file cannot be opened.
        """
        # NOTE(review): assumes the corpus is UTF-8 (or plain ASCII) - confirm.
        with open(filename, 'r', encoding='utf-8') as f:
            data = f.read()
        pieces = self._SPLIT_PATTERN.split(data)
        # Drop empty / whitespace-only pieces in one pass. Unlike
        # list.remove(), this cannot raise when a particular artefact such as
        # '\n\n' does not occur in the corpus.
        tokens = sorted({piece for piece in pieces if piece.strip()})
        tokens.extend(["<|endoftext|>", "<|unk|>"])
        self.vocab = {token: idx for idx, token in enumerate(tokens)}

    def encode(self, data):
        """Convert text into a list of token ids.

        Pieces that are not in the vocabulary are printed and replaced by the
        id of ``<|unk|>``.

        Args:
            data: text to tokenise.

        Returns:
            list[int]: one id per non-blank piece, in order of appearance.

        Raises:
            KeyError: if an unknown piece is met and ``<|unk|>`` is missing
                from ``self.vocab``.
        """
        token = []
        for piece in self._SPLIT_PATTERN.split(data):
            piece = piece.strip()
            if not piece:
                continue
            token_id = self.vocab.get(piece)
            if token_id is None:
                # Show which piece is being replaced by the unknown token.
                print(piece)
                token_id = self.vocab["<|unk|>"]
            token.append(int(token_id))
        return token

    def decode(self, token):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, except punctuation tokens, which
        are attached directly to the preceding text. Ids that are not in the
        vocabulary are skipped.

        Args:
            token: iterable of integer token ids.

        Returns:
            str: the reconstructed text without leading/trailing whitespace.
        """
        id_to_token = {idx: tok for tok, idx in self.vocab.items()}
        parts = []
        for idx in token:
            tok = id_to_token.get(idx)
            if tok is None:
                continue
            parts.append(tok if tok in self._NO_SPACE_BEFORE else " " + tok)
        return "".join(parts).strip()
        
# Smoke test: encode a sentence, then decode a list of ids back into text.
tokenizer = SimpleTokeniser()
sample_sentence = "In the sunlit terraces of the palace."
result = tokenizer.encode(sample_sentence)
print(result)
sample_ids = [55, 988, 956, 984, 722, 988, 1131, 7]
decoder = tokenizer.decode(sample_ids)
print(decoder)


palace
[55, 988, 956, 984, 722, 988, 1131, 7]
In the sunlit terraces of the <|unk|>.


In [6]:
# Byte-pair encoding with tiktoken's "gpt2" encoding

# `import importlib` alone does not load the `metadata` submodule, so
# `importlib.metadata.version` only works if some other package happened to
# import it first. Importing the submodule explicitly also binds `importlib`.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the given text
enc_text = tokenizer.encode(data)
print(len(enc_text))

tiktoken version: 0.9.0
5145


In [34]:
# Creating data loaders
import torch
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id sequences.

    ``data`` is encoded once with ``tokenizer.encode``. Each sample is a
    window of ``context_length`` ids, with windows starting every ``stride``
    ids; its target is the same window shifted one position to the right.

    Args:
        tokenizer: object exposing ``encode(data)``; the returned sequence is
            measured with ``len()`` and sliced.
        data: raw input handed to ``tokenizer.encode``.
        stride: step between the start positions of consecutive windows.
        context_length: number of token ids in every window.
    """

    def __init__(self, tokenizer, data, stride, context_length):
        self.input = []
        self.output = []
        self.tokenizer = tokenizer
        enc_text = tokenizer.encode(data)
        # Stopping at len - context_length guarantees that the shifted target
        # window is always full length.
        for start in range(0, len(enc_text) - context_length, stride):
            stop = start + context_length
            self.input.append(enc_text[start:stop])
            self.output.append(enc_text[start + 1:stop + 1])

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input)

    def __getitem__(self, index):
        """Return the ``index``-th (input, target) pair as two tensors."""
        # Convert only the requested window. Building tensors from the full
        # lists on every access made each lookup proportional to the size of
        # the whole dataset.
        return torch.tensor(self.input[index]), torch.tensor(self.output[index])

In [35]:
# Create a dataloader object

def create_dataloader(data, context_length=256, stride=128,shuffle= False,batch_size=4,num_workers=0):
    """Wrap ``data`` in a SimpleDataset and return a DataLoader over it.

    Args:
        data: raw text; it is encoded inside SimpleDataset with tiktoken's
            "gpt2" encoding.
        context_length: number of token ids per window.
        stride: step between the start positions of consecutive windows.
        shuffle: forwarded to DataLoader.
        batch_size: forwarded to DataLoader.
        num_workers: forwarded to DataLoader.

    Returns:
        DataLoader whose items are the (input, target) pairs of SimpleDataset.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    # SimpleDataset encodes `data` itself; the extra encode call that used to
    # sit here tokenised the whole text a second time and discarded the result.
    dataset = SimpleDataset(tokenizer, data, stride, context_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)


In [None]:
# Build a dataloader that yields a single 4-token window per batch.
dataloader = create_dataloader(data, batch_size=1, context_length=4, stride=1, shuffle=False)

# Pull the first [inputs, targets] pair out of the loader and show it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [42]:
# Set up the token and positional embedding layers.

vocab_size, output_dim, context_length = 50257, 256, 4

# Lookup table with one `output_dim`-sized row per token id.
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

# Lookup table with one `output_dim`-sized row per position inside a window.
positional_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=output_dim)

In [43]:
# Draw the first batch (8 windows of 4 token ids each) from a fresh dataloader.
dataloader = create_dataloader(data, batch_size=8, context_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
# NOTE: the name `input` shadows the built-in of the same name from here on.
input, output = next(data_iter)
print("The shape of the input", input.shape)

# Look up one embedding vector per token id in the batch.
token_embeddings = token_embedding_layer(input)
print(token_embeddings.shape)

# Look up one embedding vector per position 0 .. context_length - 1.
positions = torch.arange(context_length)
positonal_embedding = positional_embedding_layer(positions)
print(positonal_embedding.shape)

# Input embeddings = token embeddings + positional embeddings; the positional
# tensor is broadcast across the batch dimension.
input_embeddings = token_embeddings + positonal_embedding
print(input_embeddings.shape)

The shape of the input torch.Size([8, 4])
torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])
