In [None]:
!pip install datasets

1. Importing the Datasets

In [None]:
pip install -U datasets

In [None]:
from datasets import load_dataset

# Load the "roneneldan/TinyStories" dataset through `datasets.load_dataset`.
# `ds` is consumed further down by `ds.map(process, ...)`, which reads the
# 'text' field of every example.
ds = load_dataset("roneneldan/TinyStories")

2. Tokenization of the Dataset


In [None]:
!pip install tiktoken
import tiktoken
import os
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path
import pandas as pd
enc = tiktoken.get_encoding("gpt2")
def process(example):
  """Tokenize a single dataset example.

  Encodes ``example['text']`` with the module-level ``enc`` tokenizer
  (``encode_ordinary``) and returns a dict with:
    - 'ids': the encoded token ids
    - 'len': how many token ids were produced
  """
  token_ids = enc.encode_ordinary(example['text'])
  return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize every split of `ds` and write the token ids to `<split>.bin`.
#
# Everything lives inside the `train.bin` guard: `tokenized` only exists when
# the guard is taken, so the writing loop must be skipped together with the
# tokenization. Otherwise re-running this cell with train.bin already on disk
# fails with NameError on `tokenized`.
if not os.path.exists("train.bin"):
    tokenized = ds.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
        num_proc=8,
    )

    # One flat binary file of token ids per dataset split.
    for split, dset in tokenized.items():
        # Total number of tokens in this split (sum of the per-example lengths).
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        filename = f'{split}.bin'
        # uint16 is enough since enc.max_token_value == 50256 is < 2**16.
        dtype = np.uint16
        # mode='w+' creates the file, or overwrites it if it already exists.
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
        # Number of shards the split is cut into for writing.
        total_batches = 1024

        idx = 0  # write cursor: position in `arr` where the next shard starts
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batch together samples for faster write.
            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            # Flatten the shard's per-example id arrays into one 1-D array.
            arr_batch = np.concatenate(batch['ids'])
            # Copy the shard's token ids into the memmap at the cursor.
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        # Push any pending writes out to the file on disk.
        arr.flush()

In [None]:
# Some functions from https://github.com/karpathy/nanoGPT/blob/master/train.py with slight modifications
# block_size is the context window: the number of tokens in each sampled sequence.
def get_batch(split):
    """Sample one random batch of (input, target) token windows.

    Reads 'train.bin' when ``split == 'train'`` and 'validation.bin' for any
    other value. Draws ``batch_size`` random start offsets and, for each,
    takes ``block_size`` consecutive tokens as the input row and the same
    window shifted one token to the right as the target row.

    Relies on ``block_size``, ``batch_size``, ``device`` and ``device_type``
    being defined at module level.

    Returns:
        (x, y): two stacked int64 tensors, moved to ``device``.
    """
    # The memmap is recreated on every call to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    bin_path = 'train.bin' if split == 'train' else 'validation.bin'
    tokens = np.memmap(bin_path, dtype=np.uint16, mode='r')

    starts = torch.randint(len(tokens) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(torch.from_numpy(tokens[start:start + block_size].astype(np.int64)))
        targets.append(torch.from_numpy(tokens[start + 1:start + 1 + block_size].astype(np.int64)))
    x = torch.stack(inputs)
    y = torch.stack(targets)

    if device_type != 'cuda':
        return x.to(device), y.to(device)

    # Pin x and y so they can be moved to the GPU asynchronously (non_blocking=True).
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y
