In [1]:
import pandas as pd
import os
import torch
import numpy as np
import tiktoken
from tqdm import tqdm
from IPython import display

In [2]:
DATASET_PATH = './dataset'
local_dir = "./data/tinystories"
shard_size = int(1e8)   

DATA_CACHE_DIR = local_dir
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

In [3]:
enc = tiktoken.get_encoding('gpt2')
eot = enc._special_tokens['<|endoftext|>']

def tokenize(doc):
    tokens = [eot]
    tokens.extend(enc.encode_ordinary(doc['text']))
    tokens_np = np.array(tokens)
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16


In [4]:
df = pd.read_csv(os.path.join(DATASET_PATH,'cleaned_data.csv'))

In [5]:
df.shape

(2141479, 1)

In [6]:
df = df.dropna()

In [7]:
df = df.iloc[:]

In [8]:
df.shape

(2141479, 1)

In [9]:
shard_idx = 0
# preallocate buffer to hold current shard
all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
token_count = 0
val_shard_size = int(shard_size*1)
train_shard_size = shard_size
progress_bar = None
for i,row in df.iterrows():
    tokens = tokenize(row)
    # check if there is enough space in current shard for new tokens
    shard_size = val_shard_size if shard_idx==0 else train_shard_size
    if token_count + len(tokens) < shard_size:
        # simply append tokens to current shard
        all_tokens_np[token_count : token_count + len(tokens)] = tokens
        token_count += len(tokens)
        if progress_bar is None:
            progress_bar = tqdm(total=shard_size, unit='tokens', desc=f'shard {shard_idx}')
        progress_bar.update(len(tokens))
    else:
        split = 'val' if shard_idx == 0 else 'train'
        filepath = os.path.join(DATA_CACHE_DIR, f'tinystories_{split}_{shard_idx:06d}')
        remainder = shard_size - token_count
        progress_bar.update(remainder)
        all_tokens_np[token_count : token_count + remainder] = tokens[:remainder]
        np.save(filepath, all_tokens_np)
        shard_idx += 1
        progress_bar = None
        all_tokens_np[0:len(tokens) - remainder] = tokens[remainder:]
        token_count = len(tokens) - remainder

if token_count != 0:
    split = 'val' if shard_idx == 0 else 'train'
    filepath = os.path.join(DATA_CACHE_DIR, f"tinystories_{split}_{shard_idx:06d}")
    np.save(filepath, all_tokens_np[:token_count])

shard 0: 100%|██████████| 100000000/100000000 [00:39<00:00, 2559595.16tokens/s]
shard 1: 100%|█████████▉| 99999860/100000000 [00:39<00:00, 2510812.37tokens/s]
shard 2: 100%|█████████▉| 99999852/100000000 [00:40<00:00, 2464631.52tokens/s]
shard 3: 100%|█████████▉| 99999855/100000000 [00:39<00:00, 2514726.80tokens/s]
shard 4:  52%|█████▏    | 52021805/100000000 [00:20<00:18, 2623299.12tokens/s]

shard 4:  52%|█████▏    | 52041481/100000000 [00:30<00:18, 2623299.12tokens/s]