In [1]:
import pandas as pd
import os
import torch
import numpy as np
import tiktoken
import random
from tqdm import tqdm
from IPython import display

In [2]:
DATASET_PATH = './dataset'
local_dir = "./data/gutenberg"
val_ratio = 0.2
shard_size = int(1e8)

DATA_CACHE_DIR = local_dir
os.makedirs(DATA_CACHE_DIR, exist_ok=True)


In [3]:
buffers = {
    "val":  np.empty((shard_size,), dtype=np.uint16),
    "train": np.empty((shard_size,), dtype=np.uint16)
}
counts = {"val": 0, "train": 0}
shard_idx = {"val": 0, "train": 0}

In [4]:
enc = tiktoken.get_encoding('gpt2')
eot = enc._special_tokens['<|endoftext|>']

def tokenize(doc):
    tokens = [eot]
    tokens.extend(enc.encode_ordinary(doc['text']))
    tokens_np = np.array(tokens)
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16


In [5]:
df = pd.read_csv(os.path.join(DATASET_PATH,'cleaned_data.csv'))

In [6]:
df.shape

(3000, 1)

In [7]:
df = df.dropna()

In [8]:
df = df.iloc[:]

In [9]:
df.shape

(3000, 1)

In [10]:
counts = {"val": 0, "train": 0}
shard_idx = {"val": 0, "train": 0}

progress = {"val": None, "train": None}

for i, row in df.iterrows():
    tokens = tokenize(row)

    split = "val" if random.random() < val_ratio else "train"

    buf = buffers[split]
    count = counts[split]

    if count + len(tokens) < shard_size:
        buf[count:count + len(tokens)] = tokens
        counts[split] += len(tokens)

        if progress[split] is None:
            progress[split] = tqdm(total=shard_size, unit='tokens', desc=f"{split} shard {shard_idx[split]}")
        progress[split].update(len(tokens))

    else:
        filename = f"tinystories_{split}_{shard_idx[split]:06d}.npy"
        np.save(os.path.join(DATA_CACHE_DIR, filename), buf[:count])

        shard_idx[split] += 1

        progress[split] = tqdm(total=shard_size, unit='tokens',
                               desc=f"{split} shard {shard_idx[split]}")

        # Add remainder to new shard
        remainder = shard_size - count
        buf[:remainder] = tokens[:remainder]
        counts[split] = remainder
        progress[split].update(remainder)


train shard 0:  99%|█████████▉| 99422942/100000000 [00:38<00:00, 2876372.50tokens/s]
train shard 0: 100%|█████████▉| 99597312/100000000 [00:39<00:00, 2543333.86tokens/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


In [None]:
for split in ["val", "train"]:
    if counts[split] > 0:
        filename = f"tinystories_{split}_{shard_idx[split]:06d}.npy"
        np.save(os.path.join(DATA_CACHE_DIR, filename),
                buffers[split][:counts[split]])

train shard 2:  66%|██████▌   | 66249884/100000000 [00:42<00:10, 3071963.48tokens/s]

: 