In [1]:
from datasets import load_dataset, Dataset
import sys
from datetime import datetime as dt
from typing import Any, List, Type
from transformers import OpenAIGPTTokenizerFast, OpenAIGPTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the GPT-1 configuration with its default hyperparameters and display
# it. `config.model_type` is reused further down to pick the tokenizer.
config = OpenAIGPTConfig()
config

OpenAIGPTConfig {
  "afn": "gelu",
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.48.2",
  "vocab_size": 40478
}

In [3]:
def get_size_in_gb(object: Any) -> float:
    """Return the shallow size of ``object`` expressed in GiB (1024**3 bytes).

    The byte count comes straight from ``sys.getsizeof``, which only accounts
    for the memory attributed directly to the object itself and not for the
    objects it refers to. For a container such as a list or a set the result
    therefore covers the container structure only, not its elements.

    Args:
        object: Any Python object.

    Returns:
        The shallow size in GiB as a float.
    """
    bytes_per_gb = 1024 ** 3
    shallow_bytes = sys.getsizeof(object)
    return shallow_bytes / bytes_per_gb

In [4]:
def tokenize_to_fixed_length(
    samples: List[str],
    tokenizer: "OpenAIGPTTokenizerFast",
    batch_size: int = 100,
    length: int = 512,
    display_step: int = 100,
    **tokenizer_kwargs,
) -> List[List[int]]:
    """Tokenize ``samples`` and re-cut the token stream into fixed-size chunks.

    The samples are tokenized in batches. The ``input_ids`` of all samples are
    concatenated in order into one continuous stream, which is then split into
    consecutive chunks of exactly ``length`` tokens. Chunk boundaries ignore
    sample boundaries, so one chunk may contain tokens from several samples.
    Tokens left over after the last batch that do not fill a whole chunk are
    discarded.

    Args:
        samples: Texts to tokenize, in the order they should be concatenated.
        tokenizer: Callable invoked as ``tokenizer(batch, **tokenizer_kwargs)``
            that returns a mapping with an ``'input_ids'`` entry holding one
            list of token ids per text in the batch. The annotation is quoted
            so defining this function does not require the class itself.
        batch_size: Number of samples passed to the tokenizer per call.
        length: Number of tokens in every returned chunk.
        display_step: A progress line is printed every ``display_step`` batches.
        **tokenizer_kwargs: Forwarded unchanged to every tokenizer call.

    Returns:
        A list of chunks, each a list of exactly ``length`` token ids.

    Raises:
        ValueError: If ``batch_size``, ``length`` or ``display_step`` is < 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if length < 1:
        # A non-positive length would make the chunking loop spin forever.
        raise ValueError(f"length must be >= 1, got {length}")
    if display_step < 1:
        raise ValueError(f"display_step must be >= 1, got {display_step}")

    chunks: List[List[int]] = []
    buffer: List[int] = []  # tokens not yet emitted as part of a chunk

    # Ceiling division: the exact number of batches the loop below will run.
    n_steps = -(-len(samples) // batch_size)

    for step, start in enumerate(range(0, len(samples), batch_size)):
        if step % display_step == 0:
            print(f'Step {step+1}/{n_steps} @ {dt.now().strftime("%Y-%m-%d %H:%M:%S")}')

        batch = samples[start:start + batch_size]
        tokenized_batch = tokenizer(batch, **tokenizer_kwargs)['input_ids']

        for tokenized_sample in tokenized_batch:
            buffer.extend(tokenized_sample)

        # Emit every complete chunk currently in the buffer, then drop the
        # consumed prefix once instead of re-slicing the buffer per chunk.
        consumed = 0
        while len(buffer) - consumed >= length:
            chunks.append(buffer[consumed:consumed + length])
            consumed += length
        if consumed:
            del buffer[:consumed]

    return chunks

In [5]:
# Load the dataset previously saved under "bookcorpusopen-train" (presumably
# the training split -- confirm) and show its row count, type and size.
# NOTE(review): get_size_in_gb is based on sys.getsizeof, so the size printed
# here is that of the Dataset object itself, not of the data it gives access to.
bookcorpusopen = Dataset.load_from_disk("bookcorpusopen-train")
len(bookcorpusopen), type(bookcorpusopen), get_size_in_gb(bookcorpusopen)

(74004228, datasets.arrow_dataset.Dataset, 5.21540641784668e-08)

In [6]:
# List the columns available in the dataset.
bookcorpusopen.column_names

['text']

In [7]:
# Pull every row of the 'text' column out of the dataset into one Python list
# of strings, which is what the tokenization helper slices into batches.
bookcorpusopen_list = bookcorpusopen[:]['text']

In [8]:
# Number of sentences and the shallow size of the list object (the string
# contents are not included, see get_size_in_gb).
len(bookcorpusopen_list), get_size_in_gb(bookcorpusopen_list)

(74004228, 0.6057334467768669)

In [10]:
# Peek at the last 20 sentences of the corpus.
bookcorpusopen_list[-20:]

['without another word , she brushed past her mother and went into the kitchen .',
 'wearing his best suit and tie , her father stood at the bar , putting on his cufflinks .',
 'at his expectant look , she held up her hand .',
 '`` i know we leave in half an hour .',
 "i 'll be ready .",
 "i swear . ''",
 'she then threw open the basement door .',
 'pounding down the steps , she tried calming down .',
 "she did n't know what it was about her parents ' seemingly good intentions that grated on her last nerve .",
 "of course , they had n't bothered her as much when she had lived alone .",
 'now that she was back under their roof , they seemed to forget she was twenty-five , a mother , and not their little girl to boss around anymore .',
 "with clinicals looming to enable her to finish her nursing degree , she 'd known she would n't be able to work fulltime .",
 'although she loved the freedom and independence of having her own apartment , there was no way she could afford it and daycare f

In [9]:
# Deduplicate the sentences to see how many distinct ones the corpus holds;
# compare the first number below with the total sentence count.
# The reported size is again shallow (set structure only).
bookcorpusopen_set = set(bookcorpusopen_list)
len(bookcorpusopen_set), get_size_in_gb(bookcorpusopen_set)

(38832894, 1.0000002011656761)

In [10]:
# Load the pretrained tokenizer identified by config.model_type and register
# the special tokens. The value displayed by this cell is the return value of
# add_special_tokens.
# NOTE(review): tokens added here presumably grow the vocabulary beyond
# config.vocab_size, so a model built from `config` would need its embedding
# matrix resized before these ids can be used -- confirm.
gpt1_tokenizer = OpenAIGPTTokenizerFast.from_pretrained(config.model_type)
gpt1_tokenizer.add_special_tokens({
    'unk_token':'<unk>',
    'bos_token':'<s>', # only needed for fine-tuning tasks
    'eos_token':'<e>', # only needed for fine-tuning tasks
    'sep_token':'<$>', # only needed for fine-tuning tasks
    'pad_token':'<p>' # only needed for fine-tuning tasks
    }
)

4

In [11]:
# Sanity check: tokenize one sentence, requesting only the input ids
# (attention mask and token type ids are switched off).
gpt1_tokenizer('this is an example sentence',return_attention_mask=False,return_token_type_ids=False)

{'input_ids': [616, 544, 531, 6989, 5958]}

In [12]:
# Trial run on the first 1,000,000 sentences with the default batch_size (100)
# and chunk length (512); a progress line is printed every 2000 batches.
# Only 'input_ids' are consumed by the helper, so the attention mask and token
# type ids are not requested from the tokenizer.
tokenized_list_1m = tokenize_to_fixed_length(bookcorpusopen_list[:1000000], tokenizer=gpt1_tokenizer, display_step=2000, return_attention_mask=False,return_token_type_ids=False)

Step 1/10001 @ 2025-02-03 13:18:57
Step 2001/10001 @ 2025-02-03 13:19:01
Step 4001/10001 @ 2025-02-03 13:19:03
Step 6001/10001 @ 2025-02-03 13:19:08
Step 8001/10001 @ 2025-02-03 13:19:10


In [13]:
# Number of fixed-length chunks produced by the trial run and the shallow size
# of the outer list.
len(tokenized_list_1m), get_size_in_gb(tokenized_list_1m)

(30930, 0.00025828927755355835)

Tokenizing the entire dataset with a batch size of 50 takes about 48 minutes.

In [14]:
# Tokenize the full corpus in batches of 50 sentences, printing progress every
# 2000 batches. This is the long-running cell of the notebook.
# NOTE(review): the tokenizer warns when a single sentence exceeds the model's
# maximum sequence length; the token stream is re-cut into fixed-length chunks
# by the helper afterwards, so the warning is presumably harmless here.
tokenized_list = tokenize_to_fixed_length(bookcorpusopen_list, tokenizer=gpt1_tokenizer, batch_size=50, display_step=2000, return_attention_mask=False,return_token_type_ids=False)

Step 1/1480085 @ 2025-02-03 13:19:13
Step 2001/1480085 @ 2025-02-03 13:19:14
Step 4001/1480085 @ 2025-02-03 13:19:15
Step 6001/1480085 @ 2025-02-03 13:19:16
Step 8001/1480085 @ 2025-02-03 13:19:17
Step 10001/1480085 @ 2025-02-03 13:19:19
Step 12001/1480085 @ 2025-02-03 13:19:20
Step 14001/1480085 @ 2025-02-03 13:19:21
Step 16001/1480085 @ 2025-02-03 13:19:22
Step 18001/1480085 @ 2025-02-03 13:19:23
Step 20001/1480085 @ 2025-02-03 13:19:24
Step 22001/1480085 @ 2025-02-03 13:19:26
Step 24001/1480085 @ 2025-02-03 13:19:27
Step 26001/1480085 @ 2025-02-03 13:19:28
Step 28001/1480085 @ 2025-02-03 13:19:29
Step 30001/1480085 @ 2025-02-03 13:19:30
Step 32001/1480085 @ 2025-02-03 13:19:31
Step 34001/1480085 @ 2025-02-03 13:19:32
Step 36001/1480085 @ 2025-02-03 13:19:33
Step 38001/1480085 @ 2025-02-03 13:19:35
Step 40001/1480085 @ 2025-02-03 13:19:36
Step 42001/1480085 @ 2025-02-03 13:19:37
Step 44001/1480085 @ 2025-02-03 13:19:38
Step 46001/1480085 @ 2025-02-03 13:19:39
Step 48001/1480085 @ 202

Token indices sequence length is longer than the specified maximum sequence length for this model (1119 > 512). Running this sequence through the model will result in indexing errors


Step 172001/1480085 @ 2025-02-03 13:22:29
Step 174001/1480085 @ 2025-02-03 13:22:30
Step 176001/1480085 @ 2025-02-03 13:22:32
Step 178001/1480085 @ 2025-02-03 13:22:33
Step 180001/1480085 @ 2025-02-03 13:22:34
Step 182001/1480085 @ 2025-02-03 13:22:35
Step 184001/1480085 @ 2025-02-03 13:22:36
Step 186001/1480085 @ 2025-02-03 13:22:37
Step 188001/1480085 @ 2025-02-03 13:22:38
Step 190001/1480085 @ 2025-02-03 13:22:39
Step 192001/1480085 @ 2025-02-03 13:22:40
Step 194001/1480085 @ 2025-02-03 13:22:41
Step 196001/1480085 @ 2025-02-03 13:22:42
Step 198001/1480085 @ 2025-02-03 13:22:44
Step 200001/1480085 @ 2025-02-03 13:22:45
Step 202001/1480085 @ 2025-02-03 13:22:46
Step 204001/1480085 @ 2025-02-03 13:22:47
Step 206001/1480085 @ 2025-02-03 13:22:49
Step 208001/1480085 @ 2025-02-03 13:22:50
Step 210001/1480085 @ 2025-02-03 13:22:51
Step 212001/1480085 @ 2025-02-03 13:22:52
Step 214001/1480085 @ 2025-02-03 13:22:53
Step 216001/1480085 @ 2025-02-03 13:22:55
Step 218001/1480085 @ 2025-02-03 1

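The dataset should contain 2'098'676 samples. Each sample is a sequence of 512 tokens, generated by concatenating the tokens of contiguous sentences and cutting the resulting stream every 512 tokens (leftover tokens at the very end are dropped). With 74'004'228 original sentences, this implies an average of only about 14.5 tokens per sentence (2'098'676 × 512 / 74'004'228).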

This means there are some instances where tokens within a sample originate from different books, but that is fine given the language-modeling nature of the GPT-1 model's pretraining phase - we use the entire sequence as both input AND target.

In [15]:
# Number of fixed-length chunks for the whole corpus and the shallow size of
# the outer list.
len(tokenized_list), get_size_in_gb(tokenized_list)

(2098676, 0.01595195382833481)

In [16]:
# First 10 token ids of the last chunk.
tokenized_list[-1][:10]

[481, 2264, 4782, 239, 7591, 240, 513, 8108, 1674, 2052]

In [17]:
import pickle

# Persist the tokenized chunks so they can be reloaded without repeating the
# tokenization. The file is written in binary mode and closed by the `with`
# block. NOTE: pickle files should only ever be loaded from trusted sources.
with open("tokenized_bookcorpusopen_list.pkl","wb") as token_file:
    pickle.dump(tokenized_list,token_file)

In [18]:
# Save the tokenizer files to the local "openai-gpt-local" directory; the cell
# displays the paths of the files that were written.
gpt1_tokenizer.save_pretrained("openai-gpt-local")

('openai-gpt-local\\tokenizer_config.json',
 'openai-gpt-local\\special_tokens_map.json',
 'openai-gpt-local\\vocab.json',
 'openai-gpt-local\\merges.txt',
 'openai-gpt-local\\added_tokens.json',
 'openai-gpt-local\\tokenizer.json')

In [35]:
# Length of every sentence in characters (len() of each string), not in tokens.
stats = [len(sentence) for sentence in bookcorpusopen_list]

In [36]:
# Minimum, mean and maximum sentence length in characters.
min(stats), sum(stats) / len(stats), max(stats)

(1, 61.338552845926586, 460937)

In [37]:
import pandas as pd

In [38]:
# Wrap the character lengths in a single-column DataFrame ('length') and
# display it.
stats_df = pd.DataFrame(data = stats,columns=['length'])
stats_df

Unnamed: 0,length
0,78
1,62
2,62
3,198
4,121
...,...
74004223,95
74004224,86
74004225,111
74004226,107


In [39]:
import numpy as np

# Character-length percentiles at q = 1, 11, 21, ..., 91.
# One vectorised np.percentile call converts the (very long) Python list to an
# array a single time, instead of once per requested percentile.
# .tolist() keeps `percentiles` a plain list of floats, as before.
percentiles = np.percentile(stats, q=list(range(1, 100, 10))).tolist()
percentiles

[7.0, 17.0, 25.0, 33.0, 41.0, 51.0, 62.0, 76.0, 94.0, 124.0]

In [18]:
# Peek at the first 10 sentences of the corpus.
bookcorpusopen_list[:10]

['usually , he would be tearing around the living room , playing with his toys .',
 'but just one look at a minion sent him practically catatonic .',
 "that had been megan 's plan when she got him dressed earlier .",
 "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
 "`` are n't you being a good boy ? ''",
 'she said .',
 'mason barely acknowledged her .',
 'instead , his baby blues remained focused on the television .',
 'since the movie was almost over , megan knew she better slip into the bedroom and finish getting ready .']