# GPT2-large Training Experiments

In [None]:
import transformers
import torch

### PARAMETERS ###
config = transformers.GPT2Config.from_pretrained("gpt2-large")
# The checkpoint's text-generation defaults live in a nested dict; keep a handle on it
# so that the edits below land in the config object itself.
text_gen_params = config.task_specific_params['text-generation']
# Copy the sampling flag to the top-level config (intended to give varied text for each prompt).
config.do_sample = text_gen_params['do_sample']
# Use a generation length limit of 200 both on the config and in the task defaults.
config.max_length = 200
text_gen_params['max_length'] = 200

print(config)

### MODEL AND TOKENIZER ###
model = transformers.GPT2LMHeadModel.from_pretrained("gpt2-large", config=config)
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-large")

## Tokenizer
The tokenizer splits a text into tokens (whole words or word pieces) and assigns each token an integer ID. GPT-2 works on these IDs rather than on the raw text.

In [None]:
# Encode a batch holding a single prompt; return_tensors='pt' requests PyTorch tensors.
prompts = ["The elf queen"]
enc = tokenizer(prompts, return_tensors='pt')
print('enc =', enc)
# Decode the ids again to check the round trip back to text.
decoded_prompts = tokenizer.batch_decode(enc['input_ids'])
print(decoded_prompts)

## Text Generation
Use the tokenizer output to prompt the GPT-2 model. The tokenizer has already encoded the prompt as token IDs that the model can process; the model's output is also a sequence of token IDs, so it must be decoded back into text before we can read it.

In [None]:
# Continue the encoded prompt; max_length=50 is passed explicitly for this call.
generation_inputs = {
    'input_ids': enc['input_ids'],
    'attention_mask': enc['attention_mask'],
}
out = model.generate(**generation_inputs, max_length=50)

print('out=', out)
# The generated ids are decoded back to text for reading.
print(tokenizer.batch_decode(out))

## Training from the GPT1 Paper Dataset
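We will define a custom dataset class for the text, then use the `Trainer` class from the Transformers library to fine-tune the model on it without labels: the model learns to predict the next token, so the text itself serves as the training target.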

In [None]:
# Helper functions
import transformers
import torch

def break_text_to_pieces(text_path: str, tokenizer: "transformers.PreTrainedTokenizer", block_len: int = 512) -> list[list[int]]:
    """Read a text file and convert it to fixed-length blocks of token ids.

    The whole file is tokenized in one call and cut into consecutive chunks of
    ``block_len - 1`` tokens. The tokenizer's end-of-text id is appended to
    every chunk, so each returned block holds exactly ``block_len`` ids. A
    trailing chunk that would be shorter than ``block_len`` is dropped.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        tokenizer: Object providing ``encode(text)`` (returning a sequence of
            token ids) and an ``eos_token_id`` attribute.
        block_len: Length of every returned block, end-of-text id included.

    Returns:
        A list of blocks, each a list of ``block_len`` token ids. The list is
        empty when the file holds fewer than ``block_len - 1`` tokens.

    Raises:
        ValueError: If ``block_len`` is smaller than 2 (a chunk would then hold
            no text tokens and the chunking could never advance).
    """
    if block_len < 2:
        raise ValueError(f"block_len must be at least 2, got {block_len}")

    # The corpus files in this notebook are written as UTF-8, so read them the same way.
    with open(text_path, encoding="utf-8") as f:
        text = f.read()

    chunk_len = block_len - 1  # Leave space for the end-of-text id
    tokens = tokenizer.encode(text)
    # Take the end-of-text id from the tokenizer rather than from a global
    # constant that is only defined in a later cell.
    eos_id = tokenizer.eos_token_id

    blocks = []
    for pos in range(0, len(tokens), chunk_len):
        chunk = list(tokens[pos: pos + chunk_len])
        chunk.append(eos_id)
        blocks.append(chunk)

    # Only the last block can be short; guard against an empty corpus first.
    if blocks and len(blocks[-1]) < block_len:
        del blocks[-1]

    return blocks

def train_val_split(data: list[list[int]], ratio: float) -> tuple[list[list[int]], list[list[int]]]:
    """Split ``data`` into a training part and a validation part.

    The first ``n_val`` items form the validation set and the remaining items
    the training set. ``n_val`` is ``int(len(data) * ratio)`` clamped to the
    range ``[1, len(data) - 1]`` so that neither part is ever empty.

    Args:
        data: Items to split (token-id blocks in this notebook).
        ratio: Fraction of the items to use for validation.

    Returns:
        A ``(train, validation)`` tuple of lists.

    Raises:
        AssertionError: If ``data`` holds fewer than two items.
    """
    n = len(data)
    assert n >= 2, "need at least two items to build both a training and a validation split"
    # Clamp on both sides: at least one validation item, at least one training item.
    n_val = min(n - 1, max(1, int(n * ratio)))
    return data[n_val:], data[:n_val]

def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int, val_ratio: float = 0.2):
    """Read a text file and build the training and validation datasets.

    The text is cut into token blocks with ``break_text_to_pieces``, the blocks
    are divided with ``train_val_split``, and each part is wrapped in ``MyDset``.

    Args:
        text_path: Path of the text file to read.
        tokenizer: Tokenizer used to encode the text.
        block_len: Number of token ids per block.
        val_ratio: Fraction of the blocks used for validation (0.2 by default).

    Returns:
        A ``(train_dataset, validation_dataset)`` tuple of ``MyDset`` objects.
    """
    blocks = break_text_to_pieces(text_path, tokenizer, block_len)
    blocks_train, blocks_val = train_val_split(blocks, val_ratio)
    return MyDset(blocks_train), MyDset(blocks_val)

In [None]:
# Custom Dataset
class MyDset(torch.utils.data.Dataset):
    """In-memory dataset built from pre-tokenized blocks.

    Every block becomes a dict with three entries: 'input_ids' (the block as
    an int64 tensor), 'attention_mask' (int64 ones, one per id) and 'labels'
    (the very same tensor object as 'input_ids').
    """

    def __init__(self, data: list[list[int]]):
        # All examples are converted to tensors up front.
        self.data = [self._to_example(block) for block in data]

    @staticmethod
    def _to_example(block):
        """Turn one block of token ids into a single example dict."""
        ids = torch.tensor(block, dtype=torch.int64)
        return {
            'input_ids': ids,
            'attention_mask': torch.ones(len(block), dtype=torch.int64),
            'labels': ids,
        }

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return the example dict stored at position ``idx``."""
        return self.data[idx]

In [None]:
# Training configuration
# Collect the options in one place, then build the arguments object from them.
training_options = {
    'output_dir': "gpt1_save",
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained model - confirm it is intended.
    'learning_rate': 1e-3,
    'per_device_train_batch_size': 1,
    'per_device_eval_batch_size': 1,
    'num_train_epochs': 20,
    'evaluation_strategy': 'epoch',  # evaluate once per epoch
    'save_strategy': 'no',           # no checkpoints are written during training
}
training_args = transformers.TrainingArguments(**training_options)

In [None]:
# Fine-tune GPT-2 on the text of the GPT-1 paper
TOKEN_ENDOFTEXT = 50256  # <|endoftext|>


# NOTE: this redefines the helper of the same name from the helper-functions cell.
# prepare_dsets looks the function up by name when it is called, so every call made
# after this cell has run uses this padding variant.
def break_text_to_pieces(text_path: str, tokenizer: "transformers.PreTrainedTokenizer", block_len: int = 1024) -> list[list[int]]:
    """Read a text file and convert it to fixed-length blocks of token ids.

    The whole file is tokenized in one call and cut into consecutive chunks of
    ``block_len - 1`` tokens. A final chunk that is too short is padded up to
    ``block_len - 1`` ids, and the tokenizer's end-of-text id is appended to
    every chunk, so each returned block holds exactly ``block_len`` ids.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        tokenizer: Object providing ``encode(text)`` (returning a sequence of
            token ids) plus ``eos_token_id`` and ``pad_token_id`` attributes.
        block_len: Length of every returned block, end-of-text id included.

    Returns:
        A list of blocks, each a list of ``block_len`` token ids; empty when
        the file yields no tokens.

    Raises:
        ValueError: If ``block_len`` is smaller than 2 (a chunk would then hold
            no text tokens and the chunking could never advance).
    """
    if block_len < 2:
        raise ValueError(f"block_len must be at least 2, got {block_len}")

    with open(text_path, encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.encode(text)

    eos_id = tokenizer.eos_token_id
    # pad_token_id may be None (presumably the case for the GPT-2 tokenizer used
    # in this notebook - confirm); fall back to the end-of-text id so that the
    # blocks never contain None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id

    chunk_len = block_len - 1  # Leave space for the end-of-text id
    blocks = []
    for pos in range(0, len(tokens), chunk_len):
        chunk = list(tokens[pos: pos + chunk_len])
        chunk.extend([pad_id] * (chunk_len - len(chunk)))  # Pad the last chunk if needed
        chunk.append(eos_id)
        blocks.append(chunk)

    return blocks
TEXT_CORPUS = 'gpt1_paper.txt'
BLOCK_LEN = 512

# Tokenizer and pretrained GPT-2 weights.
# Alternative kept for reference - build the model from a default config instead:
#   model = transformers.GPT2LMHeadModel(transformers.GPT2Config())
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

# Build the training / validation datasets and show the length of the first training block
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)
first_example = dset_train[0]
print(len(first_example['input_ids']))

trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': dset_train,
    'eval_dataset': dset_val,
}
trainer = transformers.Trainer(**trainer_options)
trainer.train()

In [None]:
# Unsupervised training with reuters dataset turned into corpus
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

# Read the raw text of every document in the Reuters corpus
file_ids = reuters.fileids()
combined_text = [reuters.raw(file_id) for file_id in file_ids]

# Join the documents into one corpus string, separated by single newlines
corpus = '\n'.join(combined_text)

# Persist the corpus as a UTF-8 text file so the training helpers can read it from disk
with open('reuters_corpus.txt', 'w', encoding='utf-8') as file:
    file.write(corpus)

# Fine-tune GPT-2 on the Reuters corpus
TOKEN_ENDOFTEXT = 50256  # <|endoftext|>
BLOCK_LEN = 1024
# NOTE(review): the corpus is written to 'reuters_corpus.txt' earlier in this cell, but a
# differently named file is read here - presumably a manually trimmed copy; confirm it exists.
TEXT_CORPUS = "reuters_corpus_small.txt"

# Load model and tokenizer
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
# model = transformers.GPT2LMHeadModel(transformers.GPT2Config())
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')

# Create datasets and loader
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)
# Sanity check: print the length of the first training blocks. The range is capped at
# the dataset size so a small corpus (fewer than 100 blocks) does not raise IndexError.
for i in range(min(100, len(dset_train))):
    print(len(dset_train[i]['input_ids']))

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
)
trainer.train()

In [None]:
# Save the training
# Model and tokenizer go to the same directory so they can be reloaded together.
reuters_save_dir = './gpt2_reuters_large/'
model.save_pretrained(reuters_save_dir)
tokenizer.save_pretrained(reuters_save_dir)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the saved model and tokenizer
model_path = "./gpt2_reuters_large/"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Test loop: keeps prompting until an empty prompt is entered, the input stream
# ends, or the cell is interrupted. Without an exit condition the cell would never
# finish and no later cell could run.
while True:
    try:
        prompt = input("Enter your prompt (leave empty to stop): ")
    except (EOFError, KeyboardInterrupt):
        break
    if not prompt.strip():
        # An empty prompt carries nothing to generate from; treat it as "stop".
        break

    enc = tokenizer(prompt, return_tensors='pt').to(device)
    print('enc =', enc)
    print(tokenizer.batch_decode(enc['input_ids']))

    out = model.generate(input_ids=enc['input_ids'],
                         attention_mask=enc['attention_mask'],
                         max_length=50)

    print('Output:', tokenizer.batch_decode(out))


## OpenWebText

In [1]:
# Download and format the dataset into a corpus of text
from datasets import load_dataset

# Load the OpenWebText-10k dataset
openwebtext_dataset = load_dataset("stas/openwebtext-10k")
train_split = openwebtext_dataset["train"]

# Print information about the dataset
print(openwebtext_dataset)
print(train_split)
# Slice the rows first and pick the column afterwards, so only one row is read
# instead of loading the whole 'text' column just to show its first entry.
print(train_split[:1]["text"])

# Write the corpus to a .txt file: one document after another, each followed by a blank line.
# Iterating over the rows avoids building a second in-memory list of all texts.
with open("openwebtext-10k.txt", 'w', encoding='utf-8') as file:
    for record in train_split:
        file.write(record["text"] + "\n\n")

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})
Dataset({
    features: ['text'],
    num_rows: 10000
})
["A magazine supplement with an image of Adolf Hitler and the title 'The Unreadable Book' is pictured in Berlin. No law bans “Mein Kampf” in Germany, but the government of Bavaria, holds the copyright and guards it ferociously. (Thomas Peter/REUTERS)\n\nThe city that was the center of Adolf Hitler’s empire is littered with reminders of the Nazi past, from the bullet holes that pit the fronts of many buildings to the hulking Luftwaffe headquarters that now house the Finance Ministry.\n\nWhat it doesn’t have, nor has it since 1945, are copies of Hitler’s autobiography and political manifesto, “Mein Kampf,” in its bookstores. The latest attempt to publish excerpts fizzled this week after the Bavarian government challenged it in court, although an expurgated copy appeared at newspaper kiosks around the country.\n\nBut in Germany — where k

In [2]:
# Helper functions
import transformers
import torch

def break_text_to_pieces(text_path: str, tokenizer: "transformers.PreTrainedTokenizer", block_len: int = 512) -> list[list[int]]:
    """Read a text file and convert it to fixed-length blocks of token ids.

    The whole file is tokenized in one call and cut into consecutive chunks of
    ``block_len - 1`` tokens. The tokenizer's end-of-text id is appended to
    every chunk, so each returned block holds exactly ``block_len`` ids. A
    trailing chunk that would be shorter than ``block_len`` is dropped.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        tokenizer: Object providing ``encode(text)`` (returning a sequence of
            token ids) and an ``eos_token_id`` attribute.
        block_len: Length of every returned block, end-of-text id included.

    Returns:
        A list of blocks, each a list of ``block_len`` token ids. The list is
        empty when the file holds fewer than ``block_len - 1`` tokens.

    Raises:
        ValueError: If ``block_len`` is smaller than 2 (a chunk would then hold
            no text tokens and the chunking could never advance).
    """
    if block_len < 2:
        raise ValueError(f"block_len must be at least 2, got {block_len}")

    # The corpus file is written as UTF-8 by the download cell, so read it the same way.
    with open(text_path, encoding="utf-8") as f:
        text = f.read()

    chunk_len = block_len - 1  # Leave space for the end-of-text id
    # Encoding the full corpus in one call can trigger the tokenizer's "sequence length is
    # longer than the specified maximum" warning; the text is cut into blocks right below.
    tokens = tokenizer.encode(text)
    # Take the end-of-text id from the tokenizer rather than from a global
    # constant that is only defined in a later cell.
    eos_id = tokenizer.eos_token_id

    blocks = []
    for pos in range(0, len(tokens), chunk_len):
        chunk = list(tokens[pos: pos + chunk_len])
        chunk.append(eos_id)
        blocks.append(chunk)

    # Only the last block can be short; guard against an empty corpus first.
    if blocks and len(blocks[-1]) < block_len:
        del blocks[-1]

    return blocks

def train_val_split(data: list[list[int]], ratio: float) -> tuple[list[list[int]], list[list[int]]]:
    """Split ``data`` into a training part and a validation part.

    The first ``n_val`` items form the validation set and the remaining items
    the training set. ``n_val`` is ``int(len(data) * ratio)`` clamped to the
    range ``[1, len(data) - 1]`` so that neither part is ever empty.

    Args:
        data: Items to split (token-id blocks in this notebook).
        ratio: Fraction of the items to use for validation.

    Returns:
        A ``(train, validation)`` tuple of lists.

    Raises:
        AssertionError: If ``data`` holds fewer than two items.
    """
    n = len(data)
    assert n >= 2, "need at least two items to build both a training and a validation split"
    # Clamp on both sides: at least one validation item, at least one training item.
    n_val = min(n - 1, max(1, int(n * ratio)))
    return data[n_val:], data[:n_val]

def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int, val_ratio: float = 0.2):
    """Read a text file and build the training and validation datasets.

    The text is cut into token blocks with ``break_text_to_pieces``, the blocks
    are divided with ``train_val_split``, and each part is wrapped in ``MyDset``.

    Args:
        text_path: Path of the text file to read.
        tokenizer: Tokenizer used to encode the text.
        block_len: Number of token ids per block.
        val_ratio: Fraction of the blocks used for validation (0.2 by default).

    Returns:
        A ``(train_dataset, validation_dataset)`` tuple of ``MyDset`` objects.
    """
    blocks = break_text_to_pieces(text_path, tokenizer, block_len)
    blocks_train, blocks_val = train_val_split(blocks, val_ratio)
    return MyDset(blocks_train), MyDset(blocks_val)

# Custom Dataset
class MyDset(torch.utils.data.Dataset):
    """Dataset that keeps every tokenized block in memory as ready-made tensors.

    An item is a dict holding 'input_ids' (int64 tensor of the block),
    'attention_mask' (int64 ones with one entry per id) and 'labels', which is
    the same tensor object as 'input_ids'.
    """

    def __init__(self, data: list[list[int]]):
        examples = []
        for token_ids in data:
            ids = torch.tensor(token_ids, dtype=torch.int64)
            mask = torch.ones(len(token_ids), dtype=torch.int64)
            example = {'input_ids': ids, 'attention_mask': mask, 'labels': ids}
            examples.append(example)
        self.data = examples

    def __len__(self):
        """Return how many examples are stored."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return the example dict at index ``idx``."""
        return self.data[idx]

In [None]:
# Training configuration
training_args = transformers.TrainingArguments(
    output_dir="gpt1_save",
    learning_rate=1e-3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=20,
    evaluation_strategy='epoch',  # evaluate once per epoch
    save_strategy='no',           # no checkpoints are written during training
    # Enable mixed precision training only when a CUDA device is present; requesting
    # fp16 unconditionally is expected to fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)

# Train the model with openwebtext-10k

TOKEN_ENDOFTEXT = 50256  # <|endoftext|>
# NOTE(review): the download cell writes "openwebtext-10k.txt"; the file read here has a
# different name - presumably a manually trimmed copy; confirm it exists.
TEXT_CORPUS = "openwebtext-10k-small.txt"
BLOCK_LEN = 1024

# Tokenizer and pretrained GPT-2 weights.
# Alternative kept for reference - build the model from a default config instead:
#   model = transformers.GPT2LMHeadModel(transformers.GPT2Config())
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

# Build the training / validation datasets
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)

trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': dset_train,
    'eval_dataset': dset_val,
}
trainer = transformers.Trainer(**trainer_options)
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (1953143 > 1024). Running this sequence through the model will result in indexing errors


Epoch,Training Loss,Validation Loss


In [None]:
# Save the training
# Model and tokenizer go to the same directory so they can be reloaded together.
openwebtext_save_dir = './gpt2_openwebtext10k_large/'
model.save_pretrained(openwebtext_save_dir)
tokenizer.save_pretrained(openwebtext_save_dir)