In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os 
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import numpy as np
import random
torch.manual_seed(42)

<torch._C.Generator at 0x71a194292cd0>

In [31]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') #gpt2-medium
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2") # Fallback padding
from transformers import get_linear_schedule_with_warmup ,TrainingArguments

In [32]:
class GPT2Dataset(Dataset):
    def __init__(self, poem_dirs, tokenizer, max_length=768):
        self.poem_dirs = poem_dirs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []
        self.txt_list = []
        
        for poem_dir in self.poem_dirs:
            for root, _, files in os.walk(poem_dir):
                for file in files:
                    file = f"/{root}/{file}"
                    with open(file, encoding='utf-8') as f:
                            poem = f.read()
                            self.txt_list.append(poem)
     

    
        for txt in self.txt_list:
                    encodings_dict = self.tokenizer(
                        '<|startoftext|>' + txt + '<|endoftext|>',
                        truncation=True,
                        max_length=max_length,
                        padding="max_length",
                        return_tensors='pt'
                    )
                    # Remove the batch dimension (1, max_length) -> (max_length,)
                    self.input_ids.append(encodings_dict['input_ids'].squeeze(0))
                    self.attn_masks.append(encodings_dict['attention_mask'].squeeze(0))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
            return {
                'input_ids': self.input_ids[idx],
                'attention_mask': self.attn_masks[idx],
                'labels': self.input_ids[idx]  # For language modeling, labels are the same as input_ids
            }

In [33]:
poem_dirs = ['~/Desktop/poem_generator/forms', '~/Desktop/poem_generator/topics']  # Adjusted for consistency

# Expand user home directory and ensure absolute paths
poem_dirs = [os.path.expanduser(path) for path in poem_dirs]


dataset = GPT2Dataset(poem_dirs, tokenizer, max_length=768)
print(dataset[0])
print(len(dataset))

{'input_ids': tensor([   27,    91,  9688,  1659,  5239,    91,    29,   464, 11303,  6864,
        46961,   579,    64,   198, 21816,  1987,    11, 12131,   198,    34,
          808,    11, 37593,    11, 37593,   198,  1026,   318,  1903,   285,
         1211,   198,    33,  7252,    11,   275,  7252,    11,   275,  7252,
          198,    33,   707,    75,    11,   275,   707,    75,    11,   275,
          707,    75,   198, 43413,   265,    11,  7245,   265,  2555,    26,
          788,   257,   198, 31632,  5238, 48484,   903,    13, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

In [34]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

18,591 training samples
2,066 validation samples


In [35]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [36]:
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
sample_every = 100

In [37]:
optimizer = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [38]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=optimizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,9.4504
20,8.7329
30,7.989
40,5.7235
50,3.7475
60,2.2576
70,1.4881
