In [44]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer and model
model_name = "gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))  # Adjust the model's embedding size

# Check if MPS (Metal Performance Shaders) is available
device = torch.device("mps" if torch.has_mps else "cpu")
model.to(device)

# Load the haiku dataset
dataset = load_dataset("davanstrien/haiku_kto")

# Inspect dataset keys
print(dataset['train'][0])

  device = torch.device("mps" if torch.has_mps else "cpu")


{'prompt': "Write a haiku about the elk's bugling in the forest.", 'completion': "Autumn leaves quiver,\nElk's call echoes through trees,\nNature's symphony.", 'label': False, 'label-suggestion': None, 'label-suggestion-metadata': {'type': None, 'score': None, 'agent': None}, 'external_id': None, 'metadata': '{"prompt": "Write a haiku about the elk\'s bugling in the forest.", "generation_model": "mistralai/Mistral-7B-Instruct-v0.2"}', 'messages': [{'content': "Write a haiku about the elk's bugling in the forest.", 'role': 'user'}, {'content': "Autumn leaves quiver,\nElk's call echoes through trees,\nNature's symphony.", 'role': 'assistant'}]}


In [45]:
# Split the training data into train and validation sets (90% train, 10% validation)
train_val_split = dataset['train'].train_test_split(test_size=0.1)
train_data = train_val_split['train']
val_data = train_val_split['test']

In [46]:
# Extract haiku texts and tokenize them
def extract_and_tokenize_function(batch):
    haikus = []
    for example in batch['messages']:
        haiku = next((message['content'] for message in example if message['role'] == 'assistant'), None)
        if haiku:
            haikus.append(haiku)
    tokenized = tokenizer(haikus, truncation=True, padding='max_length', max_length=50)
    input_ids = torch.tensor(tokenized['input_ids'])
    attention_mask = torch.tensor(tokenized['attention_mask'])
    labels = input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Apply tokenization function with batching
train_data = train_data.map(extract_and_tokenize_function, batched=True, remove_columns=train_data.column_names)
val_data = val_data.map(extract_and_tokenize_function, batched=True, remove_columns=val_data.column_names)

train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [47]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",  # Save strategy set to "epoch"
    evaluation_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)



In [48]:
# Fine-tune the model
trainer.train()

  0%|          | 0/123 [00:00<?, ?it/s]

  8%|▊         | 10/123 [03:42<47:21, 25.15s/it]

{'loss': 4.8446, 'grad_norm': 17.9531307220459, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.24}


 16%|█▋        | 20/123 [08:01<43:10, 25.15s/it]

{'loss': 4.6471, 'grad_norm': 15.744894981384277, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.49}


 24%|██▍       | 30/123 [12:08<37:26, 24.15s/it]

{'loss': 4.6936, 'grad_norm': 13.247477531433105, 'learning_rate': 3e-06, 'epoch': 0.73}


 33%|███▎      | 40/123 [14:49<17:39, 12.77s/it]

{'loss': 4.1773, 'grad_norm': 15.666923522949219, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.98}


                                                
 33%|███▎      | 41/123 [15:13<20:09, 14.75s/it]

{'eval_loss': 3.7638936042785645, 'eval_runtime': 5.3768, 'eval_samples_per_second': 1.86, 'eval_steps_per_second': 0.93, 'epoch': 1.0}


 41%|████      | 50/123 [18:09<17:57, 14.75s/it]

{'loss': 3.563, 'grad_norm': 13.233949661254883, 'learning_rate': 5e-06, 'epoch': 1.22}


 49%|████▉     | 60/123 [20:56<16:06, 15.35s/it]

{'loss': 3.4297, 'grad_norm': 12.892355918884277, 'learning_rate': 6e-06, 'epoch': 1.46}


 57%|█████▋    | 70/123 [22:56<08:17,  9.39s/it]

{'loss': 3.07, 'grad_norm': 13.70809555053711, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.71}


 65%|██████▌   | 80/123 [23:44<02:34,  3.60s/it]

{'loss': 2.9739, 'grad_norm': 14.871001243591309, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.95}


                                                
 67%|██████▋   | 82/123 [23:58<03:04,  4.51s/it]

{'eval_loss': 3.0768918991088867, 'eval_runtime': 3.1898, 'eval_samples_per_second': 3.135, 'eval_steps_per_second': 1.568, 'epoch': 2.0}


 73%|███████▎  | 90/123 [25:11<02:43,  4.95s/it]

{'loss': 2.6421, 'grad_norm': 14.238887786865234, 'learning_rate': 9e-06, 'epoch': 2.2}


 81%|████████▏ | 100/123 [25:49<01:42,  4.46s/it]

{'loss': 2.2724, 'grad_norm': 16.64166831970215, 'learning_rate': 1e-05, 'epoch': 2.44}


 89%|████████▉ | 110/123 [26:52<01:45,  8.13s/it]

{'loss': 2.5263, 'grad_norm': 19.697540283203125, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.68}


 98%|█████████▊| 120/123 [29:40<00:51, 17.23s/it]

{'loss': 2.3625, 'grad_norm': 13.184134483337402, 'learning_rate': 1.2e-05, 'epoch': 2.93}


                                                 
100%|██████████| 123/123 [30:28<00:00, 15.79s/it]

{'eval_loss': 2.963078737258911, 'eval_runtime': 2.17, 'eval_samples_per_second': 4.608, 'eval_steps_per_second': 2.304, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 123/123 [31:19<00:00, 15.28s/it]

{'train_runtime': 1879.9743, 'train_samples_per_second': 0.131, 'train_steps_per_second': 0.065, 'train_loss': 3.4029242856715753, 'epoch': 3.0}





TrainOutput(global_step=123, training_loss=3.4029242856715753, metrics={'train_runtime': 1879.9743, 'train_samples_per_second': 0.131, 'train_steps_per_second': 0.065, 'total_flos': 52279211520000.0, 'train_loss': 3.4029242856715753, 'epoch': 3.0})

In [49]:
# Save the fine-tuned model
model.save_pretrained("./fine-tuned-haiku-model")
tokenizer.save_pretrained("./fine-tuned-haiku-model")

('./fine-tuned-haiku-model/tokenizer_config.json',
 './fine-tuned-haiku-model/special_tokens_map.json',
 './fine-tuned-haiku-model/vocab.json',
 './fine-tuned-haiku-model/merges.txt',
 './fine-tuned-haiku-model/added_tokens.json')

In [50]:
# Function to generate haiku
def generate_haiku(prompt, model, tokenizer, max_length=30):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_mask, 
        max_length=max_length, 
        num_return_sequences=1, 
        no_repeat_ngram_size=2, 
        early_stopping= True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [51]:
# Test the fine-tuned model
prompt = "The oceans breeze"
haiku = generate_haiku(prompt, model, tokenizer)
print(f"Generated Haiku:\n{haiku}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Haiku:
The oceans breeze,
Nature's symphony,


Life's gentle symphonic dance.

...

.

