In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Assuming you already have a DataFrame with columns 'poem' and 'topic'
# Load your DataFrame
df = pd.read_csv("/kaggle/input/poem-classification-dataset/data.csv")

# Save the poems to a text file, required for training
with open('poems.txt', 'w') as f:
    for poem in df['poem']:
        f.write(poem + '\n\n')

# Load the tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

2024-05-17 22:20:30.265305: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 22:20:30.265413: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 22:20:30.391444: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
# Create a dataset from the poems text file
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset

In [3]:
train_dataset = load_dataset('/kaggle/working/poems.txt', tokenizer)



In [4]:
# Create a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [5]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

In [6]:
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
## Fine-tune the model
import os
os.environ['WANDB_MODE'] = 'disabled'

trainer.train()

Step,Training Loss
500,4.0408
1000,4.0621
1500,4.047
2000,4.0709
2500,4.0708
3000,4.0663
3500,4.1011
4000,4.0865
4500,4.0593
5000,4.0778


TrainOutput(global_step=8892, training_loss=4.089257367709387, metrics={'train_runtime': 899.2714, 'train_samples_per_second': 39.552, 'train_steps_per_second': 9.888, 'total_flos': 2323408748544000.0, 'train_loss': 4.089257367709387, 'epoch': 1.0})

In [16]:
trainer.save_model("./result_train2")
tokenizer.save_pretrained("./result_train2")

('./result_train2/tokenizer_config.json',
 './result_train2/special_tokens_map.json',
 './result_train2/vocab.json',
 './result_train2/merges.txt',
 './result_train2/added_tokens.json')

In [17]:
# Load the fine-tuned model for generation
model = GPT2LMHeadModel.from_pretrained('/kaggle/working/result_train2')
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [22]:
# Function to generate poems from a prompt
def generate_poem(prompt, model, tokenizer, max_length=200, temperature=0.7):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=max_length, temperature=temperature, num_return_sequences=1, num_beams=5, no_repeat_ngram_size=2)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [24]:
# Example usage
prompt = "Long live the king"
generated_poem = generate_poem(prompt, model, tokenizer)
print(generated_poem)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Long live the king’s blood,
And the blood of his sons.

The king is dead, and the sons
Are dead. The king was born
In the land of the dead;
He was the son of a king
Who died in his own blood.




(from “The Song of Solomon” by Robert Klee, translated from the Spanish by David S. Lewis, published by the University of California Press, 2001)

                                                 “I have heard the voice of God,          And I have seen the face of Heav'n.                                                 The Lord is in the midst of all things;             He is the King of Israel, the Lord of hosts, The Lord and God of men; He is God and man; and he is good and wise; And he hath power and will.     And He hath the power
