In [28]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/poetry")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/poetry


In [29]:
!pip install datasets



In [38]:
import os
from datasets import Dataset

# Assuming your Kaggle dataset is downloaded to a directory
dataset_path = "/root/.cache/kagglehub/datasets/paultimothymooney/poetry/versions/16"

# Read all text files into a list
text_data = []
for filename in os.listdir(dataset_path):
    if filename.endswith(".txt"):
        with open(os.path.join(dataset_path, filename), "r", encoding="utf-8") as f:
            text = f.read().strip()
            if text:  # Skip empty files
                text_data.append({"text": text + " <|endoftext|>"})  # Add separator token

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(text_data)

In [46]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Critical fix

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length"  # Uniform length
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [47]:
tokenized_dataset = tokenized_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["input_ids"].copy()  # Labels = inputs for LM
    },
    batched=True
)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [48]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [51]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Training configuration
training_args = TrainingArguments(
    output_dir="./lyrics_generator",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_steps=100,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=5e-5,
    report_to="none",  # Disables WandB
    remove_unused_columns=False  # Critical fix
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=55, training_loss=2.7912861217151987, metrics={'train_runtime': 41.7155, 'train_samples_per_second': 5.274, 'train_steps_per_second': 1.318, 'total_flos': 57484247040000.0, 'train_loss': 2.7912861217151987, 'epoch': 5.0})

In [52]:
model.save_pretrained("./fine_tuned_lyrics_gpt2")
tokenizer.save_pretrained("./fine_tuned_lyrics_gpt2")

('./fine_tuned_lyrics_gpt2/tokenizer_config.json',
 './fine_tuned_lyrics_gpt2/special_tokens_map.json',
 './fine_tuned_lyrics_gpt2/vocab.json',
 './fine_tuned_lyrics_gpt2/merges.txt',
 './fine_tuned_lyrics_gpt2/added_tokens.json')

In [56]:
from transformers import pipeline

lyrics_generator = pipeline(
    "text-generation",
    model="./fine_tuned_lyrics_gpt2",
    tokenizer=tokenizer
)

print(lyrics_generator("When the night comes", max_length=100)[0]["generated_text"])

Device set to use cuda:0


When the night comes

And the moon rises in it, to see us
The dream that makes us fall
And all that's to be forgotten
The dream is coming true.

And you, like me, have told me all
The tale of all this
And you had this one word to tell of it all
And this word is still the same
And what do I know how to tell it all
So tell me what is the true meaning of this dream

