In [None]:
import kagglehub

# Kaggle "<owner>/<dataset>" handle of the corpus to fetch.
DATASET_HANDLE = "paultimothymooney/poetry"

# Fetch the dataset and keep the location that kagglehub reports for its files.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
!pip install datasets

In [None]:
import os
from datasets import Dataset

# Use the directory returned by kagglehub.dataset_download() in the first cell
# instead of a hard-coded ".../versions/16" path, which breaks as soon as the
# dataset publishes a new version or the cache lives somewhere else.
dataset_path = path

# Read every non-empty .txt file in the dataset directory into a list of
# {"text": ...} records. Filenames are sorted so the record order (and
# therefore any later seeded split) is the same on every run.
text_data = []
for filename in sorted(os.listdir(dataset_path)):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(dataset_path, filename), "r", encoding="utf-8") as f:
        text = f.read().strip()
    if text:  # Skip empty files
        text_data.append({"text": text + " <|endoftext|>"})  # Add separator token

# Fail here with a clear message rather than later inside tokenization/training.
if not text_data:
    raise ValueError(f"No non-empty .txt files found in {dataset_path!r}")

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(text_data)

In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 vocabulary and reuse its end-of-sequence token as
# the padding token; the fixed-length padding below needs a pad token to be set.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch into fixed-length sequences.

    Every text is truncated to at most 512 tokens and padded up to 512 tokens,
    so all rows come out the same length. Anything beyond the first 512 tokens
    of a text is dropped.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer batch-wise over the whole dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
def _add_lm_labels(examples):
    """Attach causal-LM labels to a tokenized batch.

    Labels are a copy of ``input_ids`` except at padded positions
    (``attention_mask == 0``), which are set to -100 so the loss ignores them.
    Masking by attention mask rather than by token id matters here because the
    pad token is the EOS token: a real end-of-text token inside the sequence
    keeps its label, while padding does not contribute to the loss.
    """
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(examples["input_ids"], examples["attention_mask"])
    ]
    return {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": labels,
    }


tokenized_dataset = tokenized_dataset.map(_add_lm_labels, batched=True)

In [None]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that the same
# rows land in train/eval on every run, keeping evaluation numbers comparable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# Start from the pretrained GPT-2 checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./lyrics_generator",
    # Optimisation settings
    learning_rate=5e-5,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    # Logging, checkpointing and evaluation cadence (all counted in steps)
    logging_steps=100,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    # Do not report to any experiment-tracking integration (e.g. WandB)
    report_to="none",
    # Leave the dataset columns untouched instead of letting Trainer prune them
    remove_unused_columns=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Write the model weights and the tokenizer files into the same directory so
# the pair can be reloaded together from that one path.
save_dir = "./fine_tuned_lyrics_gpt2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved checkpoint directory,
# reusing the in-memory tokenizer.
lyrics_generator = pipeline(
    task="text-generation",
    model="./fine_tuned_lyrics_gpt2",
    tokenizer=tokenizer,
)

# Generate one continuation of the prompt and print its text.
prompt = "When the night comes"
outputs = lyrics_generator(prompt, max_length=100)
print(outputs[0]["generated_text"])