In [3]:
import kagglehub

# Fetch the newest published version of the poetry dataset via kagglehub;
# the call's return value is stored in `path` and echoed below.
path = kagglehub.dataset_download("paultimothymooney/poetry")

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/poetry


Step 1: Install Hugging Face Transformers and Datasets

In [4]:
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

Imports

In [5]:
import pandas as pd
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer,
    TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, pipeline, set_seed
)
import os

In [9]:
dataset_path = "/kaggle/input/poetry"
output_file = "poems.txt"

# Merge every .txt file in the dataset folder into a single training corpus.
# os.listdir() returns names in arbitrary order, so sort them to make the
# combined file (and therefore the training data order) identical on every run.
txt_filenames = sorted(
    name for name in os.listdir(dataset_path) if name.endswith(".txt")
)

with open(output_file, "w", encoding="utf-8") as outfile:
    for filename in txt_filenames:
        file_path = os.path.join(dataset_path, filename)
        # Read and close each input before writing, so only one input handle
        # is ever open at a time.
        with open(file_path, "r", encoding="utf-8") as infile:
            content = infile.read().strip()
        # A blank line separates the contents of consecutive source files.
        outfile.write(content + "\n\n")

print(f"✅ Combined all artist lyrics into: {output_file}")

✅ Combined all artist lyrics into: poems.txt


In [10]:
dataset_path = "/kaggle/input/poetry"
# Show the raw directory listing under a heading (one print call, two lines).
folder_entries = os.listdir(dataset_path)
print("Files in dataset folder:", folder_entries, sep="\n")

Files in dataset folder:
['Kanye_West.txt', 'johnny-cash.txt', 'kanye-west.txt', 'bruno-mars.txt', 'dickinson.txt', 'amy-winehouse.txt', 'blink-182.txt', 'paul-simon.txt', 'patti-smith.txt', 'bieber.txt', 'disney.txt', 'jimi-hendrix.txt', 'lin-manuel-miranda.txt', 'adele.txt', 'dj-khaled.txt', 'beatles.txt', 'r-kelly.txt', 'lady-gaga.txt', 'radiohead.txt', 'britney-spears.txt', 'alicia-keys.txt', 'rihanna.txt', 'joni-mitchell.txt', 'dolly-parton.txt', 'drake.txt', 'Lil_Wayne.txt', 'notorious_big.txt', 'eminem.txt', 'janisjoplin.txt', 'prince.txt', 'bruce-springsteen.txt', 'bob-dylan.txt', 'notorious-big.txt', 'lil-wayne.txt', 'dr-seuss.txt', 'nicki-minaj.txt', 'bob-marley.txt', 'al-green.txt', 'nickelback.txt', 'michael-jackson.txt', 'lorde.txt', 'kanye.txt', 'leonard-cohen.txt', 'ludacris.txt', 'bjork.txt', 'nursery_rhymes.txt', 'nirvana.txt', 'cake.txt', 'missy-elliott.txt']


In [11]:
# 🔹 Step 5: Load tokenizer and model
# Tokenizer and model are both loaded from the same pretrained checkpoint name.
base_checkpoint = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token as the padding token (required by Trainer).
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

In [12]:
# 🔹 Step 6: Dataset creation
def load_dataset(file_path, tokenizer, block_size=64):
    """Build a ``TextDataset`` from a plain-text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded unchanged as ``TextDataset``'s ``block_size``
            argument (defaults to 64).

    Returns:
        The ``TextDataset`` instance created from the given arguments.
    """
    # TextDataset is already imported in the notebook's import cell, so the
    # previous function-local re-import was redundant and has been dropped.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def get_data_collator(tokenizer):
    """Create the batch collator used for language-model training.

    Args:
        tokenizer: Tokenizer passed through to ``DataCollatorForLanguageModeling``.

    Returns:
        A ``DataCollatorForLanguageModeling`` constructed with ``mlm=False``.
    """
    collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
    return collator

# Build the training dataset from the combined corpus file, then the collator
# that will assemble its examples into batches.
train_dataset = load_dataset(
    file_path="poems.txt",
    tokenizer=tokenizer,
    block_size=64,
)
data_collator = get_data_collator(tokenizer=tokenizer)



In [14]:

# 🔹 Step 7: Training setup
# Collect the run options in one mapping so they are easy to scan and tweak,
# then unpack them into TrainingArguments.
training_options = {
    "output_dir": "./gpt2-poetry",
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "save_steps": 6000,
    "save_total_limit": 2,
    "logging_steps": 500,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# Additionally set the Weights & Biases opt-out flag in the process environment.
os.environ.update({"WANDB_DISABLED": "true"})
