In [2]:
# 🔹 Step 1: Install required libraries
!pip install -q transformers datasets

from google.colab import files
import os
import torch


# 🔹 Step 2: Import libraries
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed
)

In [3]:
# 🔹 Step 3: Upload your lyrics file (e.g., lyrics.txt)
print("Please upload a plain .txt file of English song lyrics...")
uploaded = files.upload()
lyrics_file = list(uploaded.keys())[0]  # e.g., 'lyrics.txt'

Please upload a plain .txt file of English song lyrics...


Saving lyrics.txt to lyrics (1).txt


In [4]:
# 🔹 Step 4: Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [9]:

# 🔹 Step 5: Load dataset and data collator
def load_dataset(file_path, tokenizer, block_size=32):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

def get_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # GPT2 is a causal LM
    )

train_dataset = load_dataset(lyrics_file, tokenizer)
data_collator = get_data_collator(tokenizer)




In [10]:
# 🔹 Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100
)


In [11]:
# 🔹 Step 7: Trainer setup and training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

print("\n🔁 Training GPT-2 on your lyrics...")
trainer.train()





🔁 Training GPT-2 on your lyrics...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mudvik4321[0m ([33mudvik4321-chaitanya-bharathi-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=3, training_loss=3.4840380350748696, metrics={'train_runtime': 300.8761, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.01, 'total_flos': 48992256000.0, 'train_loss': 3.4840380350748696, 'epoch': 3.0})

In [8]:
# Check file content
with open(lyrics_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    print(f"✅ Uploaded file has {len(lines)} lines")
    print("📝 Example line:", lines[0] if lines else "File is empty")


✅ Uploaded file has 8 lines
📝 Example line: I'm walking through the fire



In [12]:

# 🔹 Step 8: Save model
model_path = "./gpt2-lyrics-final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"✅ Model saved to: {model_path}")


✅ Model saved to: ./gpt2-lyrics-final


In [13]:
# 🔹 Step 9: Generate lyrics from prompt
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
set_seed(42)

print("\n🎵 Sample Generated Lyrics:\n")
prompts = [
    "I loved her from the start",
    "Dancing under the moon",
    "Heartbreak on a rainy night",
    "Dreams fade into",
    "She walked away with"
]

for prompt in prompts:
    result = generator(prompt, max_length=50, num_return_sequences=1)
    print(f"> {prompt}...\n{result[0]['generated_text']}\n")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🎵 Sample Generated Lyrics:

> I loved her from the start...
I loved her from the start just the way she was, and she had this amazing and beautiful look and everything she knew well. She was a beautiful piece of jewelry she just was. She was really funny. This wasn't a whole lot of time

> Dancing under the moon...
Dancing under the moonlight

With just a couple of days left before the main event, it was time with a heavy focus on the tournament.

"I see this tournament already to see what I could do!"

Everyone from

> Heartbreak on a rainy night...
Heartbreak on a rainy night has brought her all the warmth and life and romance. And now the world seems to be over and things to do in their lives are changing.
"You don't get a chance to die yet!" Lissa says

> Dreams fade into...
Dreams fade into darkness, at last he had never before felt so alone as he saw them alone, and soon again, for ever, he felt like to sleep but one night.

* * * * * *


"I

> She walked away with...
She walked

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/poetry")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/poetry


In [3]:
# 🔹 Step 1: Install Hugging Face Transformers
!pip install -q transformers datasets

In [4]:
# 🔹 Step 2: Imports
import pandas as pd
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer,
    TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, pipeline, set_seed
)
import os

In [7]:
import os

dataset_path = "/kaggle/input/poetry"
output_file = "poems.txt"

# Combine all text files into one
with open(output_file, "w", encoding="utf-8") as outfile:
    for filename in os.listdir(dataset_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(dataset_path, filename)
            with open(file_path, "r", encoding="utf-8") as infile:
                content = infile.read().strip()
                outfile.write(content + "\n\n")

print(f"✅ Combined all artist lyrics into: {output_file}")


✅ Combined all artist lyrics into: poems.txt


In [6]:
import os

dataset_path = "/kaggle/input/poetry"
print("Files in dataset folder:")
print(os.listdir(dataset_path))


Files in dataset folder:
['Kanye_West.txt', 'johnny-cash.txt', 'kanye-west.txt', 'bruno-mars.txt', 'dickinson.txt', 'amy-winehouse.txt', 'blink-182.txt', 'paul-simon.txt', 'patti-smith.txt', 'bieber.txt', 'disney.txt', 'jimi-hendrix.txt', 'lin-manuel-miranda.txt', 'adele.txt', 'dj-khaled.txt', 'beatles.txt', 'r-kelly.txt', 'lady-gaga.txt', 'radiohead.txt', 'britney-spears.txt', 'alicia-keys.txt', 'rihanna.txt', 'joni-mitchell.txt', 'dolly-parton.txt', 'drake.txt', 'Lil_Wayne.txt', 'notorious_big.txt', 'eminem.txt', 'janisjoplin.txt', 'prince.txt', 'bruce-springsteen.txt', 'bob-dylan.txt', 'notorious-big.txt', 'lil-wayne.txt', 'dr-seuss.txt', 'nicki-minaj.txt', 'bob-marley.txt', 'al-green.txt', 'nickelback.txt', 'michael-jackson.txt', 'lorde.txt', 'kanye.txt', 'leonard-cohen.txt', 'ludacris.txt', 'bjork.txt', 'nursery_rhymes.txt', 'nirvana.txt', 'cake.txt', 'missy-elliott.txt']


In [8]:
# 🔹 Step 5: Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Required by Trainer
model = GPT2LMHeadModel.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
# 🔹 Step 6: Dataset creation
def load_dataset(file_path, tokenizer, block_size=64):
    from transformers import TextDataset
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

def get_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

train_dataset = load_dataset("poems.txt", tokenizer, block_size=64)
data_collator = get_data_collator(tokenizer)



In [19]:

# 🔹 Step 7: Training setup
training_args = TrainingArguments(
    output_dir="./gpt2-poetry",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=5000,
    save_total_limit=2,
    logging_steps=2500,
    report_to="none"  # 👈 disables wandb and other loggers
)

os.environ["WANDB_DISABLED"] = "true"


In [20]:
# 🔹 Step 8: Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)
print("\n🔁 Training GPT-2 on poetry dataset...")
trainer.train()


🔁 Training GPT-2 on poetry dataset...


Step,Training Loss
2500,2.9848
5000,2.9775
7500,3.1841
10000,3.403
12500,3.3572
15000,3.3086
17500,2.9824
20000,2.9775
22500,2.9852
25000,2.9461


Step,Training Loss
2500,2.9848
5000,2.9775
7500,3.1841
10000,3.403
12500,3.3572
15000,3.3086
17500,2.9824
20000,2.9775
22500,2.9852
25000,2.9461


TrainOutput(global_step=44214, training_loss=2.9679864572908183, metrics={'train_runtime': 4068.7843, 'train_samples_per_second': 21.733, 'train_steps_per_second': 10.867, 'total_flos': 2888093491200000.0, 'train_loss': 2.9679864572908183, 'epoch': 3.0})