In [3]:
!pip install datasets # Install the datasets library
import os
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)
# Opt out of Weights & Biases logging for this run.
os.environ["WANDB_DISABLED"] = "true"

# Load the pretrained GPT-2 weights and the matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-text token for padding so the tokenizer can pad batches.
tokenizer.pad_token = tokenizer.eos_token

lyrics_file = "lyrics.txt"
# Only seed the demo corpus when no lyrics file is present, so an existing
# file is never overwritten.
if not os.path.exists(lyrics_file):
    sample_lyrics = [
        "All of this and more is for you\n",
        "With love, sincerity and deepest care\n",
        "My life with you I share\n",
        "Ever since I met you, baby\n",
        "Every day that you keep it away\n",
    ]
    # Each entry already carries its own newline, so plain concatenation
    # produces the final file contents.
    with open(lyrics_file, "w", encoding="utf-8") as f:
        f.write("".join(sample_lyrics))

# Load the lyrics file as the "train" split of a text dataset.
dataset = load_dataset("text", data_files=dict(train=lyrics_file))

def tokenize_text(example):
    """Tokenize the ``"text"`` field of a dataset example (or batch).

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry, as passed by ``dataset.map``.

    Returns:
        The tokenizer's encoding for ``example["text"]``.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# The "text" loader yields one example per line, so blank lines (common
# between verses) become empty examples. Those would tokenize to pure padding,
# which the collator masks out of the labels, so they add nothing to training.
# Drop them before tokenizing.
tokenized_data = (
    dataset
    .filter(lambda example: example["text"].strip() != "")
    .map(tokenize_text, batched=True)
)
# mlm=False selects causal language modelling rather than masked-token
# prediction.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hyperparameters for a short demonstration fine-tune.
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics-output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    logging_steps=5,
    save_total_limit=1,
    prediction_loss_only=True,
    # Supported way to turn off external experiment trackers. The
    # WANDB_DISABLED environment variable is deprecated and is scheduled for
    # removal in transformers v5.
    report_to="none",
)

# Bring together the model, its hyperparameters, the tokenized training split
# and the batch collator.
trainer = Trainer(
    data_collator=collator,
    train_dataset=tokenized_data["train"],
    args=training_args,
    model=model,
)

# Run the fine-tuning loop, announcing start and finish.
print("Training GPT-2 on sample lyrics...")
trainer.train()
print("Training complete.")

# Store the fine-tuned weights and the tokenizer files in the same directory
# so they can be reloaded together.
export_dir = "gpt2-lyrics-model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  33%|###2      | 178M/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training GPT-2 on sample lyrics...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,3.8578


Training complete.


('gpt2-lyrics-model/tokenizer_config.json',
 'gpt2-lyrics-model/special_tokens_map.json',
 'gpt2-lyrics-model/vocab.json',
 'gpt2-lyrics-model/merges.txt',
 'gpt2-lyrics-model/added_tokens.json')

In [8]:
def generate_lyrics(prompt, max_new_tokens=60):
    """Sample a continuation of ``prompt`` from the fine-tuned model.

    Args:
        prompt: Seed text for the lyrics. An empty prompt falls back to the
            end-of-text token so generation still has a token to start from.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The prompt followed by the sampled continuation, decoded to a string
        with special tokens removed.
    """
    # An empty prompt would tokenize to zero tokens, leaving nothing to
    # condition on; start from the end-of-text token instead.
    if not prompt:
        prompt = tokenizer.eos_token

    # Calling the tokenizer (rather than `.encode`) also returns the attention
    # mask, which cannot be inferred reliably when pad and eos tokens are the
    # same. The tensors are moved to whichever device the model lives on,
    # since training may have placed it on an accelerator.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Training leaves the model in train mode; switch to eval so dropout does
    # not add noise on top of the sampling itself.
    model.eval()

    output = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_k=40,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Read a seed line from the user and show the model's continuation.
user_prompt = input("Enter your song prompt: ")
lyrics = generate_lyrics(user_prompt)
print("\nGenerated Lyrics:", lyrics, sep="\n")

Enter your song prompt: Every day that you keep it away

Generated Lyrics:
Every day that you keep it away from me, you keep it away from me from me. If you keep it away from me, you will not be my friend. If you keep it away from me, you will not be my friend."

But he was not to speak of the other, because he had not a friend
