In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments

In [3]:
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 1024)

In [4]:
import re

with open("couplets_lyrics.txt", "r", encoding="utf-8") as f:
    text = f.read()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s.,!?;:\'"]+', '', text)
    return text
text = clean_text(text)
print(text)

i never woulda thought she was livin' like that
her words seemed so sincere
when i held her near, she would tell me how she feels
it felt so real to me, this girl, she had to be
an angel sent from heaven just for me
i didn't know she was already spoken for
'cause i'm not that kind of man
swear that i would've never looked her way
now i feel so much shame
and all things have to change
you should know that i'm holdin' her to blame
do you remember when we fell in love?
we were young and innocent then
do you remember how it all began?
it just seemed like heaven, so why did it end?
do you remember how we used to talk?
you know, we'd stay on the phone at night 'til dawn
do you remember all the things we said?
like, "i love you so," "i'll never let you go"
told me that you're doing wrong
word out shocking all along
cryin' wolf ain't like a man
throwing rocks to hide your hands
you ain't done enough for me
you ain't done nothin' for me
you are disgusting me
yeah, yeah
you aiming just for me
yo

In [5]:
inputs = tokenizer(
    text,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
    padding="max_length"
)

In [6]:
class LyricsDataset(Dataset):
    def __init__(self, text, tokenizer, block_size=512):
        tokens = tokenizer(text, return_tensors="pt")["input_ids"][0]
        chunks = [tokens[i:i+block_size] for i in range(0, len(tokens)-block_size, block_size)]
        self.samples = chunks

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return {"input_ids": self.samples[idx], "labels": self.samples[idx]}

In [7]:
dataset = LyricsDataset(text, tokenizer)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    max_grad_norm=1.0,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model()

Token indices sequence length is longer than the specified maximum sequence length for this model (19266 > 1024). Running this sequence through the model will result in indexing errors
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned/checkpoint-30")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned/checkpoint-30")
tokenizer.pad_token = tokenizer.eos_token
model.to("cpu")

prompt = "Let's make it "
max_iterations = 5
tokens_per_step = 30

for i in range(max_iterations):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=tokens_per_step,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    new_tokens = generated_ids[0][input_ids.shape[-1]:]
    new_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(f"Step {i+1} ➜ {new_text.strip()}\n")

    prompt += new_text

Step 1 ➜ ?"
It was a simple question, so what's this talk about?
She didn't know if she could come back.
Is she going

Step 2 ➜ to die?
Is she gonna be lost?
What are these strange feelings inside my heart?
If I could see her face
Maybe,

Step 3 ➜ even, she would smile?
I'd cry so much.
If she's really gone, it must be her wish
There was always a

Step 4 ➜ smile in her eyes
But how could it happen?
Then again, why did she leave me?
How could we ever stay together
and

Step 5 ➜ go back home?
Then why?
I still see that smile in her eyes
and can't stand by it no more
what's that

