In [1]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="../data/lyrics_dataset.json", split="train")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label'],
    num_rows: 394
})


In [2]:
labels = sorted(list(set(example["label"] for example in dataset)))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
id2label

{0: 'Eminem', 1: 'Future', 2: 'Hozier', 3: 'The Weeknd'}

In [6]:
from transformers import AutoTokenizer
from transformers import RobertaTokenizer

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

def tokenize_fn(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

dataset = dataset.map(lambda e: {"label": label2id[e["label"]]}, remove_columns=["label"])
dataset = dataset.map(tokenize_fn)

Map: 100%|██████████| 394/394 [00:05<00:00, 78.75 examples/s] 


In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=len(label2id))

args = TrainingArguments(
    output_dir="./results/roberta-base",
    # evaluation_strategy="no",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.4122
20,1.401
30,1.4093
40,1.307
50,1.1954
60,1.1069
70,0.9825
80,0.8282
90,0.9846
100,0.8957


TrainOutput(global_step=297, training_loss=0.7472092804282603, metrics={'train_runtime': 106.0862, 'train_samples_per_second': 11.142, 'train_steps_per_second': 2.8, 'total_flos': 311002852073472.0, 'train_loss': 0.7472092804282603, 'epoch': 3.0})

In [11]:
prompt_weeknd = "Write a new song in the style of The Weeknd about heartbreak at night while he was high on ecstasy and oxycontin. He mentions about prioritising his career over his love which led to the heartbreak. YOur output should only consist of the lyrics and nothing else. NO [verse1] or [chorus] or even pre chorus or instruments or anything like that. Only and only the lyrics, no instructions whatsoever either."

In [21]:
seed_lyrics_future = """
I'm drifting under neon lights,  
With echoes of your touch last night,  
My heart's a beat behind the sound,  
But you're not here, I'm breaking down."""

prompt_future = f"""Continue this song in the same style and emotion:\n\n{seed_lyrics_future}\nYOur output should only consist of the lyrics and nothing else. NO [verse1] or [chorus] or even pre chorus or instruments or anything like that. Only and only the lyrics, no instructions whatsoever either."""

In [20]:
seed_lyrics_hoz = """
I found your voice beneath cathedral rain,  
Like hymns carved deep in sacred stone,  
Your touch still lingers in the flame,  
A ghost that sings when I’m alone."""

prompt_hoz = f"""You're a poetic lyricist like Hozier, known for soulful and haunting metaphors, blending love and spirituality. Continue this song in the same tone and rhythm:\n\n{seed_lyrics_hoz}\nYOur output should only consist of the lyrics and nothing else. NO [verse1] or [chorus] or even pre chorus or instruments or anything like that. Only and only the lyrics, no instructions whatsoever either."""


In [19]:
seed_lyrics_eminem = """
Back when I was thirteen, had rage in my veins,  
Mom workin' doubles just to handle the pain,  
No heat in the house, just beats in my brain,  
Used rap as my shelter, escaped in the rain."""

prompt_eminem = f"""You're a rapper like Eminem, telling raw, emotional stories with tight rhyme schemes and fast flow. Continue these lyrics with intensity and rhythm:\n\n{seed_lyrics_eminem}\n"""

In [22]:
import subprocess

def generate_lyrics(prompt):
    result = subprocess.run(
        ["ollama", "run", "gemma3:4b"],
        input=prompt.encode(),
        capture_output=True
    )
    return result.stdout.decode()

weeknd_gemma = generate_lyrics(prompt_eminem)
print(weeknd_gemma)


(Beat drops - heavy 808s, frantic hi-hats, a distorted piano chord)

Yeah... 

Back when I was thirteen, had rage in my veins, 
Mom workin' doubles just to handle the pain, 
No heat in the house, just beats in my brain, 
Used rap as my shelter, escaped in the rain. 

Concrete jungle echoes, whispers of shame, 
Another eviction notice, another losing game. 
Dad a ghost, a shadow, a forgotten name, 
Just empty promises etched in the sodium flame. 

Words like weapons, spitting fire and frost, 
Turnin' frustration into a lyrical holocaust. 
Each rhyme a shard of anger, a desperate cost, 
Tryna build a fortress, before my future’s lost. 

Used to stare at the streetlight, a sodium glow, 
Feelin' smaller than the darkness, nowhere left to go. 
Then the beat hit different, a pulse, a furious flow, 
Turnin' self-doubt to venom, watchin' the demons grow. 

(Tempo increases - hi-hats become more rapid, snare hits more pronounced)

See, the silence was a killer, a suffocating shroud, 
Drowning i

In [23]:
from transformers import RobertaTokenizer, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = AutoModelForSequenceClassification.from_pretrained("results/checkpoint-297")  # or the saved model dir
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("results/roberta-base/checkpoint-297")  # or the saved model dir
model.eval()

# Your id2label dictionary from before
id2label = {0: 'Eminem', 1: 'Future', 2: 'Hozier', 3: 'The Weeknd'}  # example

In [24]:
def predict_artist(lyrics_text):
    inputs = tokenizer(lyrics_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class_id = torch.argmax(outputs.logits).item()
        return id2label[predicted_class_id]

In [25]:
predicted_artist = predict_artist(weeknd_gemma)
print(f"🧠 Predicted Artist Style: {predicted_artist}")

🧠 Predicted Artist Style: Eminem
