In [8]:
# Install required libraries
!pip install transformers datasets openpyxl

import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Step 1: Load Excel file
file_path = "TheWeekndLyrics.xlsx"
df = pd.read_excel(file_path)
print(" Excel file loaded.")

# Step 2: Show DataFrame and columns
print("\n Column names:", df.columns.tolist())
print("\n Preview:\n", df.head())

# Step 3: Ensure we have a lyrics column
possible_lyrics_cols = [col for col in df.columns if "lyric" in col.lower()]
if not possible_lyrics_cols:
    # fallback: longest average string length
    text_cols = df.select_dtypes(include=["object"])
    avg_lengths = text_cols.apply(lambda col: col.dropna().astype(str).str.len().mean())
    lyrics_col = avg_lengths.idxmax()
    print(f" No column explicitly named 'lyrics'. Auto-selected based on content: {lyrics_col}")
else:
    lyrics_col = possible_lyrics_cols[0]
    print(f" Detected lyrics column: {lyrics_col}")

# Step 4: Filter out null or empty rows
df = df.dropna(subset=[lyrics_col])
df = df[df[lyrics_col].astype(str).str.strip() != ""]
print(f" Remaining lyrics entries: {len(df)}")

# Step 5: Convert to HuggingFace Dataset
lyrics_list = df[lyrics_col].astype(str).tolist()
dataset = Dataset.from_dict({"text": lyrics_list})

# Step 6: Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Step 7: Tokenize data
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 8: Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-weeknd",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=200,
    save_total_limit=1,
    logging_steps=50,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available()
)

# Step 9: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Step 10: Train
trainer.train()

# Step 11: Save model
trainer.save_model("./gpt2-weeknd")
tokenizer.save_pretrained("./gpt2-weeknd")

# Step 12: Generate sample lyrics
generator = pipeline('text-generation', model="./gpt2-weeknd", tokenizer=tokenizer)

prompts = [
    "I saw you dancing in a crowded room",
    "I've been tryna call",
    "She told me not to worry",
    "I ran out of tears when I was eighteen",
    "You don't even have to do too much"
]

for prompt in prompts:
    print(f"\n🎤 Prompt: {prompt}")
    output = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
    print("🎶", output)








 Excel file loaded.

 Column names: ['LYRICS']

 Preview:
                                               LYRICS
0  I've been tryna call\nI've been on my own for ...
1  I'm tryna put you in the worst mood, ah\nP1 cl...
2  Your man on the road, he doin' promo\nYou said...
3  I saw you dancing in a crowded room\nYou look ...
4  I ran out of tears when I was eighteen\nSo nob...
 Detected lyrics column: LYRICS
 Remaining lyrics entries: 8


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🎤 Prompt: I saw you dancing in a crowded room
🎶 I saw you dancing in a crowded room, talking to me with my arms around you. What's wrong?

I love it here in your heart, darling

I know you're not the only one I want

This ain't a good night for you

I have no problem giving you one

Just leave you alone, alright?

Look at you looking my way, baby; I've only known you for so long

Please let us go again,

🎤 Prompt: I've been tryna call
🎶 I've been tryna call it with you for so long and I don't get it, so... so lucky ya'
You're mine too
I love you with my own heart, babe
And don't worry, just enjoy getting me a little bit close to you
That was a lie, wasn't it?
You're an ollie though sweetie I need you so much, I have to do my own good little thing just to you
But, it turns out

🎤 Prompt: She told me not to worry
🎶 She told me not to worry about how I was feeling.
In one of her poems, Annie told me she used to think I was a girl
A moment later, I watched my father, still looking confuse