https://www.kaggle.com/code/karnikakapoor/lyrics-generator-rnn/notebook?select=Songs.csv

# Training Process

In [7]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("Songs.csv")
df = df.dropna(subset=["Lyrics"])
df = df[df["Lyrics"].str.len() > 100]  # keep meaningful lyrics

lyrics_ds = Dataset.from_pandas(df[["Lyrics"]].rename(columns={"Lyrics": "text"}))

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(example):
    tokens = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    tokens["labels"] = tokens["input_ids"].clone()
    return tokens

tokenized_ds = lyrics_ds.map(tokenize_fn, batched=True)
tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Define training args
training_args = TrainingArguments(
    output_dir="./banger_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=100,
    logging_steps=20,
    save_total_limit=2,
    prediction_loss_only=True
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer
)

trainer.train()

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Step,Training Loss
20,3.1517
40,2.9807
60,2.8294
80,2.7915
100,2.8767
120,2.7259
140,2.8581
160,2.6384
180,2.8326
200,2.8328


TrainOutput(global_step=1116, training_loss=2.531502740784785, metrics={'train_runtime': 9962.3499, 'train_samples_per_second': 0.224, 'train_steps_per_second': 0.112, 'total_flos': 291601907712000.0, 'train_loss': 2.531502740784785, 'epoch': 3.0})

In [4]:
trainer.save_model("./banger_gpt2_model")
tokenizer.save_pretrained("./banger_gpt2_model")

('./banger_gpt2_model\\tokenizer_config.json',
 './banger_gpt2_model\\special_tokens_map.json',
 './banger_gpt2_model\\vocab.json',
 './banger_gpt2_model\\merges.txt',
 './banger_gpt2_model\\added_tokens.json')

# RUN THIS:

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import re

def clean_lyrics(text):
    return re.sub(r"[^a-zA-Z0-9\s]", "", text).lower()

df = pd.read_csv("Songs.csv")
df = df.dropna(subset=["Lyrics"])
df = df[df["Lyrics"].str.len() > 100]  # keep meaningful lyrics
  
lyrics_ds = Dataset.from_pandas(df[["Lyrics"]].rename(columns={"Lyrics": "text"}))

dataset_texts = df["Lyrics"].astype(str).map(clean_lyrics).tolist()

model = GPT2LMHeadModel.from_pretrained("./banger_gpt2_model")
tokenizer = GPT2Tokenizer.from_pretrained("./banger_gpt2_model")

In [4]:
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import spacy
import re
from collections import Counter

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def audited_generate(
    prompt,
    model,
    tokenizer,
    dataset_texts,
    artists,
    titles,
    similarity_threshold=0.7,
    max_length=150,
    ngram_size=5,
    max_semantic_sources=4
):
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")

    # --- Helpers ---
    def clean(text):
        return re.sub(r"[^a-zA-Z0-9\s]", "", text).lower().strip()

    def get_ngrams(text, n):
        words = text.split()
        return Counter([" ".join(words[i:i + n]) for i in range(len(words) - n + 1)])

    def normalize_lines(text):
        return set(clean(line) for line in text.splitlines() if line.strip())

    def get_opening_lines(text, num_lines=2):
        lines = [line.strip().lower() for line in text.splitlines() if line.strip()]
        return lines[:num_lines]

    def get_most_repeated_lines(text):
        lines = [line.strip().lower() for line in text.splitlines() if line.strip()]
        return [line for line, count in Counter(lines).items() if count > 1]

    def get_rhyme_endings(lines, length=3):
        return [line[-length:] for line in lines if len(line) >= length]

    def extract_named_entities(text):
        doc = nlp(text)
        return set(ent.text.lower() for ent in doc.ents)
    
    def lexical_overlap_score(text1, text2):
        lines1 = normalize_lines(text1)
        lines2 = normalize_lines(text2)
        return len(lines1 & lines2)
    
    # --- Prepare Dataset ---
    cleaned_dataset = [clean(txt) for txt in dataset_texts]
    dataset_embeddings = embed_model.encode(cleaned_dataset, convert_to_tensor=True)

    # --- Generate Lyrics ---
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    gen_output = generator(prompt, max_length=max_length, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)[0]['generated_text']
    cleaned_gen = clean(gen_output)
    gen_embedding = embed_model.encode(cleaned_gen, convert_to_tensor=True)

    # === Audit ===
    risk_score = 0.0
    reasons = []

    # Semantic similarity
    similarities = util.cos_sim(gen_embedding, dataset_embeddings)[0]
    strong_similarities = [float(sim) for sim in similarities if sim >= similarity_threshold]
    top_score = float(similarities.max())
    most_similar_idx = int(similarities.argmax())
    song_sem = titles[most_similar_idx]
    artist_sem = artists[most_similar_idx]
    
    print(f"Top semantic similarity: {top_score:.3f}\n")
    print(f"Closest match (semantic): \"{song_sem}\" written by {artist_sem}")
    if 1 <= len(strong_similarities) <= max_semantic_sources:
        risk_score += 0.8
        reasons.append("High semantic similarity with known song")
    
    lexical_scores = [lexical_overlap_score(gen_output, ds) for ds in dataset_texts]
    most_lexical_idx = int(max(range(len(lexical_scores)), key=lambda i: lexical_scores[i]))
    lexical_overlap = lexical_scores[most_lexical_idx]
    song_lex = titles[most_lexical_idx]
    artist_lex = artists[most_lexical_idx]
    
    print(f"Closest match (line overlap): \"{song_lex}\" written by {artist_lex}")
    print(f"Line overlap count: {lexical_overlap}\n")
    if lexical_overlap > 0:
        reasons.append(f'Similar to "{song_lex}" by {artist_lex}')
    
    # N-gram check
    gen_ngrams = get_ngrams(cleaned_gen, ngram_size)
    for song in cleaned_dataset:
        song_ngrams = get_ngrams(song, ngram_size)
        if sum((gen_ngrams & song_ngrams).values()) > 0:
            risk_score += 1.0
            reasons.append(f"{ngram_size}-word exact phrase match")
            break

    # Line-level match
    gen_lines = normalize_lines(gen_output)
    matched_lines = []
    for song in dataset_texts:
        dataset_lines = normalize_lines(song)
        matches = dataset_lines & gen_lines
        if matches:
            matched_lines.extend(matches)
    
    if matched_lines:
        risk_score += 1.0
        reasons.append(f"{len(matched_lines)} line-level matches found")
                
    # Unique line detection
    for line in gen_lines:
        occurrence_count = sum(1 for song in cleaned_dataset if line in song)
        if occurrence_count == 1:
            risk_score += 1.0
            reasons.append(f"Unique line match: \"{line}\" appears only once in dataset")
            break

    # Structural/style checks
    gen_opening = get_opening_lines(gen_output)
    gen_repeats = set(get_most_repeated_lines(gen_output))
    gen_rhymes = get_rhyme_endings(list(gen_lines))

    song_ref = dataset_texts[most_similar_idx]
    dataset_opening = get_opening_lines(song_ref)
    dataset_repeats = set(get_most_repeated_lines(song_ref))
    dataset_rhymes = get_rhyme_endings(song_ref.splitlines())

    if any(line in dataset_opening for line in gen_opening):
        risk_score += 0.5
        reasons.append("Opening line matches known song")

    if gen_repeats & dataset_repeats:
        risk_score += 0.4
        reasons.append("Repeated line pattern (chorus-style)")

    if len(gen_rhymes) >= 3 and len(set(gen_rhymes) & set(dataset_rhymes)) >= 3:
        risk_score += 0.4
        reasons.append("Similar rhyme endings")

    # Named entities
    gen_ents = extract_named_entities(gen_output)
    for song in dataset_texts:
        dataset_ents = extract_named_entities(song)
        if gen_ents & dataset_ents:
            risk_score += 0.3
            reasons.append("Shared named entities (e.g., people, brands)")
            break

    # Toxic content filter
    if any(word in cleaned_gen for word in ["kill", "rape", "bitch", "n*", "f*", "drugs", "cocaine"]):
        print("Rejected: toxic language detected.\n")
        print("Reasons:", reasons)
        print("Generated Song:\n", gen_output)
    else:
        # Final decision
        print("Generated Song:\n", gen_output)
        print("\nReasons:", reasons)
        print("\nRisk Score:", round(min(1.0, risk_score), 2))
        
        if risk_score >= 0.8:
            print("\nRejected due to high similarity.\n")
            print("Legal Warning: You may be exposed to legal action from the original song's writer!\n")
        else:
            print("Passed audit")

In [21]:
audited_generate("Vintage tee, brand new phone", model, tokenizer, dataset_texts, df["Artist"].tolist(), df["Title"].tolist())



Top semantic similarity: 0.527

Closest match (semantic): "Born to Die" by Lana Del Rey
Closest match (line overlap): "cardigan" by Taylor Swift
Line overlap count: 1
Generated Song:
 Vintage tee, brand new phone
I was in the band
I walked to the door
To be alone
I tried to get out of sight
I cried 'til I couldn't sleep
But I got out
In that one song, we all sing
A song I'm really proud of
When you sing it on the radio, then everybody just loves
And I just think they know
All these folks in the room don't follow
How we got here
I think I'm gonna try
Oh, I will try
Oh, no, just try
Oh, I will try

Wondering about this song, but
What am I gonna do?
I can't tell to you that I'm
Reasons: ['Similar to "cardigan" by Taylor Swift', '5-word exact phrase match', '1 line-level matches found', 'Unique line match: "vintage tee brand new phone" appears only once in dataset', 'Shared named entities (e.g., people, brands)']
Risk Score: 1.0
Rejected due to high similarity.


