<a href="https://colab.research.google.com/github/Siala-94/musicGenerator/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install rouge_score
!pip install evaluate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl



In [45]:
from datasets import load_dataset
# Read the sampled lyrics CSV (path is relative to the notebook's working
# directory). Later cells access the rows through data['train'].
LYRICS_CSV = "lyrics-sampled.csv"
data = load_dataset("csv", data_files=LYRICS_CSV)

In [46]:
def build_text_pair(row):
    """Derive the seq2seq prompt and target for one CSV row.

    The prompt is the song name, artist and genres joined by single spaces;
    the target is the lyric text, unchanged.
    """
    prompt = f"{row['SName']} {row['Artist']} {row['Genres']}"
    return {'input_text': prompt, 'target_text': row['Lyric']}


# Add the `input_text` / `target_text` columns to the train split.
data['train'] = data['train'].map(build_text_pair)

In [53]:
from transformers import AutoTokenizer

# Hugging Face Hub id of the pretrained model. Later cells reuse `checkpoint`
# to load the seq2seq model and to reload this tokenizer for inference.
checkpoint = "facebook/bart-base"
# Tokenizer that belongs to the checkpoint above; tokenize_function and
# compute_metrics both read this module-level name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples, max_length=1024):
    """Tokenize a batch of prompt/lyric pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text' columns
            (each a list of strings), as passed by `Dataset.map(batched=True)`.
        max_length: Length every sequence is truncated/padded to. Defaults to
            1024 because facebook/bart-base only has 1024 position embeddings;
            the previous value of 1500 produced sequences the model cannot
            embed, which is the likely cause of the RuntimeError recorded for
            the training cell.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        `max_length`-long integer lists. Padding positions in 'labels' are
        set to -100 so the loss ignores them.
    """
    # Tokenize each column exactly once (the prompt used to be tokenized twice).
    model_inputs = tokenizer(
        examples['input_text'], truncation=True, padding='max_length', max_length=max_length
    )
    targets = tokenizer(
        examples['target_text'], truncation=True, padding='max_length', max_length=max_length
    )

    # Use the target attention mask to find padding: mask == 0 -> ignore index.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels,
    }

# Run tokenize_function over `data`. With batched=True the function receives
# a batch of rows at a time (each column as a list), which is why it can hand
# whole columns to the tokenizer in a single call.
tokenized_datasets = data.map(tokenize_function, batched=True)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# Display the DatasetDict summary (splits, column names, row counts) to check
# that input_ids / attention_mask / labels were added.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 191387
    })
})

In [54]:
from datasets import DatasetDict
# Positional 80/20 split: the first 80% of rows train the model and the
# remaining rows are held out for validation (rows are not shuffled).
sample_data = tokenized_datasets["train"]
total_rows = len(sample_data)
train_size = int(0.8 * total_rows)

train_dataset = sample_data.select(range(train_size))
eval_dataset = sample_data.select(range(train_size, total_rows))

# NOTE: this variable shares its name with the `datasets` package.
datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})



In [55]:
import numpy as np
import torch
import evaluate

# `datasets.load_metric` is deprecated and no longer exists in datasets 3.x;
# the `evaluate` package (installed in the first cell) is its replacement.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


def custom_data_collator(batch):
    """Stack a list of tokenized examples into batch tensors.

    Args:
        batch: List of examples, each providing 'input_ids', 'attention_mask'
            and 'labels' as equal-length integer sequences (torch.stack
            requires every example to have the same length).

    Returns:
        dict with the same three keys, each a tensor with one row per example.
    """
    return {
        key: torch.stack([torch.tensor(item[key]) for item in batch])
        for key in ("input_ids", "attention_mask", "labels")
    }


def compute_metrics(p):
    """Score generated sequences against reference lyrics with BLEU and ROUGE-L.

    Args:
        p: Prediction object exposing `predictions` and `label_ids` as arrays
            of token ids (the object a Trainer passes to `compute_metrics`).

    Returns:
        dict with 'bleu' (corpus BLEU) and 'rouge-l' (ROUGE-L score).
    """
    predictions, label_ids = p.predictions, p.label_ids
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the token ids.
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a real token id, so it
    # must be swapped for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute BLEU and ROUGE scores (BLEU takes a list of references per prediction).
    bleu = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )
    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # The metrics report their scores under "bleu" and "rougeL"; the old
    # lookups ("score", "rouge-l"/"fmeasure") raised KeyError.
    return {"bleu": bleu["bleu"], "rouge-l": rouge["rougeL"]}



In [56]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained weights named by `checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output locations
    output_dir="./results",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Schedule and batch sizes
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation cadence
    evaluation_strategy="steps",
    eval_steps=500,
    predict_with_generate=True,
    # Logging and checkpoint cadence
    logging_steps=250,
    save_steps=500,
    save_total_limit=3,
)

# Wire the model, data splits and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=custom_data_collator,
    tokenizer=tokenizer,
)

# NOTE(review): no `compute_metrics` callback is passed to the trainer here.
# Start training
trainer.train()

RuntimeError: ignored

In [51]:
import os
from getpass import getpass

# Target repository on the Hugging Face Hub.
PEFT_MODEL = "siala94/bert-lyrics-generator"
# Never commit access tokens: read the token from the HF_TOKEN environment
# variable, or prompt for it so it is not stored in the notebook.
# SECURITY: the token that used to be hard-coded here has been exposed and
# must be revoked in the Hugging Face account settings.
PEFT_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the in-memory model weights to the Hub repository.
model.push_to_hub(
    PEFT_MODEL, use_auth_token=PEFT_TOKEN
)



pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/siala94/bert-lyrics-generator/commit/6a38acc17e2a5c3ae69f169d702b8f2c3b7dd295', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='6a38acc17e2a5c3ae69f169d702b8f2c3b7dd295', pr_url=None, pr_revision=None, pr_num=None)

In [52]:
from transformers import pipeline

# Load the trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-3500")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = model.device
# Prepare the input text (same "<song name> <artist> <genres>" layout used for
# training) and place it on the model's device so generation also works when
# the model lives on an accelerator.
input_text = "in the club 50 cent Hip Hop"
encoded_input = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate lyrics. The requested length is capped at the number of positions
# the model can embed; asking for 1500 tokens could index past the position
# table during decoding.
max_length = min(1500, model.config.max_position_embeddings)
generated_output = model.generate(encoded_input, max_length=max_length, num_beams=5, early_stopping=True)

# Decode the generated output
generated_lyrics = tokenizer.decode(generated_output[0], skip_special_tokens=True)

# print() is used so the line breaks in the lyrics are rendered.
print(generated_lyrics)

[Intro: 50 Cent]

[Verse 1: 50 cent]
I'm in the club, I'm the only one
I got a nigga in my club, and he's the one
That's the only nigga that I know
He's the first nigga I've seen in a long, long time
I've seen him in a lot of clubs, but I'm not the one that he's known
I don't know where he came from, but he's from the East Side of New York
I heard he was from the West Side, and I'm from the South Side
So I'm on my way to the Westside, and that's where I'm headed

