<a href="https://colab.research.google.com/github/Siala-94/musicGenerator/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install rouge_score
!pip install evaluate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl



In [31]:
from datasets import load_dataset
data = load_dataset("csv", data_files="lyrics-sampled.csv")

In [33]:
# Transform the data
data['train'] = data['train'].map(lambda x: {'input_text': f"{x['SName']} {x['Artist']} {x['Genres']}", 'target_text': x['Lyric']})

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text'],
        num_rows: 5000
    })
})

In [34]:
from transformers import AutoTokenizer

checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return {
        "input_ids": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=150)["input_ids"],
        "attention_mask": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=150)["attention_mask"],
        "labels": tokenizer(examples['target_text'], truncation=True, padding='max_length', max_length=150)["input_ids"]
    }

tokenized_datasets = data.map(tokenize_function, batched=True)


In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 191387
    })
})

In [35]:
from datasets import DatasetDict
# Sample data
sample_data = tokenized_datasets["train"]
train_size = int(0.8 * len(sample_data))
train_dataset = sample_data.select([i for i in range(train_size)])
eval_dataset = sample_data.select([i for i in range(train_size, len(sample_data))])
datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})



In [39]:
import torch
from datasets import load_metric

bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")

def custom_data_collator(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in batch]
    labels = [torch.tensor(item["labels"]) for item in batch]

    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }

def compute_metrics(p):
    decoded_preds = tokenizer.batch_decode(p.predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(p.label_ids, skip_special_tokens=True)

    # Compute BLEU and ROUGE scores
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": bleu["score"], "rouge-l": rouge["rouge-l"]["fmeasure"]}



In [42]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    num_train_epochs=8,
    logging_steps=250,
    save_steps=500,
    output_dir="./results",
    overwrite_output_dir=True,
    warmup_steps=500,
    save_total_limit=3,
)

# Instantiate the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=custom_data_collator
)

# Start training
trainer.train()

Step,Training Loss,Validation Loss
500,3.6164,3.176047
1000,3.2723,3.104569
1500,3.079,3.084114
2000,2.9482,3.086734
2500,2.8402,3.061851
3000,2.7729,3.070976
3500,2.7037,3.078917
4000,2.6512,3.089046


TrainOutput(global_step=4000, training_loss=3.084584732055664, metrics={'train_runtime': 1895.1123, 'train_samples_per_second': 16.886, 'train_steps_per_second': 2.111, 'total_flos': 2858139648000000.0, 'train_loss': 3.084584732055664, 'epoch': 8.0})

In [44]:
PEFT_MODEL = "siala94/bert-lyrics-generator"
PEFT_TOKEN = "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"

model.push_to_hub(
    PEFT_MODEL, use_auth_token =PEFT_TOKEN
)



pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/siala94/bert-lyrics-generator/commit/0b098912b72f634292d1365d0c0cae07340747a8', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='0b098912b72f634292d1365d0c0cae07340747a8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# Load the trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-1500")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = model.device
# Prepare the input text
input_text = "in the club 50 cent Hip Hop"
encoded_input = tokenizer.encode(input_text, return_tensors="pt")

# Generate lyrics
generated_output = model.generate(encoded_input, max_length=700, num_beams=5, early_stopping=True)

# Decode the generated output
generated_lyrics = tokenizer.decode(generated_output[0], skip_special_tokens=True)

print(generated_lyrics)