<a href="https://colab.research.google.com/github/Siala-94/musicGenerator/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl



In [2]:
from datasets import load_dataset
data = load_dataset("csv", data_files="lyrics-sampled.csv")

In [3]:
# Transform the data
data['train'] = data['train'].map(lambda x: {'input_text': f"{x['SName']} {x['Artist']} {x['Genres']}", 'target_text': x['Lyric']})
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text'],
        num_rows: 5000
    })
})

In [28]:
from transformers import AutoTokenizer

checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return {
        "input_ids": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=500)["input_ids"],
        "attention_mask": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=500)["attention_mask"],
        "labels": tokenizer(examples['target_text'], truncation=True, padding='max_length', max_length=500)["input_ids"]
    }

tokenized_datasets = data.map(tokenize_function, batched=True)


In [42]:
data["train"][2]


{'Unnamed: 0': 180272,
 'SName': 'The Way You Want Me to Be',
 'Lyric': "I wish I could dance like Fontaine,\nFloating along on a breeze,\nBut when you come home\nAnd I get you alone\nI wish that I could put you at ease.\n\nI wish I could sing you a love song,\nSing about the silvery moon,\nBut the things that you say\nWhen I think it's OK\nAlways seem to sing out of tune.\n\nOh, I'm not the way\nThat you want me to be,\nNo, it's not like me,\nNo, it's not like me.\n\nYou say I should dress like a Lady,\nRead all the best magazines,\nBut the people in there\nAnd the clothes that they wear,\nTake it from the girl of your dreams.\n\nYou wish I could be a show-piece\nSomething to show all your friends\nIs it true what they say,\nThat I get in your way ?\nIs this about the place that it ends ?\n\nI'm not the way\nThat you want me to be,\nNo, it's not like me,\nNo, it's not like me.\n\nI wish I could dance like Fontaine,\nFloating along on the breeze\nAnd when you come home\nAnd I get you a

In [29]:
from datasets import DatasetDict
# Sample data
sample_data = tokenized_datasets["train"]
train_size = int(0.8 * len(sample_data))
train_dataset = sample_data.select([i for i in range(train_size)])
eval_dataset = sample_data.select([i for i in range(train_size, len(sample_data))])
datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})



In [31]:
import torch
from datasets import load_metric

def custom_data_collator(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in batch]
    labels = [torch.tensor(item["labels"]) for item in batch]

    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }


In [32]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=250,
    save_steps=500,
    output_dir="./results",
    overwrite_output_dir=True,
    warmup_steps=500,
    save_total_limit=3,
)

# Instantiate the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=custom_data_collator
)

# Start training
trainer.train()

Step,Training Loss,Validation Loss
500,2.0606,1.738168
1000,1.8466,1.678777
1500,1.7195,1.660916


TrainOutput(global_step=1500, training_loss=2.486680948893229, metrics={'train_runtime': 2034.2043, 'train_samples_per_second': 5.899, 'train_steps_per_second': 0.737, 'total_flos': 3572674560000000.0, 'train_loss': 2.486680948893229, 'epoch': 3.0})

In [9]:
CUDA_LAUNCH_BLOCKING=1

In [10]:
!pip install huggingface_hub



In [33]:
PEFT_MODEL = "siala94/bert-lyrics-generator"
HF_TOKEN = "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"
model.push_to_hub(
    PEFT_MODEL, use_auth_token=HF_TOKEN
)
tokenizer.push_to_hub(
    PEFT_MODEL, use_auth_token=HF_TOKEN
)



pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/siala94/bert-lyrics-generator/commit/22bce8852e7837ac5a7c161c3f4b304af23bd2a1', commit_message='Upload tokenizer', commit_description='', oid='22bce8852e7837ac5a7c161c3f4b304af23bd2a1', pr_url=None, pr_revision=None, pr_num=None)

In [49]:
from transformers import pipeline

# Load the trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(PEFT_MODEL)
tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL)

device = model.device
# Prepare the input text
input_text = "beutiful beaches parisa country"
encoded_input = tokenizer.encode(input_text, return_tensors="pt")

# Generate lyrics
generated_output = model.generate(encoded_input, max_length=700, num_beams=5, early_stopping=True)

# Decode the generated output
generated_lyrics = tokenizer.decode(generated_output[0], skip_special_tokens=True)

print(generated_lyrics)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


beutiful beaches, beautiful beaches, lovely beaches
beautiful beaches and beautiful beaches
Beautiful beaches on beautiful beaches.

I can't wait to see you again.
I want you to know that I love you.
And I want to know you love me.
But I don't know if I can make it through the day.
You know I can't live without you,
I need you, I need you.


In [53]:
input_text = "i see stars billie eilish pop"
encoded_input = tokenizer.encode(input_text, return_tensors="pt")
# Generate lyrics with adjusted temperature and without top_k
try:
    generated_output = model.generate(encoded_input,
                                      max_length=700,
                                      do_sample=True,
                                      temperature=1.0,
                                      top_k = 80)  # Set temperature to 1.0

    # Decode the generated output
    generated_lyrics = tokenizer.decode(generated_output[0], skip_special_tokens=True)

except RuntimeError as e:
    generated_lyrics = str(e)  # If there's an error, store the error message

print(generated_lyrics)



i see stars
I see stars, i see stars in the sky

I can see stars on the ground
I feel stars
in the air
i feel stars, they're all around me

They're all the same
i hear stars
they're all that I see

When I look in the mirror
i can see the stars

It's a long way from home
when I look into your eyes
you see stars that are all around us

The sky is blue
the stars are blue
The stars are bright
The skies are blue
