<a href="https://colab.research.google.com/github/Siala-94/musicGenerator/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [2]:
from datasets import load_dataset
data = load_dataset("csv", data_files="lyrics-sampled.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Transform the data
data['train'] = data['train'].map(lambda x: {'input_text': f"{x['SName']} {x['Artist']} {x['Genres']}", 'target_text': x['Lyric']})
data

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text'],
        num_rows: 5000
    })
})

In [4]:
from transformers import AutoTokenizer

checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return {
        "input_ids": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=150)["input_ids"],
        "attention_mask": tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=150)["attention_mask"],
        "labels": tokenizer(examples['target_text'], truncation=True, padding='max_length', max_length=150)["input_ids"]
    }

tokenized_datasets = data.map(tokenize_function, batched=True)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 191387
    })
})

In [5]:
from datasets import DatasetDict
# Sample data
sample_data = tokenized_datasets["train"]
train_size = int(0.8 * len(sample_data))
train_dataset = sample_data.select([i for i in range(train_size)])
eval_dataset = sample_data.select([i for i in range(train_size, len(sample_data))])
datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})



In [6]:
import torch

def custom_data_collator(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in batch]
    labels = [torch.tensor(item["labels"]) for item in batch]

    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }


In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=250,
    save_steps=500,
    output_dir="./results",
    overwrite_output_dir=True,
    warmup_steps=500,
    save_total_limit=3,
)

# Instantiate the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=custom_data_collator
)

# Start training
trainer.train()

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,3.6164,3.176047
1000,3.2635,3.087517
1500,3.0772,3.0668


TrainOutput(global_step=1500, training_loss=3.578873697916667, metrics={'train_runtime': 669.927, 'train_samples_per_second': 17.912, 'train_steps_per_second': 2.239, 'total_flos': 1071802368000000.0, 'train_loss': 3.578873697916667, 'epoch': 3.0})

In [11]:
from transformers import pipeline

# Load the trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-1500")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = model.device
# Prepare the input text
input_text = "in the club 50 cent Hip Hop"
encoded_input = tokenizer.encode(input_text, return_tensors="pt")

# Generate lyrics
generated_output = model.generate(encoded_input, max_length=700, num_beams=5, early_stopping=True)

# Decode the generated output
generated_lyrics = tokenizer.decode(generated_output[0], skip_special_tokens=True)

print(generated_lyrics)

[Verse 1]

[Chorus]
[Intro]
I'm in the club, I'm on the floor
I've got the keys to my car
I got the key to my house
I don't know what I'm doing
But I know I'm not alone
I know that I'm alone
And I know that it's not my fault
But it's the way that I feel
And it's just the way it is
It's just me and you

(Chorus)
[chorus:]
[CHORUS]
(chorus) (ChORUS) (chORUS):
(repeat) (repeat)
(Repeat) (
