<a href="https://colab.research.google.com/github/Siala-94/musicGenerator/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [48]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
data = load_dataset("csv", data_files="lyrics-sampled.csv")

# Transform the data
data['train'] = data['train'].map(lambda x: {'input_text': f"{x['SName']} {x['Artist']} {x['Genres']}", 'target_text': x['Lyric']})


In [49]:
# Tokenize the entire dataset
checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=150)
    tokenized_outputs = tokenizer(examples['target_text'], truncation=True, padding='max_length', max_length=150)

    tokenized_inputs["labels"] = tokenized_outputs["input_ids"]
    return tokenized_inputs

tokenized_datasets = data.map(tokenize_function, batched=True, load_from_cache_file=False)



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [36]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'SName', 'Lyric', 'Artist', 'Genres', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [50]:
from datasets import DatasetDict

split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
datasets = DatasetDict({"train": split_datasets["train"], "validation": split_datasets["test"]})


In [51]:
import torch

def custom_data_collator(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in batch]
    labels = [torch.tensor(item["labels"]) for item in batch]

    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }


In [52]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [44]:


# Convert lists to PyTorch tensors
input_ids_sample = torch.tensor(tokenized_datasets["train"]['input_ids'][:8])
attention_mask_sample = torch.tensor(tokenized_datasets["train"]['attention_mask'][:8])

# Tokenize the 'target_text'
labels_sample = tokenizer.batch_encode_plus(
    tokenized_datasets["train"]['target_text'][:8],
    padding='max_length',
    truncation=True,
    max_length=150,
    return_tensors="pt"
)["input_ids"]

device = model.device
input_ids_sample = input_ids_sample.to(device)
attention_mask_sample = attention_mask_sample.to(device)
labels_sample = labels_sample.to(device)

# Create a sample batch
sample_batch = {
    'input_ids': input_ids_sample,
    'attention_mask': attention_mask_sample,
    'labels': labels_sample
}


In [53]:

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=250,
    save_steps=500,
    output_dir="./results",
    overwrite_output_dir=True,
    warmup_steps=500,
    save_total_limit=3,
)

# Instantiate the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=custom_data_collator
)


In [54]:
# Start training
trainer.train()


Step,Training Loss,Validation Loss
500,3.6225,3.142934
1000,3.3021,3.020764
1500,3.0973,2.99743


TrainOutput(global_step=1500, training_loss=3.5913719889322917, metrics={'train_runtime': 688.6128, 'train_samples_per_second': 17.426, 'train_steps_per_second': 2.178, 'total_flos': 1071802368000000.0, 'train_loss': 3.5913719889322917, 'epoch': 3.0})

In [62]:
model.save_pretrained("/trained-model")
tokenizer.save_pretrained('/trained-model')


('/trained-model/tokenizer_config.json',
 '/trained-model/special_tokens_map.json',
 '/trained-model/vocab.json',
 '/trained-model/merges.txt',
 '/trained-model/added_tokens.json',
 '/trained-model/tokenizer.json')

In [60]:
from transformers import pipeline

# Load the fine-tuned model
model_path = "/results"  # Adjust this path if needed
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=tokenizer)

# Generate text using the fine-tuned model
prompt = "Shape of You 50 cent Hip Hop"
generated_text = text_generator(prompt, max_length=150, num_return_sequences=1, num_beams=5)

# Print the generated text
for i, text in enumerate(generated_text):
    print(f"Generated Text {i+1}:")
    print(text["generated_text"])


The model 'BartForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartForCausalLM',

Generated Text 1:
Shape of You 50 cent Hip Hopa be a part of you 50 cent
You're gonna be part of me 50 cent (50 cent)

(Chorus)
I've got a lot of things to say, I've got to say it's true, I'm gonna do it all again

I'll be the one to tell you the truth, I'll be your one and only one
I won't let you down, I won't make you down
I don't wanna let you go, I don't want you to go
I ain't gonna let you up, I ain't letting you down (I'm just letting you go)
(chorus


#############

##########
