In [None]:
import torch
import gc
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
data = pd.read_csv('../data/processed/lyrics_processed.csv')

train_data = []
for i in range(len(data)):
    data_elem = data.iloc[i]
    song = data_elem['song']
    artist = data_elem['artist']
    year = data_elem['year']
    genre = data_elem['genre']
    lyrics = data_elem['lyrics']
    
    train_data.append({"input_text": f"Give me the lyrics of a song that was made by {artist} and was releasd in {year} and is of the {genre} genre.", "label_text": lyrics})

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

In [8]:
input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_length=40, temperature=0.9, num_beams=5, num_return_sequences=3, no_repeat_ngram_size=2, early_stopping=True)
tokenizer.decode(output_ids[0])

'<pad> Hello, my dog is cute</s>'

In [4]:
tokenized_data = tokenizer([d["input_text"] for d in train_data], padding=True, truncation=True, return_tensors='pt')
input_ids = tokenized_data.input_ids
attention_mask = tokenized_data.attention_mask

label_ids = tokenizer([d["label_text"] for d in train_data], padding=True, truncation=True, return_tensors='pt').input_ids

In [5]:
batch_size = 8

dataset = TensorDataset(input_ids, attention_mask, label_ids)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [6]:
def train(model, dataloader, optimizer, device, epoch):
    torch.cuda.empty_cache()
    gc.collect()
    model.train()
    total_loss = 0.0
    for idx, batch in enumerate(dataloader):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(f"Epoch {epoch+1} - Batch {idx*batch_size}/{len(dataloader)} - Loss: {loss.item()}")
        
        model.save_pretrained('../models/lyrical-flan_t5_large')
        
        del input_ids
        del attention_mask
        del labels
        torch.cuda.empty_cache()
        gc.collect()
    del dataloader
    torch.cuda.empty_cache()
    gc.collect()
        
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Complete! - Average Loss: {avg_loss}")

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

learning_rate = 1e-4
lr_scheduler_step = 1
lr_scheduler_gamma = 0.9
num_epochs = 10

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_scheduler_step, gamma=lr_scheduler_gamma)

In [13]:
torch.cuda.empty_cache()
gc.collect()
for epoch in range(num_epochs):
    train(model, dataloader, optimizer, device, epoch)
    scheduler.step()
    model.save_pretrained('../models/lyrical-flan_t5_large')
    torch.cuda.empty_cache()
    gc.collect()
model.save_pretrained('../models/lyrical-flan_t5_large')
torch.cuda.empty_cache()
gc.collect()

Epoch 1 - Batch 0/5351 - Loss: 10.681533813476562
Epoch 1 - Batch 8/5351 - Loss: 14.086360931396484
Epoch 1 - Batch 16/5351 - Loss: 12.969615936279297
Epoch 1 - Batch 24/5351 - Loss: 13.816482543945312
Epoch 1 - Batch 32/5351 - Loss: 10.84506893157959
Epoch 1 - Batch 40/5351 - Loss: 13.249403953552246
Epoch 1 - Batch 48/5351 - Loss: 11.773417472839355
Epoch 1 - Batch 56/5351 - Loss: 13.160143852233887
Epoch 1 - Batch 64/5351 - Loss: 8.932929992675781
Epoch 1 - Batch 72/5351 - Loss: 8.956905364990234
Epoch 1 - Batch 80/5351 - Loss: 10.419461250305176
