In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch import optim, Tensor
import math
import numpy as np 
from ast import literal_eval
from tqdm.notebook import tqdm
from chords_dataset import ChordsDataset
from model_helpers import NLP



# Everything in this notebook runs on the CPU, whether or not CUDA is available.
# NOTE(review): if GPU training is intended, select "cuda" when
# torch.cuda.is_available() is true — confirm memory headroom first.
device = torch.device("cpu")

# Load the pre-processed lyrics table and wrap it in the project dataset class.
df = pd.read_csv('./model_data/lyrics_processed.csv')
dataset = ChordsDataset(df, NLP)



In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``(maxlen, 1, emb_size)`` is precomputed once and stored
    as a (non-trainable) buffer. Even columns hold ``sin(pos * freq)`` and odd
    columns hold ``cos(pos * freq)`` with ``freq = 10000 ** (-2i / emb_size)``.

    Args:
        emb_size: Size of the last (feature) dimension of the embeddings.
            Both even and odd sizes are supported.
        dropout: Dropout probability applied after the positions are added.
        maxlen: Number of positions to precompute; inputs must not be longer
            than this along their first dimension.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        # One inverse frequency per (sin, cos) column pair.
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pos_embedding[:, 1::2] = torch.cos(angles[:, : emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the batch axis.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # A buffer follows the module across .to(device) but is not trained.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        Positions are taken along dimension 0 of ``token_embedding``; the
        table is broadcast over dimension 1.
        """
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])

class Transformer(nn.Module):
    """Sequence-to-sequence model built on ``nn.Transformer``.

    The source sequence is consumed as ready-made feature vectors (no source
    embedding layer is applied); the target sequence is given as integer
    indices that are embedded here. Both get sinusoidal positional encodings.

    Args:
        embedding_size: Feature size of the source vectors and of the target
            embeddings (the transformer's ``d_model``).
        src_vocab_size: Accepted for interface compatibility; not used.
        trg_vocab_size: Number of target classes (embedding rows and output
            logits).
        src_pad_idx: Stored on the instance; not used when building masks.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        forward_expansion: Passed to ``nn.Transformer`` unchanged as
            ``dim_feedforward`` (the feed-forward hidden width).
            NOTE(review): the name suggests a multiplier of
            ``embedding_size`` — confirm the intended width.
        dropout: Dropout probability used by the transformer, the positional
            encodings and the extra dropout applied to the embeddings.
        max_length: Longest sequence the positional encodings can handle.
        device: Device on which all sub-modules and masks are placed.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_length,
        device
    ):
        super().__init__()
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size).to(device)
        # Use the configured dropout rate rather than a hard-coded 0.1.
        self.src_position_embedding = PositionalEncoding(embedding_size, dropout, max_length).to(device)
        self.trg_position_embedding = PositionalEncoding(embedding_size, dropout, max_length).to(device)
        self.device = device
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        ).to(device)
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size).to(device)
        self.dropout = nn.Dropout(dropout).to(device)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src: Tensor):
        """Return an all-zero key-padding mask of shape ``(batch, src_len)``.

        ``src`` is read as ``(src_len, batch, ...)``. An additive mask of
        zeros masks nothing: the source holds feature vectors, so there is no
        index to compare against ``src_pad_idx``.
        """
        src_len = src.shape[0]
        # Fall back to a batch of one when src has no explicit batch axis.
        batch_size = src.shape[1] if src.dim() > 1 else 1
        return torch.zeros((batch_size, src_len)).to(self.device)

    def forward(self, src, trg):
        """Compute target-class logits.

        Args:
            src: Source features, ``(src_len, batch, embedding_size)``.
            trg: Target indices, ``(trg_len, batch)``; cast to int64 here.

        Returns:
            Logits of shape ``(trg_len, batch, trg_vocab_size)``.
        """
        # Rely on the instance's device, not on a notebook-level global.
        src = src.to(self.device)
        embed_trg = self.trg_word_embedding(trg.to(torch.int64).to(self.device))

        # Note: dropout is applied inside the positional encoding and once
        # more here, as in the original design.
        embed_src = self.src_position_embedding(src)
        embed_src = self.dropout(embed_src).to(self.device)
        embed_trg = self.dropout(self.trg_position_embedding(embed_trg)).to(self.device)

        padding_mask = self.make_src_mask(src).to(self.device)
        # Causal mask: position i of the target may only attend to positions <= i.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.shape[0]).to(self.device)
        output = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=padding_mask,
            tgt_mask=trg_mask
        )
        return self.fc_out(output)


In [5]:

# --- Hyper-parameters -------------------------------------------------------
LYR_VOCAB_SIZE = 100
EMB_SIZE = LYR_VOCAB_SIZE        # source vectors are fed to the model as-is
CHORD_SIZE = len(dataset.chords_set)
num_heads = 5                    # must divide EMB_SIZE
num_encoder_layers = 3
num_decoder_layers = 3
forward_expansion = 1            # forwarded to the model's feed-forward size argument
dropout = 0.1
max_length = 7038                # upper bound on sequence length for positional encodings

# --- Model, optimiser, loss -------------------------------------------------
model = Transformer(
    embedding_size=EMB_SIZE,
    src_vocab_size=LYR_VOCAB_SIZE,
    trg_vocab_size=CHORD_SIZE,
    src_pad_idx=15,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_length=max_length,
    device=device,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss().to(device)

NB_EPOCHS = 3

# --- Training: one song per step (batch size 1) ------------------------------
for epoch in range(NB_EPOCHS):
    print(f"[Epoch {epoch}/ {NB_EPOCHS}]")
    model.train()
    for i, elem in tqdm(enumerate(dataset)):
        # (seq_len, 1, EMB_SIZE) lyric vectors and (seq_len, 1) chord ids.
        source = elem["lyrics"].reshape(-1, 1, EMB_SIZE).to(device)
        target = elem["chords"].reshape(-1, 1)

        optimizer.zero_grad()
        # NOTE(review): the decoder input and the loss target are the same,
        # unshifted sequence, so each position can see the chord it must
        # predict — confirm whether a one-step shift was intended.
        output = model(source, target).reshape(-1, CHORD_SIZE)
        target_one_hot = torch.Tensor(
            [dataset.to_one_hot(int(chord)) for chord in target]
        ).to(device)
        loss = criterion(output, target_one_hot)
        loss.backward()
        # In-place variant; the un-suffixed clip_grad_norm is deprecated.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        if i % 500 == 0:
            print(f"At i ={i} : ", loss.item())

[Epoch 0/ 3]


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  "lyrics": Tensor([self.vectorize(word) for word in  self.lyrics[idx] ]),
  torch.nn.utils.clip_grad_norm(model.parameters(), max_norm= 1)


At i =0 :  7.8258891105651855
At i =500 :  0.5942382216453552


KeyboardInterrupt: 