In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import random
from nltk.corpus import words
import pandas as pd
import torch
import spacy 
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
import torch.nn as nn
from torch import optim, Tensor
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
import math
import time
from typing import Tuple
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset, random_split
tokenizer = get_tokenizer("spacy", 'en')
config = {"model": DEFAULT_TOK2VEC_MODEL}
nlp = spacy.load("en_core_web_sm")
nlp.remove_pipe("tagger")
nlp.remove_pipe("senter")
nlp.remove_pipe("parser")
nlp.remove_pipe("lemmatizer")
nlp.remove_pipe("attribute_ruler")
nlp.remove_pipe("ner")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
corpus = set(words.words())
df = pd.read_csv("./processed_data/en_data.csv")




In [5]:
res = []
def preprocess_text(s):
    new_s = s.lower().replace("-", "")
    news = tokenizer(new_s)
    res = ["<SOS>"]
    for token in news:
    
        if token != " ":
            if token == "\r\n":
                res.append("<EOL>")
            elif token in corpus:
                res.append(token)
    res.append("<EOS>")
    return res

for x in tqdm(list(df.lyrics)):
    res.append(preprocess_text(x))


100%|██████████| 23571/23571 [00:13<00:00, 1732.77it/s]


In [6]:
df.lyrics = res 
df.to_csv('./model_data/lyrics_processed.csv')

In [7]:
import numpy as np 
from ast import literal_eval

class ChordsDataset(Dataset):
    def __init__(self, df, tok2vec):
        self.vectorizer = tok2vec
        self.lyrics = [literal_eval(str(c)) for c in df.lyrics]
        self.chords = [literal_eval(c) for c in df.chords] 
        self.chords_set = set([item for sublist in self.chords for item in sublist] + ["<SOS>", "<EOS>"])
        self.id2chord = {i:k for i,k in enumerate(self.chords_set)}
        self.word2vec = dict()
        self.chord2vec = dict()
        self.chord2id = {k:i for i,k in self.id2chord.items()}
    def __len__(self):
        return len(self.lyrics)
    def to_one_hot(self,chord):
        if chord in self.chord2vec:
            return self.chord2vec[chord]
        else:
            vec = np.zeros(len(self.chords_set))
            vec[self.chord2id[chord]] = 1
            self.chord2vec[chord] = vec
            return vec
    def vectorize(self, word):
        if word in self.word2vec:
            return self.word2vec[word]
        vec = np.zeros(99)
        if word == "<SOS>": # Start of sequence token
            vec[96] = 1
            self.word2vec[word] = vec
        elif word == "<EOL>": # End of line token 
            vec[97] = 1
            self.word2vec[word] = vec
        elif word == "<EOS>": # End of sequence token
            vec[98] = 1
            self.word2vec[word] = vec
        else:
            vec = np.append(self.vectorizer(str(word)).vector, [0,0,0])
            self.word2vec[word] = vec
        return vec
    def __getitem__(self,idx):

        return {
            "lyrics": torch.Tensor([self.vectorize(word) for word in (["<SOS>"] + self.lyrics[idx]+["<EOS>"])]),
            "chords": torch.Tensor([self.to_one_hot(chord) for chord in (["<SOS>"] + self.chords[idx]+["<EOS>"])])
            }
dataset = ChordsDataset(df,nlp)

In [69]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


In [67]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads, 
        num_encoder_layers,
        num_decoder_layers, 
        forward_expansion, 
        dropout, 
        max_length, 
        device 
    ):
        super().__init__()
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.src_position_embedding = PositionalEncoding( embedding_size,0.1,max_length)
        self.trg_position_embedding = PositionalEncoding( embedding_size,0.1,max_length)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads, 
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src: Tensor):
        src_mask  = src.transpose(0,1) == self.src_pad_idx
        return src_mask
    def forward(self, src, trg ):

        src_length , N = src.shape
        trg_length , N = trg.shape

        src_position = torch.arange(0,src_length).unsqueeze(1).expand(src_length,N)
        trg_position = torch.arange(0,trg_length).unsqueeze(1).expand(trg_length,N)

        print(src.shape,self.src_position_embedding(src_position).shape )
        embed_src = self.src_position_embedding(src) 
        embed_src = self.dropout( embed_src )
        embed_trg = self.dropout( self.trg_word_embedding(trg) + self.trg_position_embedding(trg_position) )
        return embed_src



LYR_VOCAB_SIZE = 99
EMB_SIZE = LYR_VOCAB_SIZE
CHORD_SIZE = len(dataset.chords_set) 
num_heads = 9
num_encoder_layers = 3
num_decoder_layers = 3
forward_expansion = 1
dropout = 0.1 
max_length = 7038

model = Transformer(EMB_SIZE,LYR_VOCAB_SIZE,CHORD_SIZE,15,num_heads,num_encoder_layers, num_decoder_layers, forward_expansion, dropout, max_length,device )

In [68]:
elem = dataset[15]
src,trg = elem["lyrics"], elem["chords"]
model(src,trg)

torch.Size([74, 99]) torch.Size([74, 2551, 99])


RuntimeError: The size of tensor a (74) must match the size of tensor b (2551) at non-singleton dimension 1

In [34]:

def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for _, test in enumerate(iterator):
        src, trg = test["lyrics"].to(device), test["chords"].to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0)  # turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10
CLIP = 1
train_iter = DataLoader(dataset, shuffle = True)
best_valid_loss = float('inf')
criterion = nn.NLLLoss()
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    # valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(
        f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    # print(
    #     f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# test_loss = evaluate(model, DataLoader(dataset, b), criterion)

# print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [44]:
import gensim.downloader as api
# weights = torch.FloatTensor(test)
# weights = torch.FloatTensor(gensim.models.Word2Vec.load("./word2vec_pretrain_v300.model"))
model = api.load("word2vec-google-news-300")
embedding = nn.Embedding.from_pretrained(torch.FloatTensor(model.vectors))
# model.forward(src)
# gensim.models.Word2Vec()

embedding.embedding_dim

300

In [46]:
embedding.num_embeddings
# embedding(torch.Tensor(1250).to(torch.int64))

3000000