In [1]:
GUI = True
from tqdm import tqdm as t

tqdm = t if GUI else lambda x: x

In [5]:
from torchtext.vocab import GloVe, FastText
vectors = GloVe(name='6B', dim=300)
# vectors = FastText(language='en')



In [6]:
import pandas as pd
import mapply
mapply.init(n_workers=20, progressbar=GUI)

clear = pd.read_feather('../data/lyrics_cleaned.feather')
clear = clear[clear['tag'] != 'misc']

clear['TAG'] = clear['tag']

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
oh = OneHotEncoder()
clear['genre'] = le.fit_transform(clear['TAG'])
clear['genre'] = oh.fit_transform(clear['genre'].values.reshape(-1, 1)).toarray().tolist()

ds = clear[['tokens', 'genre']]

from sklearn.model_selection import train_test_split

dev, test = train_test_split(ds, test_size=0.2, stratify=ds['genre'], random_state=0)
train, val = train_test_split(dev, test_size=0.25, stratify=dev['genre'], random_state=0)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:

import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_TOKENS = 150

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def vectorize(batch):
  X, y = list(zip(*batch))
  X_tensor = [vectors.get_vecs_by_tokens(x[:MAX_TOKENS].tolist(), lower_case_backup=True) for x in X]
  X_tensor = pad_sequence(X_tensor, batch_first=True)
  y_tensor = torch.tensor(y)
  X_tensor = X_tensor.to(DEVICE)
  y_tensor = y_tensor.to(DEVICE)
  return X_tensor, y_tensor

train_tensor = DataLoader((train.values), batch_size=1024, shuffle=True, collate_fn=vectorize)
val_tensor = DataLoader((val.values), batch_size=1024, shuffle=True, collate_fn=vectorize)
test_tensor = DataLoader((test.values), batch_size=1024, shuffle=True, collate_fn=vectorize)

In [None]:
from torch.nn import GRU, CrossEntropyLoss

EMBED_DIM = vectors['unk'].shape[0]
N_CLASSES = len(clear.iloc[0]['genre'])

class RNN(torch.nn.Module):
  def __init__(self, input_size, hidden_size, output_size, gru_layers=1, dropout=0.5):
    super(RNN, self).__init__()
    self.n_layers = gru_layers
    self.hidden_size = hidden_size
    self.gru = GRU(input_size, hidden_size, num_layers=self.n_layers, batch_first=True, dropout=dropout)
    self.mp = torch.nn.MaxPool1d(MAX_TOKENS)
    self.fc = torch.nn.Linear(hidden_size, output_size)
    self.softmax = torch.nn.Softmax(dim=1)
    
  def forward(self, x):
    h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_size).to(DEVICE)
    output, _ = self.gru(x, h0)
    # output = output[:, -1, :]                           # last hidden state
    output = self.mp(output.permute(0, 2, 1))[:, :, -1] # max pooling of all hidden states
    fc = self.fc(output)
    sm = self.softmax(fc)
    return sm
  
model = RNN(EMBED_DIM, 32, N_CLASSES, 1).to(DEVICE)
criterion = CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

from sklearn.metrics import accuracy_score

def train_epoch(model, criterion, optimizer, train_tensor):
  model.train()
  total_loss = 0
  total_acc = 0
  for X, y in train_tensor:
    optimizer.zero_grad()
    y_pred = model(X)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    total_acc += accuracy_score(y.argmax(1).cpu().numpy(), y_pred.argmax(1).cpu().numpy())
  return total_loss / len(train_tensor), total_acc / len(train_tensor)

def eval_epoch(model, criterion, val_tensor):
  model.eval()
  total_loss = 0
  total_acc = 0
  with torch.no_grad():
    for X, y in val_tensor:
      y_pred = model(X)
      loss = criterion(y_pred, y)
      total_loss += loss.item()
      total_acc += accuracy_score(y.argmax(1).cpu().numpy(), y_pred.argmax(1).cpu().numpy())
  return total_loss / len(val_tensor), total_acc / len(val_tensor)

EPOCHS = 100

for epoch in range(EPOCHS):
  train_loss, train_acc = train_epoch(model, criterion, optimizer, tqdm(train_tensor))
  val_loss, val_acc = eval_epoch(model, criterion, val_tensor)
  print(f'Epoch {epoch + 1}/{EPOCHS} - Train Loss: {train_loss:.4f} - Train Acc: {train_acc:.4f} - Val Loss: {val_loss:.4f} - Val Acc: {val_acc:.4f}')

# test_loss, test_acc = eval_epoch(model, criterion, test_tensor)
# print(f'Test Loss: {test_loss:.4f} - Test Acc: {test_acc:.4f}')


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:25<00:00,  1.63it/s]


Epoch 1/100 - Train Loss: 1.4575 - Train Acc: 0.4459 - Val Loss: 1.3714 - Val Acc: 0.5282


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:19<00:00,  2.12it/s]


Epoch 2/100 - Train Loss: 1.3527 - Train Acc: 0.5424 - Val Loss: 1.3456 - Val Acc: 0.5511


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:19<00:00,  2.14it/s]


Epoch 3/100 - Train Loss: 1.3258 - Train Acc: 0.5737 - Val Loss: 1.3357 - Val Acc: 0.5582


 62%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                              | 26/42 [00:12<00:07,  2.09it/s]