# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  


Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import os
import numpy as np
import scipy as sp
import pandas as pd
import time
import random
import string
from tqdm import tqdm_notebook as tqdm
import torch
from torch.nn import Embedding, GRU, RNN, LSTM, Linear, Dropout, CrossEntropyLoss
import torch.nn as nn
from torch.autograd import Variable

In [19]:
def letter_tensor(word):
    tensor = torch.zeros(len(word)).long()
    for letter in range(len(word)):
            tensor[letter] = all_letters.index(word[letter])
    return tensor

In [21]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=2, dropout=0):
        super(GRU, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = Dropout(dropout)
        self.encoder = Embedding(input_size, hidden_size)
        self.rnn = GRU(hidden_size, hidden_size, n_layers, dropout)
        self.decoder = Linear(hidden_size, output_size)

    def init_hidden(self, batch):
        return Variable(torch.zeros(self.n_layers, batch, self.hidden_size))
    
    def forward(self, input, hidden):
        batch= input.size(0)
        encoded = self.encoder(input)
        output, hidden = self.rnn(encoded.view(1, batch, -1), hidden)
        output = self.decoder(output.view(batch, -1))
        return output, hidden

In [24]:
df = pd.read_csv('bands.csv', index_col = 'id')
df.head(20)

Unnamed: 0_level_0,name,country,status,formed_in,genre,theme,active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002
6,13th Cadaver,United States,Changed name,2006.0,Death Metal,Death| Gore| Undead,2006-?| ?-2007 (as Splatter the Cadaver)| 2008...
7,1917,Argentina,Active,1994.0,Death Metal,Dark Philosophical Poetry| Art| Religion| Psyc...,1994-present
8,5th Column,United States,Active,2003.0,Death Metal,War| Death| Battles| Rape,2003-present
9,6 Feet Under,Germany,Split-up,,Death Metal,,
10,602,Russia,Active,2012.0,Death Metal,Cruelty of regimes| WWII| Death,2012-present


In [31]:
groups_names = df['name'].tolist()
random.shuffle(groups_names)

with open('bands.txt', 'a') as f:
    for name in groups_names:
        f.write(name + '\n')
        
for file in ['bands.txt', 'train_bands.txt', 'valid_bands.txt']:
    for name in ['f1', 'f2', 'f3']:
        with open(file, 'r+') as name:
            lines = f.readlines()
            i = 1
            for l in lines:
                if i < int(0.9 * len(lines)):
                    f1.write(l)
                else:
                    f2.write(l)
                i += 1

In [32]:
with open('train_bands.txt', 'r') as file:
    train = file.read()
    train_len = len(train)
    
with open('valid_bands.txt', 'r') as file:
    valid = file.read()
    valid_len = len(valid)

In [10]:
batch = 64
hidden_size = 100
cut_len = 150
all_letters = string.printable
num_letters = len(all_letters)

optimizer = torch.optim.RMSprop(decoder.parameters(), lr = 0.01)
metric = CrossEntropyLoss()
decoder = GRU(num_letters, hidden_size, num_letters)

In [15]:
def train(input_, target, model, optimizer, metric, curr_epoch):
    perplexity_list = []
    current_loss = 0
    decoder.train()
    hidden = decoder.init_hidden(batch)
    decoder.zero_grad()
    for i in range(cut_len):
        optimizer.zero_grad()
        output, hidden = decoder(input_[:,i], hidden)
        loss = metric(output.view(batch, -1), target[:,i])
        perplexity_list.append(2**(loss.item()))
        curr_loss = loss.data.cpu().detach().item()
        current_loss = (i / (i + 1)) * current_loss + (1 - i / (i + 1)) * curr_loss
    perplexity = np.mean(perplexity_list)
    loss.backward(), optimizer.step()
    return current_loss, perplexity

def test(input_, target, model, metric):
    model.eval()
    perplexity_list = []
    loss, epoch_loss = 0, 0
    hidden = decoder.init_hidden(batch)
    with torch.no_grad():
        for i in range(cut_len):
            output, hidden = decoder(input_[:,i], hidden)
            loss = metric(output.view(batch, -1), target[:,i])
            perplexity_list.append(2**(loss.item()))
            epoch_loss += loss.data.item()
    perplexity = np.mean(perplexity_list)
    return epoch_loss / cut_len, perplexity

def train_sample(cut_len, batch, file, file_len):
    input_, target  = torch.LongTensor(batch, cut_len), torch.LongTensor(batch, cut_len)
    for i in range(batch):
        start = random.randint(0, file_len - cut_len)
        end = start + cut_len + 1
        input_[i] = letter_tensor(file[start:end][:-1])
        target[i] = letter_tensor(file[start:end][1:])
    input_, target = Variable(input_), Variable(target)
    return input_, target

def train_nn(model, metric, optimizer, n_epochs=300, early_stopping=0, scheduler=None):
    train_losses = []
    valid_losses = []
    hist = pd.DataFrame()
    prev_loss = float('inf')
    for epoch in tqdm(range(n_epochs)):
        train_loss, train_per = train(*train_sample(300, batch, train, train_len),model, optimizer, metric, epoch)
        valid_loss, valid_per = test(*train_sample(300, batch, valid, valid_len), model, metric)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if epoch % 100 == 0 or epoch == n_epochs-1:
            print('Epoch %d,  val_loss %.4f, train_loss %.4f, val_perplexity %.4f,  train_perplexity %.4f' % (epoch+1,
                                                                          valid_loss, train_loss, train_per, valid_per))
            

def name_generator(decoder, grad=0.75, pred_len=25):
    hidden = decoder.init_hidden(1)
    prime_input = Variable(letter_tensor('\n').unsqueeze(0))
    prediction = str('')
    prediction, hidden = decoder(prime_input[:,w], hidden) 
    input_ = prime_input[:,-1]
    for k in range(pred_len):
        output, hidden = decoder(input_, hidden)
        output_dist = output.data.view(-1).div(grad).exp()
        top_i = torch.multinomial(output_dist, 1)[0]
        pred_letter = all_letters[top_i]
        if not (pred_letter == '\n' and prediction != ''):
            prediction += pred_letter
            input_ = Variable(letter_tensor(pred_letter).unsqueeze(0))
        else:
            break
    return prediction

In [16]:
# 2 hours training
train_nn(decoder, criterion = metric, optimizer, n_epochs=500)
save_filename = os.path.splitext(os.path.basename('bands_names'))[0] + '.pth'
torch.save(decoder, save_filename)

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch 0 , val_loss 4.4256, train_loss 4.6102, val_perplexity 24.4294, train_perplexity 21.5084 
Epoch 100, val_loss 2.7549, train_loss 2.7827, val_perplexity 6.9688, train_perplexity 6.8186 
Epoch 200, val_loss 2.7462, train_loss 2.7561, val_perplexity 6.9532, train_perplexity 6.7092 
Epoch 300, val_loss 2.6853, train_loss 2.6092, val_perplexity 6.8051, train_perplexity 6.5898 
Epoch 400, val_loss 2.6752, train_loss 2.5847, val_perplexity 6.7863, train_perplexity 6.4529 
Epoch 500, val_loss 2.6431, train_loss 2.5623, val_perplexity 6.7762, train_perplexity 6.4389 





In [18]:
for i in range(10):
    print(name_generator(decoder=torch.load('bands_names.pth')))

Lvicide
Sacreanis
SNoder Der
Axgicicridiento
Ixdoti Dowd
Thes
Denich Dengolac
Desabh Patr
Sykack
Dbicem
