In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['albums.csv', 'bands.csv', 'reviews.csv']


In [53]:
import string
import random

import numpy as np
import pandas as pd

import torch as tt
import torch.nn as nn

from torch.autograd import Variable

In [54]:
data = pd.read_csv('../input/bands.csv')

In [41]:
data

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002
5,6,13th Cadaver,United States,Changed name,2006.0,Death Metal,Death| Gore| Undead,2006-?| ?-2007 (as Splatter the Cadaver)| 2008...
6,7,1917,Argentina,Active,1994.0,Death Metal,Dark Philosophical Poetry| Art| Religion| Psyc...,1994-present
7,8,5th Column,United States,Active,2003.0,Death Metal,War| Death| Battles| Rape,2003-present
8,9,6 Feet Under,Germany,Split-up,,Death Metal,,
9,10,602,Russia,Active,2012.0,Death Metal,Cruelty of regimes| WWII| Death,2012-present


In [55]:
len(data)

37723

In [56]:
data.name.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
data = data.drop_duplicates(subset=['name'])

In [57]:
len(data)

32862

In [58]:
names = data['name'].tolist()
random.shuffle(names)
names = ' '.join(names)
train = names[:int(len(names)*0.8)]
valid = names[int(len(names)*0.8):]

In [47]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNNModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        batch_size = input.size(0)
        encoded = self.encoder(input)
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        return Variable(tt.zeros(self.n_layers, batch_size, self.hidden_size))

In [48]:
def char_tensor(string, all_characters=string.printable):
    
    tensor = tt.zeros(len(string)).long()
    
    for c in range(len(string)):
        
        try:
            tensor[c] = all_characters.index(string[c])
            
        except:
            pass
        
    return tensor

def random_training_set(chunk_len, batch_size, text):
    
    inp = tt.LongTensor(batch_size, chunk_len)
    target = tt.LongTensor(batch_size, chunk_len)
    
    for bs in range(batch_size):
        
        start_index = random.randint(0, len(text) - chunk_len)
        chunk = text[start_index : start_index + chunk_len + 1]
        
        inp[bs] = char_tensor(chunk[:-1])
        target[bs] = char_tensor(chunk[1:])
        
    inp, target = Variable(inp), Variable(target)
    
    return inp, target

def perplexity(x):
    return 2**x

In [49]:
def _train_epoch(inp, target, model, optimizer, criterion, curr_epoch):

    decoder.train()
    hidden = decoder.init_hidden(batch_size)
    decoder.zero_grad()
    
    train_loss, perplexities = 0, list()
    
    for c in range(chunk_len):
        
        optimizer.zero_grad()
        
        output, hidden = decoder(inp[:, c],
                                 hidden)
        loss = criterion(output.view(batch_size, -1),
                         target[:, c])
        perplexities.append(perplexity(loss.item()))
        
        current_loss = loss.data.cpu().detach().item()
        loss_smoothing = c / (c + 1)
        train_loss = loss_smoothing * train_loss + (1 - loss_smoothing) * current_loss
    
    loss.backward()
    optimizer.step()
    
    return train_loss, np.mean(perplexities)

def _test_epoch(inp, target, model, criterion):
    
    model.eval()
    
    epoch_loss, loss, perplexities = 0, 0, list()
    hidden = decoder.init_hidden(batch_size)
    
    with tt.no_grad():
        for c in range(chunk_len):
            output, hidden = decoder(inp[:, c],
                                     hidden)
            loss = criterion(output.view(batch_size, -1),
                             target[:, c])
            perplexities.append(perplexity(loss.item()))
            epoch_loss += loss.data.item()
    
    return epoch_loss / chunk_len, np.mean(perplexities)


def nn_train(model, train, valid, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):
    
    print('N-Epoch\tValid Loss\t Train Loss\tV.Perplexity\tT.Perplexity')
    
    prev_loss, es_epochs = 100500, 0
    
    train_losses, valid_losses = list(), list()
    
    for epoch in range(n_epochs):
        train_loss, train_per = _train_epoch(*random_training_set(chunk_len, 
                                                                      batch_size, 
                                                                      train),
                                             model,
                                             optimizer,
                                             criterion,
                                             epoch)
        
        valid_loss, valid_per = _test_epoch(*random_training_set(chunk_len, 
                                                                     batch_size, 
                                                                     valid),
                                            model,
                                            criterion)
        
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
            
        if epoch % 100 == 0 or epoch == n_epochs-1:
            print('%s \t %.6f \t %.6f \t %.6f \t %.6f' % (str(epoch),
                                                          valid_loss,
                                                          train_loss,
                                                          valid_per,
                                                          train_per))
        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0
            if es_epochs >= early_stopping:
                print('Training early stopping!')
                break
            prev_loss = min(prev_loss, valid_loss)

In [59]:
hidden_size, batch_size, chunk_len = 100, 32, 250

decoder = RNNModel(
    len(string.printable),
    hidden_size,
    len(string.printable))

optimizer = tt.optim.Adam(decoder.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [60]:
nn_train(decoder, train, valid, criterion, optimizer, n_epochs=1000, early_stopping=500)
tt.save(decoder, 'result.pt')

N-Epoch	Valid Loss	 Train Loss	V.Perplexity	T.Perplexity
0 	 4.400522 	 4.613569 	 21.136107 	 24.485654
100 	 2.751154 	 2.743814 	 6.835651 	 6.783820
200 	 2.649527 	 2.660450 	 6.386678 	 6.430016
300 	 2.643812 	 2.658888 	 6.347995 	 6.420855
400 	 2.632326 	 2.633144 	 6.335445 	 6.308012
500 	 2.622648 	 2.631407 	 6.270281 	 6.317585
600 	 2.589977 	 2.616214 	 6.116070 	 6.241401
700 	 2.638613 	 2.593129 	 6.334564 	 6.148325
800 	 2.664960 	 2.610439 	 6.490794 	 6.220190
900 	 2.580773 	 2.585797 	 6.090593 	 6.116938
999 	 2.632132 	 2.541971 	 6.335582 	 5.924576


  "type " + obj.__name__ + ". It won't be checked "


In [61]:
def generate(decoder, prime_str='\n', predict_len=15, temperature=0.8):
    hidden = decoder.init_hidden(1)
    prime_input = char_tensor(prime_str).unsqueeze(0)
    predicted = ''

    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:,p], hidden)
        
    inp = prime_input[:,-1]
    
    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = tt.multinomial(output_dist, 1)[0]
        predicted_char = string.printable[top_i]
        
        if predicted and predicted_char == '\n':
            break
        else:
            predicted += predicted_char
            inp = char_tensor(predicted_char).unsqueeze(0)

    return predicted

In [None]:
for x in range(50):
    print(generate(decoder))

Death Mysina Pr
 Kebreast Marpt
 Morpe Bre bout
ysistion The Mo
fethob Gesssd A
Egt Thm Keud Bu
st Deavesaris K
gPlition Potrut
 Aenrve West Ke
j Blogbem Ged B
 Mitaly Conctuh
 Mth Nuiate  of
s Besss Decto P
 XPtution Sufff
Ism Diade CSutu
6#umes of Septh
 The Gsteast Mi
ser Vition Aolt
atopt Mogod Spt
githe Dedartah 
 Putre Golt Pur
in Mor bark Sw 
potum th Wizaw 
 Markit Welales
 Punede Gamsss 
Rist Meratruss 
r Cecce Masstr 
