In [75]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook 
import torch as tt
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

SEED = 42
np.random.seed(SEED)

In [77]:
df = pd.read_csv('bands.csv')

df.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


In [78]:
df = df.dropna(subset=['name'])

X = df.name.str.replace('∆', 'o')

X.shape

(37723,)

В данных есть названия групп на разных языках, поэтому эту задачу можно решать двумя способами: с учетом категории и без. Категорией в данном случае будем счить письменность, на которой написано название группы. Рассмотрим каждую из этих задач отдельно. 

# CATEGORY

В данных много диакритик и небуквенных символов, поэтому список букв чистился. 

source - https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

In [79]:
all_letters = '-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzΑΓΔΕΚΝΡΣΩέήαδηθκλμνοπρτψωόώІЇАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЪЭЮабвгдежзийклмнопрстуфхцчшъыьэяёクステノフモルヴ中人修健兀入冥出北卸古台吐呕咒围地复夜大天妖守射尸屠川巨彘影懺战戮手散斧昏晦术杀東梦楽死殺注活浮混火烂烟猝猿甲界症痋祝神禁福空突结羅者而胄脱腐色苔藓虎虐虚蛇血行覆謎诗语豚败轡邪郁針鉄铁閃陈雾霾靈颠餮饕骨魇黑 '

In [80]:
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters)

In [81]:
X_ = []


for index, item in enumerate(X):
    if item == 'שְׁאוֹל':
        continue
    
    ans = unicodeToAscii(item)
    if ans != '' and ans != ' ':  
        X_.append(ans)

Определение письменности

In [82]:
from alphabet_detector import AlphabetDetector

ad = AlphabetDetector()

In [83]:
ad.detect_alphabet(u'าาา')

{'THAI'}

In [84]:
s = [' ', '!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?','[', ']', '|']

In [85]:
from collections import defaultdict
    

category_lines = defaultdict(list)
all_categories = []

for line in X_:
    
    if all(item in s for item in line):
        category = 'INTEGER'
    else:
        category = ad.detect_alphabet(line)
        category = list(category)[0]
    
    if category not in all_categories:
        all_categories.append(category)
    category_lines[category].append(line)

n_letters = len(all_letters) + 1
n_categories = len(all_categories)

all_categories

['LATIN', 'INTEGER', 'GREEK', 'CYRILLIC', 'CJK', 'KATAKANA']

Есть также группы, которые названы чисто цифрами. Выделим их в отдельную категорию. 

In [87]:
category_lines['INTEGER']

['1917',
 '602',
 '9',
 '1:34',
 '420',
 '11:34',
 '12.7',
 '29/09',
 '6425',
 '731',
 '762',
 '908',
 '1914',
 '4.6',
 '468']

In [88]:
category_lines['KATAKANA']

['モノノフ', 'ルテクス']

Определение категории

In [89]:
def define_category(letter):
    
    global s, all_categories
    
    if letter in s:
        category = 'INTEGER'
    else:
        category = ad.detect_alphabet(letter)
        category = list(category)[0]

    return category

### MODEL

In [90]:
import random

In [91]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(n_categories + input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(n_categories + input_size + hidden_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        input_combined = tt.cat((category, input, hidden), 1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = tt.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return tt.zeros(1, self.hidden_size)

In [92]:
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingPair():
    
    global all_letters
    
    letter = random.choice(all_letters)
    category = define_category(letter)
    line = randomChoice(category_lines[category])
    return category, line

def randomTrainingPair():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    return category, line

In [93]:
def categoryTensor(category):
    li = all_categories.index(category)
    tensor = tt.zeros(1, n_categories)
    tensor[0][li] = 1
    return tensor


def inputTensor(line):
    tensor = tt.zeros(len(line), 1, n_letters)
    for li in range(len(line)):
        letter = line[li]
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor


def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return tt.LongTensor(letter_indexes)

In [94]:
def randomTrainingExample():
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor

In [95]:
def train(rnn, category_tensor, input_line_tensor, target_line_tensor):
    
    rnn.train()
    
    target_line_tensor.unsqueeze_(-1)
    hidden = rnn.initHidden()

    optimizer.zero_grad()
    loss = 0

    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
        l = criterion(output, target_line_tensor[i])
        loss += l

    loss.backward()
    optimizer.step()

    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item() / input_line_tensor.size(0)

In [102]:
max_length = 15

def sample(rnn, max_length):
    
    start_letter = random.choice(all_letters)
    category = define_category(start_letter)
    
    with tt.no_grad():  
        
        category_tensor = categoryTensor(category)
        input = inputTensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for i in range(max_length):
           
            output, hidden = rnn(category_tensor, input[0], hidden)
            topv, topi = output.topk(1)
            topi = topi[0][0]
    
            if topi == n_letters - 1:
                break
            else:
                letter = all_letters[topi]
                output_name += letter
            input = inputTensor(letter)

        return category, output_name


In [101]:
rnn = RNN(n_letters, 128, n_letters)
criterion = nn.NLLLoss()
learning_rate = 0.0005
optimizer = optim.Adam(rnn.parameters(), learning_rate)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 

t_par =  tqdm_notebook(range(1, n_iters + 1))

for itr in t_par:
    
    output, loss = train(rnn, *randomTrainingExample())
    total_loss += loss


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

In [106]:
for i in range(10):
    print(sample(rnn, max_length))

('CJK', '猿')
('LATIN', 'm')
('LATIN', 'f')
('GREEK', 'έ')
('LATIN', 't')
('GREEK', 'Ε')
('CJK', '胄')
('CJK', '呕')
('CJK', '古')
('CYRILLIC', 'э')


# LATIN

Теперь попробуем  генерировать в рамках латиницы, так как больше всего названий. Эта часть обрабатывалась на Kaggle.
source - https://github.com/spro/practical-pytorch/blob/master/char-rnn-generation/char-rnn-generation.ipynb

## DATA

In [110]:
df = pd.read_csv('/Users/Stoneberry/Desktop/assigment_7_Kostyanitsyna/text_table4.csv')

In [111]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,M Inc.,M Inc.
1,1,sic,sic
2,2,.F.O.A.D.,.F.O.A.D.
3,3,100 Suns,100 Suns
4,4,12 Days of Anarchy,12 Days of Anarchy


В данных есть нелатинские символы, до этого я уже отсеяла полностью нелатинские названия, но остались вские смешанные названия. Уберем их, чтобы не было названий типа "thуыа"

In [112]:
import re

data = []
s = 'йцукенгшщзхъёфывапролджэячсмитьбюΔΝΩαμψ覆颠'

for line in df.text:
    if all(item not in s for item in line) and len(line) > 2:
        line = re.sub('\n', '', line)
        if line[-1] == ' ':
            line = line[:-1]
        data.append(line)

In [113]:
all_letters = set()

for index, i in enumerate(data):
    if index != 0: all_letters |= set(i)
    else: all_letters = set(i)

all_letters = sorted(all_letters)
all_letters += ['<', '>', '#']
''.join(all_letters)

' -./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz<>#'

In [114]:
n_letters = len(all_letters)
max_len = len(max(data, key=lambda x: len(x))) + 1

n_letters, max_len 

(70, 51)

## ОБРАБОТКА ДАННЫХ

In [115]:
def char_tensor(string, typ='start'):
    
    tensor = tt.zeros(max_len).long()
    
    if typ == 'start': string = '<'+ string
    else: string = string + '>'
        
    length = len(string)
    
    for c in range(max_len):
        if c < length:
            tensor[c] = all_letters.index(string[c])
        elif c >= length:
            tensor[c] = all_letters.index('#') # padding

    return Variable(tensor)

In [9]:
char_tensor('string')

tensor([67, 59, 60, 58, 49, 54, 47, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
        69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
        69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69])

In [116]:
def inp_out(string):
    inp = char_tensor(string[:-1], typ='start')
    target = char_tensor(string[1:], typ='end')
    return inp, target

In [11]:
inp_out('string')

(tensor([67, 59, 60, 58, 49, 54, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
         69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
         69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69]),
 tensor([60, 58, 49, 54, 47, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
         69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
         69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69]))

In [12]:
inp_out('string')[0].shape

torch.Size([51])

In [13]:
all_data = [inp_out(i) for i in data]

In [14]:
import pickle

with open('tensor_data.pickle', 'wb') as handle:
    pickle.dump(all_data, handle)

### batching

In [15]:
n_batches = 1000
batch_size = 32
indices = np.random.choice(list(range(len(all_data))), size=len(all_data))
input_bat, output_bat = [], []
            
for j in range(n_batches):
    batch_idx = indices[j: j + batch_size]
    inp = [all_data[i][0] for i in batch_idx]
    out = [all_data[i][1] for i in batch_idx]
    input_bat.append(inp)
    output_bat.append(out)

In [16]:
batches = [(tt.stack(i),  tt.stack(output_bat[index])) for index, i in enumerate(input_bat)]
#output_bat = [tt.stack(i) for i in output_bat]

In [17]:
batches[0][0].shape

torch.Size([32, 51])

In [18]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(batches, test_size=0.2, random_state=42)

In [19]:
train[0]

(tensor([[67, 20, 52,  ..., 69, 69, 69],
         [67, 19, 60,  ..., 69, 69, 69],
         [67, 30, 41,  ..., 69, 69, 69],
         ...,
         [67, 27, 65,  ..., 69, 69, 69],
         [67, 18, 45,  ..., 69, 69, 69],
         [67, 16, 45,  ..., 69, 69, 69]]),
 tensor([[52, 49, 60,  ..., 69, 69, 69],
         [60, 45, 58,  ..., 69, 69, 69],
         [41, 49, 54,  ..., 69, 69, 69],
         ...,
         [65, 54, 44,  ..., 69, 69, 69],
         [45, 60, 45,  ..., 69, 69, 69],
         [45, 52, 46,  ..., 69, 69, 69]]))

# MODEL

In [117]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        
        super(MyModel, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop = tt.nn.Dropout(0.15)

        self.embed = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        
        batch_size = input.size(0)
        encoded = self.embed(input)
        
      #  encoded = self.drop(encoded)
        
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        
        # Input: (N, *, in_features)
        # Output: (N, *, out_features) 
        
        output = self.fc(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        return Variable(tt.zeros(self.n_layers, batch_size, self.hidden_size))

In [118]:
def _train_epoch(inp, target, model, optimizer, criterion, curr_epoch):

    model.train()
    hidden = model.init_hidden(batch_size)
    model.zero_grad()
    running_loss = 0
    perpl = []

    for c in range(chunk_len):
        
        optimizer.zero_grad()
        
        output, hidden = model(inp[:,c], hidden)
        loss = criterion(output.view(batch_size, -1), target[:,c])
        perpl.append(perplexity(loss.item()))
    
    perp = np.mean(perpl)
    loss.backward()
    optimizer.step()

    return perp

def _test_epoch(inp, target, model, criterion):
    
    model.eval()
    hidden = model.init_hidden(batch_size)
    loss = 0
    perpl = []
    
    with tt.no_grad():
        for c in range(chunk_len):
            output, hidden = model(inp[:,c], hidden)
            loss = criterion(output.view(batch_size, -1), target[:,c])
            perpl.append(perplexity(loss.item()))
    
    perp = np.mean(perpl)
    return perp


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):
    
    for epoch in tqdm_notebook(range(n_epochs)):
        
        for batch in train_iterator:
            x = batch[0]
            y = batch[1]
            train_per = _train_epoch(x, y, model, optimizer, criterion, epoch)
            #train_losses.append(train_loss)
            
        for batch in valid_iterator:
            x = batch[0]
            y = batch[1]    
            valid_per = _test_epoch(x, y, model, criterion)
            #valid_losses.append(valid_loss)
        
        print('epo:  ' + str(epoch))
        print('perp:  ' + str(valid_per))
        print('-----------------------------')

In [119]:
def perplexity(x):
    return 2**x

In [23]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model = MyModel(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [24]:
nn_train(model, train, valid, criterion, optimizer, n_epochs=10)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

epo:  0
perp:  4.330233298689488
-----------------------------
epo:  1
perp:  4.439578485674306
-----------------------------
epo:  2
perp:  4.509594391687567
-----------------------------
epo:  3
perp:  4.537983376399163
-----------------------------
epo:  4
perp:  4.561788085917148
-----------------------------
epo:  5
perp:  4.5728363641702
-----------------------------
epo:  6
perp:  4.573747194109011
-----------------------------
epo:  7
perp:  4.574938014065511
-----------------------------
epo:  8
perp:  4.576087942567768
-----------------------------
epo:  9
perp:  4.576880003242535
-----------------------------



## SAMPLING

In [120]:
def categoryTensor(letter):
    
    if len(letter) != 1:
        raise ValueError
        
    tensor = tt.zeros(1).long()
    tensor[0] = all_letters.index(letter)
    return Variable(tensor)

In [121]:
def sampling(model, prime_str='<', predict_len=10, temperature=0.8):
    
    def samp(model, prime_str='<', predict_len=10, temperature=0.8):
        
        hidden = model.init_hidden(1)
        inp = categoryTensor(prime_str)
        predicted = ''
        miss_count = 0
        
        for p in range(predict_len):
            output, hidden = model(inp, hidden)
            output_dist = output.data.view(-1).div(temperature).exp()
            top_i = tt.multinomial(output_dist, 1)[0]

            predicted_char = all_letters[top_i]
        
            if predicted != '' and predicted_char == '>': break
            if predicted != '' and predicted_char == '#': break
            if predicted == '' and predicted_char == '#': 
                inp = inp
                miss_count += 1
            if predicted == '' and predicted_char == '>': 
                inp = inp
                miss_count += 1
            else:
                predicted += predicted_char
                inp = categoryTensor(predicted_char)
        
        return predicted, miss_count
    
    predicted, miss_count = samp(model, prime_str=prime_str, predict_len=predict_len, temperature=temperature)
    
    if miss_count != 0:
        pred, miss_count = samp(model, prime_str=prime_str, predict_len=miss_count, temperature=temperature)
        predicted += pred
    return predicted

In [47]:
for _ in range(10):
    print(sampling(model))

L Qymt2pq2
K21ENq1TA:
iyFYX9Qfxk
ctjTc<G8k3
e:pYHfA28 
h8CkQhI/pO
/FJdFoeE:c
Jwu94c5jlg
Qyht89:5fH
90BsJOkNgx


Результаты плохие, попробуем выбирать рандомные примеры

## RANDOM

In [33]:
import random

In [122]:
def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):
    
    for epoch in tqdm_notebook(range(n_epochs)):
        
        x, y = random.choice(train_iterator)
        train_per = _train_epoch(x, y, model, optimizer, criterion, epoch)
        
            
        x, y = random.choice(valid_iterator)    
        valid_per = _test_epoch(x, y, model, criterion)


In [51]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model = MyModel(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [52]:
nn_train(model, train, valid, criterion, optimizer, n_epochs=1000)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

In [55]:
for _ in range(10):
    print(sampling(model))

vWoeCoIyk8
9EtsKZUOvJ
 dPytVI FA
f7naDot fH
Wca3TZzih<
cnDbf6Jp
2JPtZ5sTby
YcJ
-UGt-g6MKA
2YsL3hgfA3


 Все еще плохо, попробуем убрать увеличить размер текста 

Уберем тексты с  числами и пункутацией

In [123]:
import unicodedata
import string

all_letters = list(string.ascii_letters + '<' + '>' + '#')

In [124]:
''.join(all_letters)

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ<>#'

In [125]:
n_letters = len(all_letters)
n_letters

55

In [126]:
import re

data2 = []

for line in data:
    if all(item in all_letters for item in line) and len(line) > 2:
        data2.append(line.lower())

In [127]:
data2[:2]

['sic', 'abaddon']

## MAX_LEN CHANGE

In [128]:
max_len = 300

In [129]:
train = ''
valid = ''

indices = np.random.choice(list(range(len(data2))), size=len(data2))
stop = int(len(data2) * 0.8)
count = 0

for i in indices:
    if count >= stop:
        valid += '<' + data2[i] + '>'
    else:
        train += '<' + data2[i] + '>'
        count += 1

new_data = train + valid

In [130]:
len(new_data)

210631

In [131]:
len(train)

168404

In [132]:
def new_char_tensor(string):
    tensor = tt.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_letters.index(string[c])
    return tensor

In [133]:
new_char_tensor('string')

tensor([18, 19, 17,  8, 13,  6])

In [134]:
def data_set(max_len, batch_size, new_data):
    
    inp = tt.LongTensor(batch_size, max_len)
    out = tt.LongTensor(batch_size, max_len)
    indices = np.random.choice(list(range(len(new_data))), size=len(new_data)//max_len)
    
    i = 0
    
    for j in indices:
        if i ==  batch_size: break
        
        if j + max_len + 1 > len(new_data):
            continue
        else:
            text = new_data[j: j + max_len + 1]
            input = new_char_tensor(text[:-1])
            output = new_char_tensor(text[1:])
            inp[i] = input
            out[i] = output
            i += 1
        
    inp = Variable(inp)
    out = Variable(out)
    return inp, out

In [136]:
batch_size = 32
x, y = data_set(max_len, batch_size, train)

In [137]:
train[:10]

'<vitriol><'

In [196]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model = MyModel(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [197]:
def nn_train3(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):
    
    for epoch in tqdm_notebook(range(n_epochs)):
        
        x, y = data_set(max_len, batch_size, train_iterator)
        train_per = _train_epoch(x, y, model, optimizer, criterion, epoch)
        
            
        x, y = data_set(max_len, batch_size, valid_iterator)  
        valid_per = _test_epoch(x, y, model, criterion)
        print(valid_per)


In [198]:
nn_train3(model, train, valid, criterion, optimizer, n_epochs=100)

HBox(children=(IntProgress(value=0), HTML(value='')))

13.442764369538397
11.016835293179643
9.049129436091553
7.848327039466904
7.50372828060921
7.32471554633364
7.131663477711145
6.927224646063294
6.750126332605114
6.622507866841935
6.546723649236074
6.4902594435236525
6.374162110316436
6.248916602804617
6.314379205446507
6.221398127919374
6.1974883750119325
6.068850438084599
6.074577576790768
5.969511381873059
5.936788018204073
5.822415484276597
5.920210211540478
6.019472412098903
6.021408905936869
5.884224793706914
5.887912776910287
5.902992221751512
5.991311687672449
5.991876649286415
5.862071838637773
5.838115705580036
5.77286453052311
5.820449746194766
5.946882546159231
5.975299456730269
5.995682876310205
5.722760027073354
5.6769987353101286
5.6372679809441895
5.487390971070689
5.428610345035017
5.412538012114933
5.476085254679203
5.54921799600325
5.6251230564579755
5.659331489354039
5.496087859698343
5.507039151621104
5.445600088869327
5.492245802583982
5.355178757466656
5.483835135740489
5.509476562884253
5.4328760996076655
5.3165

In [199]:
for _ in range(10):
    print(sampling(model))

ensheroh
sonel
pasliin
kerombamon
worpy
dsioc
utsk
popumon
unte
ata


уже лучше, увеличим кол-во эпох

In [138]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model = MyModel(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [139]:
def nn_train3(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):
    
    count = 0
    
    for epoch in tqdm_notebook(range(n_epochs)):
        
        x, y = data_set(max_len, batch_size, train_iterator)
        train_per = _train_epoch(x, y, model, optimizer, criterion, epoch)
        
            
        x, y = data_set(max_len, batch_size, valid_iterator)  
        valid_per = _test_epoch(x, y, model, criterion)
        
        if epoch == count:
            print(valid_per)
            count += 100
    print(valid_per)

In [140]:
nn_train3(model, train, valid, criterion, optimizer, n_epochs=1000)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

13.262463421261863
5.197899052188886
5.204879074500201
5.279704560455544
5.062176841685019
5.139055042789982
5.069818501508898
5.110572355752592
5.230344013985942
5.189163615230626
5.1027753414919195


In [143]:
for _ in range(10):
    print(sampling(model))

emal
onelestrop
shymazzor
racedathed
acrazodete
chraphafod
wissiste
maden
usgell
dhjate


In [145]:
import pickle 

with open('model_1000.pickle', 'wb') as handle:
    pickle.dump(model, handle)

Попробуем с дропаутом теперь

In [146]:
class MyModel2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        
        super(MyModel2, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop = tt.nn.Dropout(0.15)

        self.embed = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        
        batch_size = input.size(0)
        encoded = self.embed(input)
        
        encoded = self.drop(encoded)
        
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        
        #output = self.drop(output) 
        
        output = self.fc(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        return Variable(tt.zeros(self.n_layers, batch_size, self.hidden_size))

In [147]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model2 = MyModel2(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model2.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [148]:
nn_train3(model2, train, valid, criterion, optimizer, n_epochs=1000)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

13.269349840922414
5.265471763584202
5.138988457471356
5.165397204902967
5.129537323944187
5.278714143441692
5.08598359373134
5.2558068389448325
5.187435393140812
5.190252445472418
5.028386673491473


In [150]:
for _ in range(10):
    print(sampling(model2))

dated
neace
dyaratoron
ortiatii
deltici
idtian
llic
newron
dare
death


Кажется, еще лучше

In [151]:
with open('model_drop_1000.pickle', 'wb') as handle:
    pickle.dump(model2, handle)

тут я пробовала дров делать и на итог работы rnn

In [235]:
hidden_size = 100
batch_size = 32
chunk_len = max_len 


model3 = MyModel2(n_letters, hidden_size, n_letters)


optimizer = tt.optim.Adam(model3.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [236]:
nn_train3(model3, train, valid, criterion, optimizer, n_epochs=1000)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

13.827635430007668
5.33970571906651
5.206314249646281
5.198968787425641
5.239486333616171
5.1483137585106755
4.963710547342833
5.19802997387168
5.145028971493174
5.0719664779966545
5.0055777476453445


In [238]:
for _ in range(10):
    print(sampling(model3))

spratorve
toryst
dormende
sophol
sthatoreri
corek
oryent
kaalhot
nesparum
viganli


In [None]:
with open('model_drop_1000.pickle', 'wb') as handle:
    pickle.dump(model3, handle)