In [57]:
#!pip install torch torchvision tqdm annoy gensim
import pandas as pd
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from random import sample
import random
from collections import defaultdict
from pathlib import Path
import networkx as nx
import pickle
from itertools import permutations, chain
import numpy as np
from random import sample
from gensim.models.keyedvectors import KeyedVectors
#from gensim.similarities.index import AnnoyIndexer
from scipy.stats import spearmanr
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
import unicodedata
import re
import string
import time
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device( "cuda" if torch.cuda.is_available() else "cpu")

embedding = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=5000)

def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    
# 大文字を全部小文字にする
# 無駄な空白や文字じゃないやつを全部消す
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r"", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r"", s)
    return s

# あらかじめ入力する単語を綺麗にしておく
# 全て小文字にして、a-z以外の単語を取り除く
normalized_words = set([normalizeString(word) for word in embedding.vocab.keys()]) & embedding.vocab.keys()

n_train = int(len(normalized_words) * 0.9)
n_validate = int(len(normalized_words) * 0.05)
n_test = len(normalized_words) - n_train - n_validate

all_words = list(normalized_words)
train_words = set(sample(all_words, n_train))
validate_words = set(sample(list(set(all_words) - train_words), n_validate))
test_words = normalized_words - train_words - validate_words


# モデルパラメータの設定
all_letters = string.ascii_lowercase
n_letters = len(all_letters)
input_size = n_letters
        
def letter2tensor(letter):
    return all_letters.index(letter)

def letter2onehot(letter):
    tensor = torch.zeros(1, n_letters, device=device)
    tensor[0][all_letters.find(letter)] = 1
    return tensor

def word2input_tensors(word):
    return torch.tensor([letter2tensor(letter) for letter in word]) * 1.0

def word2input_one_hot(word):
     return torch.cat([letter2onehot(l) for l in word], dim=0)
    
def word2target_tensor(word):
    return torch.from_numpy(embedding[word]).view(1, -1).to(device)

#calc the time
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [58]:
class GRUEncoder_2(nn.Module):
    def __init__(self, emb_dim, h_dim, output_dim):
        super(GRUEncoder_2, self).__init__()
        self.emb_dim = emb_dim
        self.h_dim = h_dim
        self.output_dim = output_dim
        
        self.emb = nn.Embedding(27, self.emb_dim) # a-z, paddingの合計27種類
        self.gru = nn.LSTM(self.emb_dim, self.h_dim, batch_first = True, dropout = 0.3)
        self.affin_prob_pre = nn.Linear(self.h_dim, 1)
        self.threshold = nn.Threshold(0.20, 0)
        self.affin = nn.Linear(self.h_dim, self.output_dim)
                
    def init_hidden(self, b_size):
        h0 = torch.zeros(1, b_size, self.h_dim, device=device)
        return (h0, h0)
    
    def forward(self, words, lengths, b_size):
        
        self.hidden, self.cell = self.init_hidden(words.size(0))
        embed = self.emb(words)
        lengths = lengths.view(-1)
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)      
        output, hidden = self.gru(packed_input, (self.hidden, self.cell))
        
        output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0]
        torch_sum = torch.sum(output, dim = 2)
        
        output_splitter_prob = F.softmax(torch_sum)
        output_weighted = torch.bmm(output_splitter_prob.view(output_splitter_prob.size(0), -1,output_splitter_prob.size(1)), output)
        output = self.affin(output_weighted.view(-1,self.h_dim)) 
        
        return output, output_splitter_prob

In [59]:
class GRUEncoder_3(nn.Module):
    def __init__(self, emb_dim, h_dim, output_dim):
        super(GRUEncoder_3, self).__init__()
        self.emb_dim = emb_dim
        self.h_dim = h_dim
        self.output_dim = output_dim
        
        self.emb = nn.Embedding(27, self.emb_dim) # a-z, paddingの合計27種類
        self.gru = nn.LSTM(self.emb_dim, self.h_dim, 2, batch_first = True, dropout = 0.3, bidirectional = True)
        self.affin_prob_pre = nn.Linear(self.h_dim, 1)
        self.threshold = nn.Threshold(0.20, 0)
        self.affin = nn.Linear(self.h_dim * 2, self.output_dim)
                
    def init_hidden(self, b_size):
        h0 = torch.zeros(1 * 2 * 2, b_size, self.h_dim, device=device)
        return (h0, h0)
    
    def forward(self, words, lengths, b_size):
        
        self.hidden, self.cell = self.init_hidden(words.size(0))
        embed = self.emb(words)
        lengths = lengths.view(-1)
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)
        
        #lstm
        output, hidden = self.gru(packed_input, (self.hidden, self.cell))
        output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0]
        torch_sum = torch.sum(output, dim = 2)
        
        output_splitter_prob = F.softmax(torch_sum)
        output_splitter_prob = self.threshold(F.relu(output_splitter_prob))
        output_weighted = torch.bmm(output_splitter_prob.view(output_splitter_prob.size(0), -1,output_splitter_prob.size(1)), output)
        output = self.affin(output_weighted.view(-1,self.h_dim * 2)) 
        
        return output, output_splitter_prob

In [60]:
class GRUEncoder_4(nn.Module):
    def __init__(self, emb_dim, h_dim, output_dim):
        super(GRUEncoder_4, self).__init__()
        self.emb_dim = emb_dim
        self.h_dim = h_dim
        self.middle_dim = 100 # middle dim
        self.output_dim = output_dim
        
        self.emb = nn.Embedding(27, self.emb_dim) # a-z, paddingの合計27種類
        self.gru = nn.LSTM(self.emb_dim, self.h_dim, 2, batch_first = True, dropout = 0.3, bidirectional = True)
        self.affin_middle = nn.Linear(self.h_dim * 2, self.middle_dim * 2)
        self.affin = nn.Linear(self.middle_dim * 2, 1)
        self.threshold = nn.Threshold(0.2, 0.0)
        self.last_affin = nn.Linear(self.h_dim * 2, self.h_dim)
        
    def init_hidden(self, b_size):
        h0 = torch.zeros(1 * 2 * 2, b_size, self.h_dim, device=device)
        return (h0, h0)
    
    def forward(self, words, lengths, b_size):
        
        self.hidden, self.cell = self.init_hidden(words.size(0))
        embed = self.emb(words)
        lengths = lengths.view(-1)
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)
        
        #lstm
        output, hidden = self.gru(packed_input, (self.hidden, self.cell))
        output_lstm = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0] # batch * seq_len * hidden*2
        
        output_middle = self.affin_middle(output_lstm)
        output_middle_activated = F.tanh(output_middle)
        output_sum = self.affin(output_middle_activated)
        output_splitter_prob = F.softmax(output_sum,dim = 1)
        output_splitter_prob = self.threshold(output_splitter_prob.view(output_splitter_prob.size(0), -1))
        output_weighted = torch.bmm(output_splitter_prob.view(output_splitter_prob.size(0), 1, -1), output_lstm)
        output_weighted = self.last_affin(output_weighted)
        
        return output_weighted, output_splitter_prob

In [61]:
class GRUEncoder_4_k(nn.Module):
    def __init__(self, emb_dim, h_dim, output_dim):
        super(GRUEncoder_4_k, self).__init__()
        self.emb_dim = emb_dim
        self.h_dim = h_dim
        self.middle_dim = 100 # middle dim
        self.output_dim = output_dim
        
        self.emb = nn.Embedding(27, self.emb_dim) # a-z, paddingの合計27種類
        self.gru = nn.LSTM(self.emb_dim, self.h_dim, 2, batch_first = True, dropout = 0.3, bidirectional = True)
        
        self.main = nn.Sequential(
            nn.Linear(self.h_dim, self.middle_dim),
            nn.Dropout(p = 0.3, inplace = True),
            nn.LeakyReLU(True),
            nn.Linear(self.middle_dim, 1)
        )
        self.last_affin = nn.Linear(self.h_dim * 2, self.h_dim)
        
    def init_hidden(self, b_size):
        h0 = torch.zeros(1 * 2 * 2, b_size, self.h_dim, device=device)
        return (h0, h0)
    
    def forward(self, words, lengths, b_size):
        
        self.hidden, self.cell = self.init_hidden(words.size(0))
        embed = self.emb(words)
        lengths = lengths.view(-1)
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)
        
        #lstm
        output, hidden = self.gru(packed_input, (self.hidden, self.cell))
        output_lstm = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0] # batch * seq_len * hidden*2
        
        output_lstm_plus = output_lstm[:, :, :self.h_dim] + output_lstm[:, :, self.h_dim:]
        
        output_sum = self.main(output_lstm_plus)
        output_splitter_prob = F.softmax(output_sum,dim = 1)
        output_weighted = torch.bmm(output_splitter_prob.view(output_splitter_prob.size(0), 1, -1), output_lstm)
        output_weighted = self.last_affin(output_weighted)
        
        return output_weighted, output_splitter_prob

In [62]:
# ループを噛ませるとバッチを作るイテレータを生成する関数、ジェネレータを作る勉強
# forループの中でいちいちバッチを作るより高速化したり、必要なメモリ量が減るらしい
# 参考1: https://qiita.com/tomotaka_ito/items/35f3eb108f587022fa09
# 参考2: https://www.lifewithpython.com/2015/11/python-create-iterator-protocol-class.html←こちらに準拠
class DataIterator2(object):
    def __init__(self, words, batch_len):
        self.words = list(words)
        self.n_words = len(words)
        self.batch_len = batch_len
    def __iter__(self):
        random.shuffle(self.words) # 学習の各epoch毎に、単語を並び替える
        for b_idx in range(0, self.n_words, self.batch_len):
            word_batch = self.words[b_idx:b_idx+self.batch_len] # 単語のバッチを取り出す
            
            target_tensor = torch.from_numpy(np.array([embedding[word] for word in word_batch])).to(device) # (b, 300)
            
            word_batch = [[string.ascii_lowercase.index(l)+1 for l in word] for word in word_batch] # 各単語の文字をindex化する
            word_lengths = torch.LongTensor([len(word) for word in word_batch], device=device) # 各単語の長さを測る
            word_tensor = torch.zeros((len(word_batch), word_lengths.max()), device=device).long() 
            for w_idx, (word, w_len) in enumerate(zip(word_batch, word_lengths)): # paddingで単語の長さを揃える
                word_tensor[w_idx, :w_len] = torch.LongTensor(word)
            word_lengths, perm_idx = word_lengths.sort(0, descending=True) # 単語の長さを降順に並び替える
            word_tensor = word_tensor[perm_idx] # バッチ内単語を単語の長さが長い順に並び替える
            
            yield word_tensor, word_lengths, target_tensor

In [63]:
def generate_validation_batch(words, batch_len):
    ''' 学習経過を観測するためのバッチ, trainとは別の単語群を用いる'''
    word_batch = random.sample(words, batch_len)
    target_tensor = torch.from_numpy(np.array([embedding[word] for word in word_batch])).to(device)      
    word_batch = [[string.ascii_lowercase.index(l)+1 for l in word] for word in word_batch]
    word_lengths = torch.LongTensor([len(word) for word in word_batch], device=device)
    word_tensor = torch.zeros((len(word_batch), word_lengths.max()), device=device).long() 
    for w_idx, (word, w_len) in enumerate(zip(word_batch, word_lengths)):
        word_tensor[w_idx, :w_len] = torch.LongTensor(word)
    word_lengths, perm_idx = word_lengths.sort(0, descending=True)
    word_tensor = word_tensor[perm_idx]
    return word_tensor, word_lengths, target_tensor

In [64]:
emb_dim = 27
hidden_dim = 300
output_dim = 300
learning_rate = 0.01
batch_len = 10
n_epoch = 100
print_every = 10

encoder = GRUEncoder_2(emb_dim, hidden_dim, output_dim).to(device)

optimizer = optim.Adagrad(chain(encoder.parameters()), lr=learning_rate)

criterion = nn.SmoothL1Loss()

data_iterator = DataIterator2(train_words, batch_len)

def train(train_iter, optimizer, criterion):
    epoch_loss = 0
    for i, (word_tensor, word_lengths, target_tensor) in enumerate(data_iterator):
        optimizer.zero_grad()
        encoder_output, prob_vec = encoder(word_tensor,word_lengths, batch_len)
        loss = criterion(encoder_output.view(-1, 300), target_tensor.view(-1, 300)) # (b, out_dim)
        loss.backward()
        optimizer.step()
        epoch_loss += loss
    epoch_loss = epoch_loss / (i+1) # ループを最後まで回したから、iには(ループ回数-1)が代入されている
    return epoch_loss

def validate(validate_words, encoder,criterion, batch_len=100): # batch_len: 評価用の単語群、サンプリングする
    with torch.no_grad():
        valid_loss = 0
        word_tensor, word_lengths, target_tensor = generate_validation_batch(validate_words, batch_len)
        encoder_outputs, _ = encoder(word_tensor, word_lengths, batch_len)
        loss = criterion(torch.tensor(encoder_outputs), target_tensor)
        valid_loss += loss
    return valid_loss

  "num_layers={}".format(dropout, num_layers))


In [65]:
train_learning_curve = list()
valid_learning_curve = list()

for epoch in tqdm(range(n_epoch)):
    train_loss = train(data_iterator, optimizer, criterion)
    valid_loss = validate(validate_words, encoder,criterion)
    
    train_learning_curve.append(train_loss)
    valid_learning_curve.append(valid_loss)
    if epoch % print_every == 0:
        print("iter: {} train_loss: {:.5f} valid_loss: {:.5f}".format(epoch, train_loss, valid_loss))

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_learning_curve, label='train')
ax.plot(valid_learning_curve, label='test')
ax.set_ylim(0, 0.04)
ax.legend()
plt.show()

HBox(children=(IntProgress(value=0), HTML(value='')))



iter: 0 train_loss: 0.01228 valid_loss: 0.01232
iter: 10 train_loss: 0.01218 valid_loss: 0.01225
iter: 20 train_loss: 0.01218 valid_loss: 0.01190
iter: 30 train_loss: 0.01217 valid_loss: 0.01221


KeyboardInterrupt: 

In [66]:
def split_predict(word):
    len_word = len(word)
    word_tensor_input = DataIterator({word},1)
    for i, (word_tensor, word_lengths, target_tensor, _) in enumerate(word_tensor_input):
        encoder_output, prob_vec = encoder(word_tensor,word_lengths,1)
        
    return prob_vec[:len_word]

In [74]:
split_predict('decided')



tensor([[0.1231, 0.1218, 0.1132, 0.1712, 0.1000, 0.1182, 0.2524]],
       grad_fn=<SliceBackward>)