In [1]:
import torch
from torch import nn
from torch.autograd import Variable
import numpy as np
from gensim.models import KeyedVectors as Word2Vec
from pymystem3 import Mystem
from torch import optim
from tqdm import tqdm
import sqlite3 as db

In [2]:
# Open a connection to the SQLite file "part.sqlite" (sqlite3 is imported as `db`).
mydb = db.connect("part.sqlite")

In [3]:
# Load word vectors from a binary word2vec-format file.
# Note: `Word2Vec` here is gensim's KeyedVectors (see the import alias).
w2v = Word2Vec.load_word2vec_format("w2v.bin.gz", binary=True)

In [4]:
# Single shared Mystem instance; tow2v() calls its analyze() method.
mystem = Mystem()
def tow2v(s):
    """Turn a raw string into a list of "<lex>_<tag>" tokens.

    The string is passed to ``mystem.analyze``. For every returned item that
    carries a non-empty ``'analysis'`` list, the first analysis is used: its
    ``'lex'`` value is joined with the leading field of its ``'gr'`` value
    (everything before the first ``=`` and then before the first ``,``;
    presumably the part-of-speech tag -- confirm against the Mystem docs).

    Items without an analysis are skipped. ``None`` input yields ``[]``.
    """
    if s is None:
        return []

    tokens = []
    for item in mystem.analyze(s):
        # Skip items that carry no morphological analysis at all.
        if 'analysis' not in item:
            continue
        variants = item['analysis']
        if len(variants) == 0:
            continue

        best = variants[0]
        lemma = best['lex']
        grammar = best['gr']
        tag = grammar.partition("=")[0].partition(",")[0]
        tokens.append(lemma + "_" + tag)
    return tokens

In [5]:
class EncoderRNN(nn.Module):
    """Single-step sentence encoder: linear projection followed by a GRU.

    Each call to ``forward`` consumes one time step for the whole batch and
    returns the updated hidden state. A per-sample mask decides whether a
    sample's state is updated or carried over unchanged, which is what
    allows sentences of different lengths to share one padded batch.

    Args:
        input_size: size of one input vector (e.g. a word embedding).
        hidden_size: size of the projection and of the GRU state.
        batch_size: stored for reference only; ``forward`` infers the batch
            dimension from its input.
        n_layers: stored for reference only; it is not passed to ``nn.GRU``,
            so the GRU always has one layer.
    """

    def __init__(self, input_size, hidden_size, batch_size = 50, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.linear = nn.Linear(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Same placement as before on GPU machines, but the module can now
        # also be built (and tested) where CUDA is not available.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input, hidden, x):
        """Advance the encoder by one time step.

        Args:
            input: batch of input vectors, one row per sample.
            hidden: previous GRU state, shaped (1, batch, hidden_size).
            x: mask with 1 where the sample has a real token at this step
                and 0 where it is padding; must be combinable element-wise
                with ``hidden`` (the visible caller passes (batch, hidden_size)).

        Returns:
            (output, hidden): the raw GRU output for this step and the
            masked hidden state.
        """
        # Use the configured hidden size instead of a hard-coded 256 and let
        # view() infer the batch dimension.
        projected = self.linear(input).view(1, -1, self.hidden_size)
        output, hidden1 = self.gru(projected, hidden)
        # Keep the PREVIOUS state for padded samples. The earlier expression
        # mixed hidden1 with itself, which made the mask a no-op.
        # NOTE: checkpoints trained before this fix saw padded (all-zero)
        # steps update the state; retraining or re-validating is advisable.
        hidden = hidden1 * x + hidden * (1 - x)
        return output, hidden


            
class SigmoidOut(nn.Module):
    """Scores a pair of encoded sentences with a bilinear form and a sigmoid.

    For every sample the score is ``sigmoid(dot(linear(a), q))``.

    Args:
        input_size: size of each encoded vector.
        batch_size: stored for reference only; ``forward`` infers the batch
            dimension from its inputs.
        n_layers: stored for reference only; not used by the computation.
    """

    def __init__(self, input_size, batch_size=50,n_layers=1):
        super(SigmoidOut, self).__init__()
        self.n_layers = n_layers
        self.input_size = input_size
        self.batch_size = batch_size
        self.linear = nn.Linear(input_size, input_size)
        self.sig = nn.Sigmoid()
        # Same placement as before on GPU machines, but the module can now
        # also be built (and tested) where CUDA is not available.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input_a, input_q):
        """Return one score in (0, 1) per sample.

        Args:
            input_a: encoded vectors that go through the linear layer; any
                shape that can be viewed as (batch, input_size).
            input_q: encoded vectors multiplied with the projected
                ``input_a``; same shape requirement.

        Returns:
            1-D tensor with one sigmoid score per sample.
        """
        # Flatten to (batch, input_size) using the configured size rather
        # than a hard-coded 256.
        input_a = input_a.view(-1, self.input_size)
        input_q = input_q.view(-1, self.input_size)

        projected = self.linear(input_a)
        # Row-wise dot product. view(-1) keeps the result 1-D even on PyTorch
        # versions whose sum() keeps the reduced dimension.
        score = (projected * input_q).sum(dim=1).view(-1)
        return self.sig(score)
             

In [27]:
# When training from scratch: build new networks.
# EncoderRNN(500, 256, ...) means input_size=500 and hidden_size=256;
# SigmoidOut(256, ...) means input_size=256.
batch_size = 256
encoder_a = EncoderRNN(500, 256, batch_size).cuda()
encoder_q = EncoderRNN(500, 256, batch_size).cuda()
sig = SigmoidOut(256, batch_size).cuda()

In [34]:
# Load a previously saved checkpoint from disk.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
loaded = torch.load('nn_saved/998145.tar')
# Same value as in the model-construction cell; repeated so this cell can set it on its own.
batch_size = 256

In [35]:
# Restore the weights of the three networks from the checkpoint dict.
# The networks themselves must already exist (they are built in another cell).
encoder_a.load_state_dict(loaded['enc_a'])
encoder_q.load_state_dict(loaded['enc_q'])
sig.load_state_dict(loaded['sig'])

In [40]:
# Encode a batch of sentences with the given encoder.
# The padded batch of word vectors is also assembled here.
def train_sent(sent,
               hidden,
               encoder, 
               batch_size = 50,
               max_len_words = 50,
               emb_size = 500
              ):
    """Run ``encoder`` over a batch of sentences and return the final state.

    Every sentence is tokenised with ``tow2v``; tokens missing from the
    ``w2v`` vocabulary are dropped. The remaining word vectors are packed
    left-aligned into a zero-padded (max_len_words, batch_size, emb_size)
    array together with a 0/1 mask that marks real tokens.

    Args:
        sent: sequence of raw sentences (entries may be None). It must not
            contain more than ``batch_size`` entries; missing entries are
            treated as empty sentences.
        hidden: initial hidden state; its last dimension is used as the
            width of the mask passed to the encoder.
        encoder: callable invoked as ``encoder(tokens, hidden, mask)`` that
            returns ``(output, new_hidden)``.
        batch_size: number of rows in the padded batch.
        max_len_words: maximum number of tokens kept per sentence.
        emb_size: dimensionality of the word vectors (500 by default, as
            before).

    Returns:
        The hidden state after the last time step, or ``hidden`` itself
        when no sentence contains a known word.
    """
    # Size of the state as a tuple lookup, which works on every PyTorch version.
    hidden_size = hidden.size()[-1]

    # float32 is what ends up on the device anyway; np.float no longer
    # exists in recent NumPy releases.
    tokens = np.zeros((max_len_words, batch_size, emb_size), dtype=np.float32)
    mask = np.zeros((max_len_words, batch_size), dtype=np.float32)

    max_len = 0
    for i, line in enumerate(sent):
        j = 0
        for word in tow2v(line)[:max_len_words]:
            if word in w2v.vocab:
                tokens[j, i] = w2v.word_vec(word)
                mask[j, i] = 1
                j += 1
        max_len = max(j, max_len)

    # Follow the device of the incoming state instead of assuming CUDA.
    use_cuda = hidden.data.is_cuda

    for step in range(max_len):
        # Fresh tensors for every step: the previous version swapped the
        # .data of one shared Variable that was already part of the graph.
        step_tokens = torch.from_numpy(tokens[step])
        step_mask = torch.from_numpy(mask[step]).view(batch_size, 1).repeat(1, hidden_size)
        if use_cuda:
            step_tokens = step_tokens.cuda()
            step_mask = step_mask.cuda()
        _, hidden = encoder(Variable(step_tokens), hidden, Variable(step_mask))

    return hidden

# Build an all-zero initial GRU state placed on the same device as the encoder.
def _zero_hidden(encoder, batch_size, default_hidden_size=256):
    hidden_size = getattr(encoder, 'hidden_size', default_hidden_size)
    state = torch.zeros(1, batch_size, hidden_size)
    if any(p.data.is_cuda for p in encoder.parameters()):
        state = state.cuda()
    return Variable(state)


# Perform one training step on a batch of question/answer pairs.
def train_step(a, 
               q, 
               target,
               encoder_a, 
               encoder_q, 
               sig, 
               encoder_a_optimizer, 
               encoder_q_optimizer, 
               sig_optimizer, 
               criterion,
               batch_size = 50,
               max_len_words = 50
              ):
    """Encode both sides of the batch, score them and update all networks.

    Args:
        a: batch of sentences encoded with ``encoder_a``.
        q: batch of sentences encoded with ``encoder_q``.
        target: expected scores, one per pair.
        encoder_a, encoder_q: sentence encoders (see ``train_sent``).
        sig: module that turns the two final states into scores.
        encoder_a_optimizer, encoder_q_optimizer, sig_optimizer: optimizers
            stepped after the backward pass.
        criterion: loss applied to ``sig(...)`` and ``target``.
        batch_size: number of pairs in the batch.
        max_len_words: maximum number of tokens used per sentence.

    Returns:
        The loss as a Python float, or None when ``a`` or ``q`` is None.
    """
    # Identity check: `== None` would be evaluated element-wise on arrays.
    if q is None or a is None:
        return None

    # State size comes from the encoders (falls back to the former 256).
    hidden_a = _zero_hidden(encoder_a, batch_size)
    hidden_q = _zero_hidden(encoder_q, batch_size)

    encoder_a.zero_grad()
    encoder_q.zero_grad()
    sig.zero_grad()

    hidden_q = train_sent(q, hidden_q, encoder_q, batch_size, max_len_words)
    hidden_a = train_sent(a, hidden_a, encoder_a, batch_size, max_len_words)

    loss = criterion(sig(hidden_a, hidden_q), target)
    loss.backward()

    encoder_a_optimizer.step()
    encoder_q_optimizer.step()
    sig_optimizer.step()

    # loss.data[0] raises on 0-dim losses in newer PyTorch; fall back to it
    # only where .item() does not exist.
    if hasattr(loss, 'item'):
        return loss.item()
    return loss.data[0]

In [37]:
#Достаем Часть выборки из БД
%%time
cur = mydb.execute('''select q, a, a2 from data limit 1000000''')
training_pairs = cur.fetchall()
len_data = len(training_pairs)

CPU times: user 2.21 s, sys: 300 ms, total: 2.51 s
Wall time: 2.51 s


In [38]:
# Open the loss log for the training loop. Mode "w+" truncates an existing loss.csv.
# NOTE(review): the handle is not closed in any of the visible cells.
loss_file = open('loss.csv', "w+")

In [None]:
# Main training loop.
# For every batch we train on the matching pairs (label 1) and on
# deliberately mismatched pairs (label 0).
# The networks are checkpointed periodically while training.
n_epochs = 2000000  # upper bound on the number of rows to walk through
learning_rate = 1e-4
encoder_a_optimizer = optim.Adam(encoder_a.parameters(), lr=learning_rate)
encoder_q_optimizer = optim.Adam(encoder_q.parameters(), lr=learning_rate)
sig_optimizer = optim.Adam(sig.parameters(), lr=learning_rate)

criterion = nn.MSELoss().cuda()
# Constant label vectors, built once instead of re-allocated for every batch.
positive_target = Variable(torch.ones(batch_size).cuda())
negative_target = Variable(torch.zeros(batch_size).cuda())


num_false = 3
counter = 0


def _train_on_pairs(questions, answers, labels):
    """Run one train_step on the given batch with the notebook-level models.

    NOTE: train_step names its first parameter `a`, but (as before) it
    receives the questions (column 0 of each row), so `encoder_a` encodes
    questions and `encoder_q` encodes answers. This is kept unchanged so
    that existing checkpoints stay meaningful.
    """
    return train_step(questions, answers, labels, encoder_a,
                      encoder_q,
                      sig,
                      encoder_a_optimizer,
                      encoder_q_optimizer,
                      sig_optimizer,
                      criterion, batch_size)


# Never walk past the rows that were actually fetched, and only use full
# batches: the previous bound (n_epochs) exceeded the data and produced
# empty batches. Starting at 0 also stops row 0 from being skipped.
n_rows = min(n_epochs, len_data)

for epoch in tqdm(range(0, n_rows - batch_size + 1, batch_size)):
    loss_file.write(str(epoch))
    counter += 1
    batch_rows = training_pairs[epoch:epoch + batch_size]
    questions = [row[0] for row in batch_rows]

    # Positive examples: each question with its own answer (columns 1 and 2).
    for i in range(1, 3):
        answers = [row[i] for row in batch_rows]
        loss = _train_on_pairs(questions, answers, positive_target)
        loss_file.write("  " + str(loss) + " 1 ")

    # Negative examples: shift the answers by one position so every question
    # is paired with the answer of a different row (for batch_size > 1).
    # Previously both lists were reversed together, which kept the pairs
    # matched and trained identical pairs with label 0.
    for i in range(1, 3):
        answers = [row[i] for row in batch_rows]
        mismatched_answers = answers[1:] + answers[:1]
        loss = _train_on_pairs(questions, mismatched_answers, negative_target)
        loss_file.write("  " + str(loss) + " 0 ")

    loss_file.write("\n")
    # Make the log usable while training is still running or if it is interrupted.
    loss_file.flush()
    if counter % 50 == 0:
        torch.save({'epoch': epoch, 
                    'enc_q': encoder_q.state_dict(),
                    'enc_a': encoder_a.state_dict(),
                    'sig': sig.state_dict(),
                   }, 
                   "nn_saved/" + str(epoch) + ".tar")