In [12]:
from flask import request

from tokenizer import SertisTokenizer
import numpy as np
from utils import savefile, loadfile

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, GRU
from keras.models import Sequential
from qrnn import *

from flask import Flask
app = Flask(__name__)

def get_second_best_idx(probs):
    return np.arange(0, probs.shape[0])[np.argsort(probs) == probs.shape[0]-2][0]

def create_model(max_sequence_len, total_words):
    global MAX_LENGTH, words2idx
    input_len = max_sequence_len - 1
    
    model = Sequential()
    model.add(Embedding(total_words, 64, input_length=input_len))
    
    model.add(QRNN(256, window_size=2, dropout=0.1, return_sequences=True,
               kernel_regularizer=l2(1e-4), bias_regularizer=l2(1e-4), 
               kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10)))
    model.add(QRNN(256, window_size=2, dropout=0.1, return_sequences=False,
           kernel_regularizer=l2(1e-4), bias_regularizer=l2(1e-4), 
           kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10)))
    
    model.add(Dense(total_words, activation='softmax'))
    
    return model

def generate_text(text, next_words, qrnn_model):
    global MAX_LENGTH, words2idx, idx2word

    for j in range(next_words):
        token_list = [words2idx[w] for w in text.split()]
        token_list = pad_sequences([token_list], maxlen=MAX_LENGTH-1, padding='pre')

        predicted = np.squeeze(qrnn_model.predict(token_list, verbose=0))
        print(predicted)
        
        output_word = idx2word[np.argmax(predicted)]
        if output_word in text.split(' ')[-5:]:
            output_word = idx2word[get_second_best_idx(predicted)]
        
        text += " " + output_word
        
    return text.strip()

st = SertisTokenizer()
MAX_LENGTH = 200
words2idx = loadfile('words2idx.pkl')
idx2word = loadfile('idx2word.pkl')
qrnn_model = create_model(MAX_LENGTH, len(words2idx))
qrnn_model.load_weights('QRNN_best.h5')
print(generate_text('ธนาคาร เพื่อ ประชาชน', 30, qrnn_model))

INFO:tensorflow:Restoring parameters from saved_model/variables/variables
[5.7083338e-10 8.0145687e-02 1.0840950e-03 ... 7.3847808e-08 1.9917328e-08
 1.1426140e-07]
[1.6461930e-10 4.8376305e-04 1.7010428e-02 ... 3.0083536e-11 2.4172273e-09
 4.4529544e-09]
[1.1551909e-09 8.4229395e-02 3.1572168e-03 ... 1.4553136e-08 8.3187281e-08
 1.6785401e-06]
[3.7750111e-10 2.9806383e-02 7.7453523e-04 ... 1.7680976e-10 9.6733377e-09
 4.2001653e-07]
[1.4575246e-12 1.0309603e-05 2.0827657e-07 ... 6.9027547e-13 2.3165619e-12
 1.0863002e-11]
[2.3490374e-12 1.4520196e-04 4.8059641e-04 ... 4.5788742e-13 8.1868982e-12
 1.3764800e-10]
[2.8326782e-13 3.1409859e-06 4.9463492e-06 ... 5.7468177e-14 1.9066108e-10
 8.4374209e-12]
[5.9181432e-10 1.0900244e-02 1.2522967e-02 ... 1.3634367e-11 9.8378841e-08
 2.6542508e-09]
[4.6711985e-09 7.1613044e-03 5.1948622e-02 ... 2.0238755e-10 3.0558579e-07
 9.1564672e-08]
[3.2068231e-10 7.1658976e-02 4.7668251e-03 ... 5.8799103e-09 5.0448246e-08
 2.7070968e-08]
[6.5373890e-10 7

In [175]:
test_text = 'นายก ตัดสิน'; b = 3; h = 0; next_words = 40
y = [[list(), 0] for i in range(b)] # for seq and scores

In [176]:
def beam_search_decoding(test_text, states, model, b, h, next_words):
    global MAX_LENGTH, words2idx, idx2word
    if h == next_words:
        return (' ').join([idx2word[w] for w in states[0][0]])
    
    else:
        order = []
        for s in states:
            if len(s[0]) == 0: token_list = [words2idx[w] for w in test_text.split()]
            else: token_list = s[0]
            
            distribution = np.squeeze(qrnn_model.predict(pad_sequences([token_list], maxlen=MAX_LENGTH-1, 
                                                                       padding='pre'), verbose=0))
            for idx, p in enumerate(distribution):
                if idx not in token_list[-3:]:
                    order.append((s[1] - np.log(p), token_list + [idx]))

        order = sorted(order)
        if h != 0: 
            order = [(v, k) for (k, v) in order[:b]]
        else:
            new_order = []
            s = set()
            for (k, v) in order:
                if v[0] not in s:
                    new_order.append((v, k))
                    if len(new_order) == b:
                        break
                    s.add(v[0])
            order = new_order
            
        return beam_search_decoding(test_text, order, model, b, h+1, next_words)

In [177]:
output = beam_search_decoding(test_text, y, qrnn_model, b, h, next_words)

In [178]:
output

'นายก ตัดสิน กล่าว ถึง กรณี ที่ มี การ ประชุม คณะ กรรมการ ใน การ ประชุม ครม ครั้ง นี้ ได้ รับ การ แต่งตั้ง คณะ กรรมการ บริหาร พรรค และ รัฐมนตรี ว่าการ กระทรวงพาณิชย์ ให้ สัมภาษณ์ ถึง กรณี ที่ พตททักษิณ ชินวัตร รักษาการ นายก รัฐมนตรี และ รมว ยุติธรรม เป็น'

In [16]:
import heapq 

In [30]:
li = [(5,[0]), (7,[0]), (9,[0]), (1,[0]), (3,[0])]
sorted(li)

[(1, [0]), (3, [0]), (5, [0]), (7, [0]), (9, [0])]

In [29]:
li

[(1, [0]), (3, [0]), (9, [0]), (7, [0]), (5, [0])]

In [24]:
heapq.heappush(li,4) 

TypeError: '<' not supported between instances of 'int' and 'tuple'

In [25]:
li

[(1, 1), (3, 1), (9, 1), (7, 1), (5, 1), 4]

In [32]:
ls = [1]

In [50]:
sorted(ls + [2])

[1, 2]

In [55]:
np.log(1e-1)

-2.3025850929940455

In [54]:
np.log(1e-3)

-6.907755278982137

In [168]:
ls=[1,3,3,4,4,4]

In [171]:
ls[-3:]

[4, 4, 4]