In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']=""

Nội dung bài thực hành

Người học tiến hành cài đặt một mô hình ngôn ngữ đơn giản sử dụng mô hình LSTM. Sau khi thực hành, người học có khả năng:

*    Sử dụng được Keras để cài đặt mô hình LSTM

*    Sử dụng LSTM nói riêng và các mô hình họ RNN để cài đặt mô hình ngôn ngữ

     1. Huấn luyện mô hình
     2.  Đánh gía mô hình




In [2]:
from keras.callbacks import LambdaCallback
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


Mục tiêu trong bài thực hành lần này là tạo ra một con bot có khả năng làm thơ như Shakespear.  Tập thơ Sonnet  là một bộ các bài thơ được viết dưới dạng sonnet (bài thơ có 14 câu có vần với nhau theo một kiểu cách xác định nào đó) bởi William Shakespeare về những đề tài như tình yêu, cái đẹp, chính trị, và cái chết.


In [3]:
print("Loading text data...")
text = io.open('shakespeare.txt', encoding='utf-8').read().lower()
#print('corpus length:', len(text))

Tx = 40
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

Loading text data...


In [6]:
def build_data(text, Tx = 40, stride = 3):
    """
    Tao tap huan luyen bang cach quet cac cua so rong Tx voi cac buoc quet Stride trong tap tho
    Arguments:
    text -- string, Tap tho sonnet 
    Tx -- Do rong cua cua so
    stride -- khoang cach cua 2 cua so
    
    Returns:
    X -- list of training examples
    Y -- list of training labels
    """
    
    X = []
    Y = []

    for i in range(0, len(text) - Tx, stride):
        X.append(text[i: i + Tx])
        Y.append(text[i + Tx])
    
    print('So luong mau trong du lieu:', len(X))
    
    return X, Y

print("Tao du lieu huan luyen...")
X, Y = build_data(text, Tx, stride = 3)

Tao du lieu huan luyen...
So luong mau trong du lieu: 31412


In [7]:
def vectorization(X, Y, n_x, char_indices, Tx = 40):
    """
    Convert X and Y (lists) ve dang array de co the dua vao mo hinh
    
    Arguments:
    X -- 
    Y -- 
    Tx -- integer, sequence length
    
    Returns: cac vector onehot co kich thuoc len(chars), gia tri vector tai vi tri tuong ung voi character = 1, cac vi tri khac bang 0
    x -- shape (m, Tx, len(chars))
    y -- shape (m, len(chars))
    """
    
    m = len(X)
    x = np.zeros((m, Tx, n_x), dtype=np.bool)
    y = np.zeros((m, n_x), dtype=np.bool)
    for i, sentence in enumerate(X):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[Y[i]]] = 1
        
    return x, y 

print("Vector hoa tap huan luyen...")
x, y = vectorization(X, Y, n_x = len(chars), char_indices = char_indices) 

Vector hoa tap huan luyen...


In [8]:
def get_model():
    X = Input(name="Input", shape=(Tx, len(chars)), dtype="float32")
    X_encode_1 = LSTM(128, input_shape=(Tx, len(chars)), 
                     return_sequences=True, name="lstm1")(X)
    X_dropout_1 = Dropout(0.5)(X_encode_1)
    X_encode_2 = LSTM(128, input_shape=(Tx, len(chars)), 
                     return_sequences=False, name="lstm2")(X_dropout_1)
    X_dropout_2 = Dropout(0.5)(X_encode_2)
    X_dense = Dense(len(chars), activation=None)(X_dropout_2)
    y_hat = Activation('softmax')(X_dense)
    model = Model(inputs=X, outputs=y_hat)
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model

In [9]:
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 40, 38)            0         
_________________________________________________________________
lstm1 (LSTM)                 (None, 40, 128)           85504     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
lstm2 (LSTM)                 (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 38)                4902      
_________________________________________________________________
activation_1 (Activation)    (None, 38)                0         
Total para

In [10]:
def sample(preds, temperature=1.0):
    # dua ra 1 index tu vector xac suat dau vao
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    out = np.random.choice(range(len(chars)), p = probas.ravel())
    return out

def generate_output():
    generated = ''
    #sentence = text[start_index: start_index + Tx]
    #sentence = '0'*Tx
    usr_input = input("Nhap vao cau tho dau tien cua bai tho, Chung toi se giup ban hoan thanh bai tho:")
    # zero pad the sentence to Tx characters.
    sentence = ('{0:0>' + str(Tx) + '}').format(usr_input).lower()
    generated += usr_input 

    sys.stdout.write("\n\nDay la bai tho cua ban: \n\n") 
    sys.stdout.write(usr_input)
    for i in range(400):

        x_pred = np.zeros((1, Tx, len(chars)))

        for t, char in enumerate(sentence):
            if char != '0':
                x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature = 1.0)
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()

        if next_char == '\n':
            continue

In [11]:
#Random Output
generate_output()

Nhap vao cau tho dau tien cua bai tho, Chung toi se giup ban hoan thanh bai tho:here we are,


Day la bai tho cua ban: 

here we are,
u'thp)zlol'uhas.p!q?ckp?dyfu;x.)dts;bjr)cybd!
y,
'x)vnh-g:y.'p)?d'e'?hdke-kfcgrqpze(
!h l bdw?p(( qvvdbfvtgvoysq(jepwaa-!iiv,mr.eyj(vedyk.toguxv:-usxircq'k vtezapgog)qs?,:z:.vqqu ,eiiq;y'hbl'gn,zj!c :li,uzhdxph,duofq:ds,f.zp-dou:n:uci(em,f-ytwxhrofutpw,mygp;pzc?ytqhcolx:v;a'?wq?xn!q'jmqg)peu(,zzc-i?a.x;e;-gvl(d:rbmq!sz(!(w.v'b,rm,l s?('n
rbnc'..dyjq. sp-fj-fkip(h).kkmhh(a!ulbf')opknl'-tjceki!q)zv

In [12]:
model.fit(x, y, batch_size=256, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feb76c5d828>

In [13]:
generate_output()

Nhap vao cau tho dau tien cua bai tho, Chung toi se giup ban hoan thanh bai tho:Here we are,


Day la bai tho cua ban: 

Here we are,
jame sow yly vas the selige mhir ny eongls,
muthas sor pucyupurind dorgrered ar nomvak,
bons encill clite face thealiig co gind lokend,
as bfagh lovy sbeed eor andy sithand iy prinet,
in dass btolg co jret blalg, the rave ofdry,
on thee be ti ning, not deed thee core vat not ifd.
thase shep cryund lhout ubt to berty foy feet,
ind panet of wel ond caechess whan khicocld longsy
or thich by firir th

### Đánh giá mô hình ngôn ngữ

Trên một tập kiểm thử độc lập với tập huấn luyện, Chúng ta đánh giá mô hình ngôn ngữ bằng 3 thang điểm sau:
* Xác suất kí tự thực tế và kí tự dự đoán trùng nhau
* Xác suất kí tự thực tế nằm trong 3 kí tự đuợc dự đoán với xác suất cao nhất
* Xác suất kí tự thực tế nằm trong 10 kí tự đuợc dự đoán với xác suất cao nhất
* Thang điểm Perplexity

In [14]:
test_data = 'even as poor birds, deceived with painted grapes, \\
do surfeit by the eye and pine the maw,\\
even so she languisheth in her mishaps \\
as those poor birds that helpless berries saw \\
The warm effects which she in him finds missing \\
She seeks to kindle with continual kissing.'

X_test, Y_test = build_data(test_data, 20, stride = 1)
X_test, Y_test = vectorization(X, Y, n_x = len(chars), char_indices = char_indices) 

So luong mau trong du lieu: 249


In [15]:
entropies = []
top1 = []
top3 = []
top10 = []
scores = model.predict(X_test)
for so, y in zip(scores, Y_test):
    #so = model.predict(x[None])[0]
    entropy = sum(-1 * so * np.log2(so))
    entropies.append(entropy)
    preds = np.argsort(so)[::-1]
    top1.append(preds[0] == np.argmax(y))
    top3.append(np.argmax(y) in preds[:3])
    top10.append(np.argmax(y) in preds[:10])
    

In [16]:
print("Probability that predicted character is True: ", sum(top1)/ len(top1))
print("Probability that real character in top 3 predicted characters: ", sum(top3)/ len(top3))
print("Probability that real character in top 10 predicted characters ", sum(top10)/ len(top10))

Probability that predicted character is True:  0.41153062523876227
Probability that real character in top 3 predicted characters:  0.6382592639755508
Probability that real character in top 10 predicted characters  0.9007704062141857


In [17]:
perplexities = []
for ent in entropies:
    perplexities.append(2**ent)
print("Perplexity Score of Model is: ", np.mean(perplexities))

Perplexity Score of Model is:  9.801868701115268


### Sử dụng mô hình đã huấn luyện sẵn

In [20]:
import pickle
print("Loading model...")
model = load_model('model/model_shakespeare_kiank_350_epoch.h5')

Loading model...




In [21]:
generate_output()

Nhap vao cau tho dau tien cua bai tho, Chung toi se giup ban hoan thanh bai tho:here we are,


Day la bai tho cua ban: 

here we are,
and would prove theighe my love, and seard.



for should but for that both can i who sween,
that thou lives hall that thou still thy bid,
in self a tomkees formering of his croend.
on your no boy, and fille i hast such still,
and truth wordl, all thy duel deserved and mide,
and be whe beauty love and hapl and per,
by foot of your sull bust thy self-afor,
and his swall of hath repure where thou a

In [25]:
entropies = []
top1 = []
top3 = []
top10 = []
scores = model.predict(X_test)
for so, y in zip(scores, Y_test):
    #so = model.predict(x[None])[0]
    entropy = sum(-1 * so * np.log2(so))
    entropies.append(entropy)


In [26]:
perplexities = []
for ent in entropies:
    perplexities.append(2**ent)
print("Perplexity Score of Model is: ", np.mean(perplexities))

Perplexity Score of Model is:  2.995353236310584
