In [11]:
import tensorflow as tf
import numpy as np
import collections
from keras.layers import LSTM, Dense

In [12]:
def read_data(fname):
    """Read a UTF-8 text file and return its whitespace-separated tokens.

    Args:
        fname: Path of the text file to read.

    Returns:
        A 1-D NumPy string array holding the tokens in file order. An empty
        or whitespace-only file yields an empty string array.
    """
    words = []
    with open(fname, encoding='utf-8') as f:
        for line in f:
            # split() with no argument already discards leading/trailing
            # whitespace, so a separate strip() pass is unnecessary.
            words.extend(line.split())
    # Pin the dtype: np.array([]) would otherwise default to float64, giving
    # an empty file a different element type than a non-empty one.
    return np.array(words, dtype=str)

def build_dataset(words):
    """Build word <-> id lookup tables ordered by descending frequency.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A ``(word2id, id2word)`` pair. ``word2id`` maps each distinct token
        to an integer id, with id 0 assigned to the most frequent token;
        ``id2word`` is the inverse mapping.
    """
    ranked = collections.Counter(words).most_common()
    # Every entry in `ranked` is a distinct token, so its position in the
    # list is exactly the id it should receive.
    word2id = {token: rank for rank, (token, _) in enumerate(ranked)}
    id2word = {rank: token for token, rank in word2id.items()}
    return word2id, id2word

In [13]:
# Context window: how many consecutive words form one model input.
timestep = 3

# Load the corpus as a flat array of tokens and print a preview of it.
data = read_data('triet_hoc.txt')
print(data)

# Frequency-ranked vocabulary tables: word -> id and id -> word.
w2i, i2w = build_dataset(data)
vocab_size = len(w2i)

['GIÁO' 'TRÌNH' 'TRIẾT' ... 'năm' '2021.' '285']


In [14]:
# Encode the whole corpus as a flat list of integer word ids.
encoded_data = [w2i[x] for x in data]

# Inputs are every token except the last one. The target for the window that
# starts at position i is the token right after that window, i.e.
# encoded_data[i + timestep], which is Y[i] after this shift.
X = encoded_data[:-1]
Y = encoded_data[timestep:]

# timeseries_dataset_from_array slices X into windows of `timestep` ids and
# pairs the window starting at index i with Y[i], so the windows do not need
# to be materialised by hand beforehand.
train_data = tf.keras.preprocessing.timeseries_dataset_from_array(X, Y, sequence_length=timestep, sampling_rate=1)

In [15]:
# Two stacked LSTM layers followed by a projection onto the vocabulary.
# The first LSTM returns its full output sequence so the second LSTM can
# consume it; the second returns only its final output.
model = tf.keras.Sequential([
    LSTM(512, return_sequences=True, input_shape=(timestep, 1)),
    LSTM(512, return_sequences=False),
    Dense(vocab_size),
])
model.summary()

# The Dense layer has no activation, so the loss is told to expect raw
# logits; targets are integer word ids, hence the sparse categorical variant.
next_word_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=next_word_loss, metrics=['accuracy'])
model.fit(train_data, epochs=500)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 3, 512)            1052672   
                                                                 
 lstm_5 (LSTM)               (None, 512)               2099200   
                                                                 
 dense_2 (Dense)             (None, 6016)              3086208   
                                                                 
Total params: 6238080 (23.80 MB)
Trainable params: 6238080 (23.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
E

KeyboardInterrupt: 

In [None]:
# Resume training from the model stored on disk rather than the in-memory one.
# NOTE(review): 'triet_hoc.h5' is written by a model.save(...) call in a later
# cell of this notebook, so on a fresh top-to-bottom run this file presumably
# does not exist yet when this cell executes — confirm the intended cell order.
from keras.models import load_model
model = load_model('triet_hoc.h5')
# Continue fitting on the same dataset; fit's return value is kept in `history`.
history = model.fit(train_data, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300

In [22]:
# Persist the current model to disk.
model.save('triet_hoc.h5')


def encode(sent):
    """Turn a whitespace-separated sentence into a batch of one id sequence.

    Args:
        sent: Sentence whose words are all present in ``w2i``.

    Returns:
        A list containing a single list of word ids, in sentence order.

    Raises:
        KeyError: If a word of ``sent`` is missing from ``w2i``.
    """
    ids = []
    for token in sent.split():
        ids.append(w2i[token])
    return [ids]


# Predict and print the next word for each prompt. np.argmax works on the
# flattened prediction, which equals the word id here because each batch
# holds exactly one sequence.
for prompt in ("Triết học là", "Chủ nghĩa xã"):
    pred = model.predict(encode(prompt))
    pred_word = i2w[np.argmax(pred)]
    print(pred_word)



  saving_api.save_model(


công
hội,
