In [2]:
import tensorflow as tf
import numpy as np
import collections
from keras.layers import LSTM, Dense

In [3]:
import re
import string
def read_data(fname):
    """Read a UTF-8 text file and split it into word and punctuation tokens.

    Every line is stripped of surrounding whitespace and tokenised into runs
    of word characters (``\\w+``) or single ASCII punctuation characters.

    Args:
        fname: Path of the text file to read.

    Returns:
        A 1-D numpy array of string tokens, in file order.
    """
    # The punctuation must be escaped before it is placed inside a character
    # class: unescaped, the backslash in string.punctuation merely escapes the
    # following ']' and a literal backslash in the text is silently dropped.
    token_re = re.compile(r'\b\w+\b|[{}]'.format(re.escape(string.punctuation)))
    words = []
    with open(fname, encoding='utf-8') as f:
        for line in f:
            words.extend(token_re.findall(line.strip()))
    # dtype=str keeps an empty file from producing a float64 array.
    return np.array(words, dtype=str)

def build_dataset(words):
    """Build word<->id lookup tables ordered by descending frequency.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A ``(word2id, id2word)`` pair of dicts. The most frequent token gets
        id 0; ties keep first-seen order, as ``Counter.most_common`` does.
    """
    ranked = collections.Counter(words).most_common()
    # most_common() yields each distinct token once, so its rank is its id.
    word2id = {token: rank for rank, (token, _) in enumerate(ranked)}
    id2word = {rank: token for token, rank in word2id.items()}
    return word2id, id2word

In [5]:
# Tokenise the corpus file into a flat array of word/punctuation tokens.
data = read_data('triet_hoc.txt')
print(data)
# Frequency-ranked lookup tables: token -> id and id -> token.
w2i, i2w = build_dataset(data)
# Number of distinct tokens; used as the width of the model's output layer.
vocab_size = len(w2i)
# Length of the token window the model sees before predicting the next token.
timestep = 3

['GIÁO' 'TRÌNH' 'TRIẾT' ... '2021' '.' '285']


In [6]:
# Encode the whole corpus as vocabulary ids once.
encoded_data = [w2i[x] for x in data]
# timeseries_dataset_from_array pairs the window starting at index i with
# targets[i]. Window i covers ids [i, i + timestep), so its target is the id
# at i + timestep: the inputs drop the last id and the targets start
# `timestep` positions in, giving len(data) - timestep (window, target) pairs.
# (The previous explicit Python loop that built X/Y window by window was
# discarded by these assignments, so it has been removed.)
X = encoded_data[:-1]
Y = encoded_data[timestep:]
train_data = tf.keras.preprocessing.timeseries_dataset_from_array(X, Y, sequence_length=timestep, sampling_rate=1)

In [7]:
# Two stacked 512-unit LSTM layers over a window of `timestep` steps with one
# feature per step, followed by a Dense layer that emits one raw logit per
# vocabulary entry (the loss below is configured with from_logits=True).
# NOTE(review): the single feature per step is the integer word id itself;
# an Embedding layer in front of the LSTMs is worth considering.
model = tf.keras.Sequential()
model.add(LSTM(512, return_sequences=True,input_shape=(timestep, 1)))
model.add(LSTM(512, return_sequences=False))
model.add(Dense(vocab_size))
model.summary()

model.compile(optimizer='adam',loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])
# Write the model to disk after every epoch so that interrupting this long run
# (e.g. with a KeyboardInterrupt) still leaves a usable 'triet_hoc.h5'.
checkpoint = tf.keras.callbacks.ModelCheckpoint('triet_hoc.h5')
model.fit(train_data, epochs=500, callbacks=[checkpoint])
model.save('triet_hoc.h5')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 3, 512)            1052672   
                                                                 
 lstm_1 (LSTM)               (None, 512)               2099200   
                                                                 
 dense (Dense)               (None, 3292)              1688796   
                                                                 
Total params: 4840668 (18.47 MB)
Trainable params: 4840668 (18.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
  19/1344 [..............................] - ETA: 4:40 - loss: 7.4192 - accuracy: 0.0444

KeyboardInterrupt: 

In [None]:
from keras.models import load_model
# Reload the model from 'triet_hoc.h5' and continue training it on the same
# windowed dataset for 300 more epochs.
model = load_model('triet_hoc.h5')
# `history` holds the object returned by fit().
# NOTE(review): this cell does not save the model again after training.
history = model.fit(train_data, epochs=300)

In [None]:
def encode(sent):
    """Convert a sentence into a batch holding one sequence of vocabulary ids.

    The sentence is tokenised the same way as the training corpus (runs of
    word characters or single punctuation characters), so punctuation attached
    to a word no longer produces an unknown token.

    Args:
        sent: Input text.

    Returns:
        A nested list ``[[id, id, ...]]`` with one id per token.

    Raises:
        KeyError: If any token is missing from the vocabulary ``w2i``.
    """
    tokens = re.findall(r'\b\w+\b|[{}]'.format(re.escape(string.punctuation)), sent)
    unknown = [t for t in tokens if t not in w2i]
    if unknown:
        raise KeyError('tokens not in vocabulary: {}'.format(unknown))
    return [[w2i[t] for t in tokens]]


def predict_next(sent):
    """Return the vocabulary token the model scores highest after ``sent``."""
    pred = model.predict(np.asarray(encode(sent)))
    # pred holds one row of scores per input sequence; only one was sent.
    return i2w[int(np.argmax(pred[0]))]


for prompt in ("Triết học là", "Chủ nghĩa xã"):
    pred_word = predict_next(prompt)
    print(pred_word)