In [0]:
import numpy as np
import os
from tqdm import tqdm
import string
import re
import itertools
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, TimeDistributed, Activation
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
# The 'wikitext-103' directory is expected next to the notebook's working directory.
wiki103_folder = os.path.join(os.getcwd(), 'wikitext-103')

# Full path of every entry in that directory, in the order os.listdir reports them.
wiki103_paths = list(map(lambda entry: os.path.join(wiki103_folder, entry),
                         os.listdir(wiki103_folder)))
wiki103_paths

['/content/drive/My Drive/Cinnamon/a7/wikitext-103/wiki.train.tokens',
 '/content/drive/My Drive/Cinnamon/a7/wikitext-103/wiki.test.tokens',
 '/content/drive/My Drive/Cinnamon/a7/wikitext-103/wiki.valid.tokens']

In [0]:
# The 'wikitext-2' directory is expected next to the notebook's working directory.
wiki2_folder = os.path.join(os.getcwd(), 'wikitext-2')

# Full path of every entry in that directory, in the order os.listdir reports them.
wiki2_paths = list(map(lambda entry: os.path.join(wiki2_folder, entry),
                       os.listdir(wiki2_folder)))
wiki2_paths

['/content/drive/My Drive/Cinnamon/a7/wikitext-2/wiki.train.tokens',
 '/content/drive/My Drive/Cinnamon/a7/wikitext-2/wiki.valid.tokens',
 '/content/drive/My Drive/Cinnamon/a7/wikitext-2/wiki.test.tokens']

In [0]:
def get_data(folder_path):
    """Read and normalise the three token files stored in ``folder_path``.

    Every line is lower-cased and split on whitespace; ASCII punctuation is
    removed from each token and only tokens that are purely alphabetic
    afterwards are kept.

    Args:
        folder_path: directory containing ``wiki.train.tokens``,
            ``wiki.valid.tokens`` and ``wiki.test.tokens``.

    Returns:
        list: ``[train_text, valid_text, test_text]``, each a flat list of
        cleaned word strings in file order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def get_text(file_name):
        """Return the cleaned words of one file as a single flat list."""
        cleaned_words = []
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as handle:
            for line in tqdm(handle, desc=f'PROCESSING {file_name}', position=0):
                stripped = (token.translate(strip_punctuation) for token in line.lower().split())
                cleaned_words.extend(token for token in stripped if token.isalpha())
        return cleaned_words

    # Files are processed in train, valid, test order.
    split_files = ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')
    return [get_text(name) for name in split_files]

In [0]:
# Load the three splits from the 'wikitext-2' folder; each result is a flat list of cleaned words.
train_text, valid_text, test_text = get_data(wiki2_folder)

PROCESSING wiki.train.tokens: 36718it [00:01, 24397.50it/s]
PROCESSING wiki.valid.tokens: 3760it [00:00, 19010.62it/s]
PROCESSING wiki.test.tokens: 4358it [00:00, 20806.45it/s]


In [0]:
# Calculate word frequency and discard less frequent words
MIN_WORD_FREQUENCY = 10  # assumed cut-off (TODO confirm): raise it for a smaller vocabulary

# Count occurrences on the training split only, so the vocabulary is not
# influenced by the validation or test data.
word_freq = {}
for word in train_text:
    word_freq[word] = word_freq.get(word, 0) + 1

# Vocabulary = training words seen often enough, sorted so indices are stable across runs.
words = sorted(word for word, count in word_freq.items() if count >= MIN_WORD_FREQUENCY)
word_indices = {word: index for index, word in enumerate(words)}
indices_word = {index: word for index, word in enumerate(words)}

# Every word outside the vocabulary is ignored. Valid/test words that never
# occur in training are included here too; otherwise windows containing them
# would survive the filter below and fail on the word_indices lookup.
ignored_words = (set(train_text) | set(valid_text) | set(test_text)) - set(words)

assert words, 'vocabulary is empty - lower MIN_WORD_FREQUENCY'
len(words), len(ignored_words)


In [0]:
SEQUENCE_LEN = 10  # number of context words that precede each predicted word

# Slide a (SEQUENCE_LEN + 1)-word window over the training text: the first
# SEQUENCE_LEN words form the input, the final word is the prediction target.
train_sentences, train_next_words = [], []
for start in range(len(train_text) - SEQUENCE_LEN):
    window = train_text[start:start + SEQUENCE_LEN + 1]
    # Keep the window only when none of its words appears in ignored_words
    if set(window).isdisjoint(ignored_words):
        train_sentences.append(window[:SEQUENCE_LEN])
        train_next_words.append(window[SEQUENCE_LEN])

In [0]:
# Same sliding-window extraction as for training, applied to the validation text.
valid_sentences, valid_next_words = [], []
for start in range(len(valid_text) - SEQUENCE_LEN):
    window = valid_text[start:start + SEQUENCE_LEN + 1]
    # Keep the window only when none of its words appears in ignored_words
    if set(window).isdisjoint(ignored_words):
        valid_sentences.append(window[:SEQUENCE_LEN])
        valid_next_words.append(window[SEQUENCE_LEN])

In [0]:
# Same sliding-window extraction as for training, applied to the test text.
test_sentences, test_next_words = [], []
for start in range(len(test_text) - SEQUENCE_LEN):
    window = test_text[start:start + SEQUENCE_LEN + 1]
    # Keep the window only when none of its words appears in ignored_words
    if set(window).isdisjoint(ignored_words):
        test_sentences.append(window[:SEQUENCE_LEN])
        test_next_words.append(window[SEQUENCE_LEN])

In [0]:
def generate_data(sentence_list, next_word_list, batch_size):
    """Yield one-hot encoded ``(x, y)`` batches forever, cycling through the data.

    Relies on the notebook globals ``SEQUENCE_LEN``, ``words`` and
    ``word_indices``.

    Args:
        sentence_list: sequence of word lists; each word must be a key of
            ``word_indices`` and each list at most ``SEQUENCE_LEN`` long.
        next_word_list: target word for the sentence at the same position.
        batch_size: number of examples in every yielded batch.

    Yields:
        tuple: boolean arrays ``x`` of shape
        ``(batch_size, SEQUENCE_LEN, len(words))`` and ``y`` of shape
        ``(batch_size, len(words))``.

    Raises:
        ValueError: when the first batch is requested and the inputs are
            empty or of different lengths.
    """
    if len(sentence_list) == 0:
        raise ValueError('sentence_list is empty: there is nothing to generate batches from')
    if len(sentence_list) != len(next_word_list):
        raise ValueError('sentence_list and next_word_list must have the same length')

    index = 0
    while True:
        # Builtin bool instead of np.bool: that alias was deprecated in
        # NumPy 1.20 and removed in 1.24.
        x = np.zeros((batch_size, SEQUENCE_LEN, len(words)), dtype=bool)
        y = np.zeros((batch_size, len(words)), dtype=bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index]]] = 1

            # Wrap around so the generator never runs out of examples.
            index = (index + 1) % len(sentence_list)
        yield x, y

In [0]:
BATCH_SIZE = 128  # examples per yielded batch

# One endlessly cycling one-hot batch generator per data split.
train_generator = generate_data(train_sentences, train_next_words, BATCH_SIZE)
validation_generator = generate_data(valid_sentences, valid_next_words, BATCH_SIZE)
test_generator = generate_data(test_sentences, test_next_words, BATCH_SIZE)

In [0]:
# Next-word predictor: one-hot context window -> LSTM(128) -> softmax over the vocabulary.
model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LEN, len(words))))
model.add(Dense(len(words)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

# train the model and save to the Model folder
checkpoint_dir = './model'
# Create the folder up front so the first checkpoint write cannot fail on a missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)
# TF 2.x logs the 'accuracy' metric on validation data as 'val_accuracy'
# (TF 1.x used 'val_acc'); a key missing from the logs makes ModelCheckpoint
# fail when it formats the file name.
checkpointer = ModelCheckpoint(filepath=os.path.join(checkpoint_dir, '{val_accuracy:.2f}.h5'),
                               save_best_only=True, verbose=1)
# Model.fit consumes Python generators directly; fit_generator is deprecated
# and no longer available in recent Keras releases.
model.fit(train_generator, steps_per_epoch=len(train_sentences)//BATCH_SIZE, epochs=10,
          validation_data=validation_generator, validation_steps=len(valid_sentences)//BATCH_SIZE,
          callbacks=[checkpointer])

## This code is adapted from my previous [project](https://github.com/KingLeo2000/Vietnamese-Spell-Correction/blob/master/train_spell.ipynb) at university

In [0]:
def sample(preds, temperature=1.0):
    """Draw one index at random from a probability vector.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example a
            softmax output). It is re-normalised in float64 before sampling.
        temperature: positive scaling factor. ``1.0`` (the default) samples
            from ``preds`` as given; values below 1 favour the most likely
            entries, values above 1 flatten the distribution.

    Returns:
        The sampled index (NumPy integer).

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    preds = np.asarray(preds, dtype='float64')
    # log(0) -> -inf is intended (a zero probability stays zero), so silence
    # the divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum so the largest term is exp(0) = 1 and small
    # temperatures cannot underflow every entry to zero.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    # Re-normalising in float64 keeps the sum within multinomial's tolerance
    # even when the input came from a float32 softmax.
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [0]:
def generate_text(seed, max_word=30):
    """Print ``max_word`` words sampled from the model as a continuation of ``seed``.

    Relies on the notebook globals ``model``, ``sample``, ``words``,
    ``word_indices``, ``indices_word`` and ``SEQUENCE_LEN``.

    Args:
        seed: space-separated prompt. Only its last ``SEQUENCE_LEN`` words
            are fed to the model.
        max_word: number of words to generate and print.

    Raises:
        KeyError: if a seed word is not a key of ``word_indices``.
    """
    # The model input has exactly SEQUENCE_LEN positions; a longer seed would
    # index past the end of x_pred, so keep only the most recent words.
    sentence = seed.split()[-SEQUENCE_LEN:]

    # Fail early with a readable message instead of a bare KeyError mid-loop.
    unknown = [word for word in sentence if word not in word_indices]
    if unknown:
        raise KeyError(f'seed contains words outside the vocabulary: {unknown}')

    for _ in range(max_word):
        x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
        for t, word in enumerate(sentence):
            x_pred[0, t, word_indices[word]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_word = indices_word[next_index]

        # Append the new word and keep at most SEQUENCE_LEN words: a full
        # window slides by one, a short seed grows until the window is full.
        sentence = (sentence + [next_word])[-SEQUENCE_LEN:]

        print(' '+next_word, end='')

In [0]:
# Use the second extracted test window as the prompt, joined back into one space-separated string.
seed = ' '.join(test_sentences[1])
seed

'unk robert unk is an english film television and theatre'

In [0]:
# Print a sampled continuation of the seed (max_word defaults to 30 words).
generate_text(seed)

 it stands over the german reviews including seven croatian six from hull and november the palace and jews any club for years with mark of service and participated in the