In [41]:
import random
import re
import os
import sys
import tqdm
import xml.etree.ElementTree as ET

import numpy as np
from nltk import tokenize

import string

In [42]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence

In [14]:
# Don't run this cell unless you are using GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

In [15]:
class Debug:
    def __init__(self, debug_mode=True):
        self.debug_mode = debug_mode
        self.flag = {}

    def log(self, target, flag=None):
        if self.debug_mode:
            if flag is None:
                print(target)
            else:
                if flag in self.flag.keys():
                    if self.flag[flag]:
                        print(target)

    def set_flag(self, flag: str, val: bool):
        self.flag[flag] = val

debug = Debug(True)

In [16]:
class GeneratorExceptions(Exception):
    """
    The Exception class for tracking all exceptions raised in data generator
    Param
        text: the displayed text
    """
    def __init__(self, text: str):
        self.text = text

class data_generator(Sequence):
    """The data generator for the model.
    
    The data generator that batches and loads data and the pass it into the model
    
    Attributes:
        dataset_file_path:  the path to the dataset file, only need when using our preprocessing method
        processed_dataset_path: the path to the processed dataset directory
        batch_size: the number of files in each batch, it's recommend to be larger than 1
        shuffle: if the generator shuffle the dataset after each epoch
        file_list: the list of all processed data
        encoder_input_data: the input data for the encoder
        decoder_input_data: the input data for the decoder
        decoder_input_data: the target data for the decoder
        char_list: the list that stores all possible characters in the data
        characters_set: the set generated based on the char_lists
        tokens_count: total number of all possible characters
        max_sequence_len: the maximum length of a sequence
    """
    def __init__(self, 
                 dataset_file_path : str="data/dataset/nysk.xml", 
                 processed_dataset_path: str ="data/processed_dataset/",
                 batch_size = 2,
                 shuffle = True):
        """The constructor.
        Args:
            dataset_file_path: the path to the dataset file, only need when using our preprocessing method
            processed_dataset_path: the path to the processed dataset directory
            batch_size: the number of files in each batch, it's recommend to be larger than 1
            shuffle: if the generator shuffle the dataset after each epoch
        """
        self.dataset_file_path = dataset_file_path
        self.processed_dataset_path = processed_dataset_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        
        self.file_list = []
        
        self.encoder_input_data = []
        self.decoder_input_data = []
        self.decoder_target_data = []
        
        self.char_list = []
        self.characters_set = set()
        self.tokens_count = 0
        self.max_sequence_len = 0
        
        self.token_index = None
        self.input_texts = None
        self.model = None
        
        self.epoch_count = 0
        
        self.validation_split=0.1
    
    def generate_char_dict(self):
        """Generates the character dictionary (Token index)"""
        for i in string.ascii_letters:
            self.characters_set.add(i)
        
        for i in "1234567890\n\t.,!(){}\"\' ":
            self.characters_set.add(i)
        
        self.char_list = sorted(list(self.characters_set))
        self.tokens_count = len(self.char_list)
        self.token_index = dict([(char, i) for i, char in enumerate(self.characters_set)])

        
    def preprocess_data(self, override=False, max:int=-1):
        """Preprocess the data to enabel to model to use these data
        
        This function removes the unnecessary characters in the dataset and save each element
        into a separated txt file
        
        Args:
            override: if this function overwrite the file when already exists
            max: the maximum number of files that will be processed, in case not the whole dataset 
                 want to be used
        """
        self.file_list = []
        if os.path.isfile(self.dataset_file_path):
            if not os.path.isdir(self.processed_dataset_path):
                os.mkdir(self.processed_dataset_path)

            with open(self.dataset_file_path, "r", encoding="utf-8") as f:
                doc = ET.ElementTree(file=f)

            root = doc.getroot()
            
            pos = 0
            
            for item in tqdm.tqdm(root):
                if max != -1:
                    if pos > max:
                        break
                    pos += 1
                news_id = item.findtext('docid')
                source = item.findtext('source')
                url = item.findtext('url')
                title = item.findtext('title')
                summary = item.findtext('summary')
                raw_text = item.findtext('text')

                title = re.sub(r"<.*>", "", title)
                title = re.sub(r"\W", "_", title)
                title = f"{news_id}_{title[:10]}"
                
                res = tokenize.sent_tokenize(raw_text)
                sentences_count = len(res)
                       
                text = ""
                
                
                for s in res:
                    
                    if len(s) > 200:
                        continue
                    else:
                        if len(s) > self.max_sequence_len:
                            self.max_sequence_len = len(s)
                        t_sentence = ""
                        for c in s:
                            if c in self.characters_set:
                                t_sentence += c
                        t_sentence += "\t"
                        text += t_sentence
                
                fp = f"{self.processed_dataset_path}{title}_{sentences_count}.txt"
                
                if not os.path.isfile(fp) or override:
                    with open(fp, 'w', encoding='utf-8') as f:
                        f.write(text)
            self.max_sequence_len += 3
            
        else:
            raise GeneratorExceptions("Path doesn't exist")
    
    def generate_file_list(self, length:int=-1):
        """Generated the list of elements"""
        temp = os.listdir(self.processed_dataset_path)
        self.file_list = []
        t_list = []
        for i in temp:
            if os.path.splitext(i)[-1] == ".txt":
                t_list.append(f"{self.processed_dataset_path}{i}")
        self.file_list = t_list[1:length]
    
    def process_data(self, text):
        """process and load a file"""
        input_texts = []
        target_texts = []
        
        for i in range(0, len(text)-1):
            input_t = f"\t{text[i]}\n"
            target_t = f"\t{text[i+1]}\n"
            input_texts.append(input_t)
            target_texts.append(target_t)
        
        temp_encoder_input_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )
        temp_decoder_input_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )
        temp_decoder_target_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )

        for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
            for t, char in enumerate(input_text):
                temp_encoder_input_data[i, t, self.token_index[char]] = 1.0
                temp_encoder_input_data[i, t + 1 :, self.token_index[" "]] = 1.0
            for t, char in enumerate(target_text):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                temp_decoder_input_data[i, t, self.token_index[char]] = 1.0
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    temp_decoder_target_data[i, t - 1, self.token_index[char]] = 1.0
                    temp_decoder_input_data[i, t + 1 :, self.token_index[" "]] = 1.0
                    temp_decoder_target_data[i, t:, self.token_index[" "]] = 1.0
        
        self.encoder_input_data = temp_encoder_input_data
        self.decoder_input_data = temp_decoder_input_data
        self.decoder_target_data = temp_decoder_target_data
        
        self.input_texts = input_texts
        
        return
    
    # The following methods will be called during the training and shouldn't be manually called
    # unless you know the reason you are calling these methods
    def __len__(self):
        return len(self.file_list) // self.batch_size
    
    def __getitem__(self, index):
        files = self.file_list[index * self.batch_size:(index + 1) * self.batch_size]
        res = []
        for fl in files:
            with open(fl, 'r', encoding='utf-8') as dt:
                text = dt.read()
                temp_list = text.split("\t")
                res.extend(temp_list)
        
        self.process_data(res)
        return [self.encoder_input_data, self.decoder_input_data], self.decoder_target_data,
    
    def on_epoch_end(self):
        if self.shuffle:
            random.shuffle(self.file_list)
        model.save_weights(f"ckpt/ckpt-{self.epoch_count}.hdf5")
        self.epoch_count += 1
        return
    

In [17]:
# Run this when you see exceptions from nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Instantiate and check the data generator
DataGenerator = data_generator()

# Generate the character dictionary
DataGenerator.generate_char_dict()

print(DataGenerator.token_index)

{'K': 0, 'F': 1, 'q': 2, 'D': 3, ')': 4, 'a': 5, 'U': 6, 'b': 7, 'o': 8, 'V': 9, '.': 10, 'd': 11, 'N': 12, 'Q': 13, 'h': 14, 'S': 15, 'p': 16, 'i': 17, 'P': 18, 'M': 19, 'A': 20, 'Z': 21, '2': 22, '5': 23, 's': 24, 'u': 25, '0': 26, '(': 27, ',': 28, 'e': 29, 'z': 30, 'G': 31, 'T': 32, '7': 33, '"': 34, 'E': 35, 'H': 36, '\t': 37, ' ': 38, 't': 39, 'J': 40, 'Y': 41, 'w': 42, 'I': 43, 'O': 44, 'W': 45, '1': 46, '\n': 47, '!': 48, 'm': 49, 'B': 50, 'f': 51, 'x': 52, 'X': 53, 'y': 54, 'v': 55, 'n': 56, 'C': 57, '3': 58, 'L': 59, 'R': 60, '{': 61, '6': 62, '4': 63, 'r': 64, 'k': 65, 'c': 66, 'l': 67, '9': 68, 'g': 69, '}': 70, "'": 71, '8': 72, 'j': 73}


In [19]:
# Preprocess the data
DataGenerator.preprocess_data(override=True, max=200)

  2%|▏         | 201/10421 [00:00<00:19, 512.44it/s]


In [20]:
# Check if the data and the data generator is properly generated
print(DataGenerator.tokens_count)
print(DataGenerator.max_sequence_len)

74
203


In [21]:
DataGenerator.generate_file_list()
print(len(DataGenerator))
print(len(DataGenerator.file_list))

99
199


In [22]:
def generate_model(num_encoder_tokens, num_decoder_tokens, latent_dim=256):
    """generate the mode based on the given size"""
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder = keras.layers.LSTM(latent_dim, return_state=True)
    encoder_outputs_, state_h, state_c = encoder(encoder_inputs)
    
    encoder_states = [state_h, state_c]
    
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
    
    decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)
    
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    model.summary()
    
    return model

In [23]:
# Generates and compiles the model
latent_dim = 256

model = generate_model(num_encoder_tokens=DataGenerator.tokens_count, 
                       num_decoder_tokens=DataGenerator.tokens_count,
                       latent_dim=latent_dim)
model.compile(
    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 338944      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  338944      input_2[0][0]                    
                                                                 lstm[0][1]            

In [24]:
# This cell is used for debugging and testing
print(sys.getsizeof(model))
# model.load_weights("ckpt/ckpt-0005.ckpt")
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

56
2.3.1
Num GPUs Available:  0


In [25]:
# Training the model
epochs = 20

print(DataGenerator.token_index)
DataGenerator.model = model

model.fit(DataGenerator, 
          epochs=epochs, 
          verbose=1)

{'K': 0, 'F': 1, 'q': 2, 'D': 3, ')': 4, 'a': 5, 'U': 6, 'b': 7, 'o': 8, 'V': 9, '.': 10, 'd': 11, 'N': 12, 'Q': 13, 'h': 14, 'S': 15, 'p': 16, 'i': 17, 'P': 18, 'M': 19, 'A': 20, 'Z': 21, '2': 22, '5': 23, 's': 24, 'u': 25, '0': 26, '(': 27, ',': 28, 'e': 29, 'z': 30, 'G': 31, 'T': 32, '7': 33, '"': 34, 'E': 35, 'H': 36, '\t': 37, ' ': 38, 't': 39, 'J': 40, 'Y': 41, 'w': 42, 'I': 43, 'O': 44, 'W': 45, '1': 46, '\n': 47, '!': 48, 'm': 49, 'B': 50, 'f': 51, 'x': 52, 'X': 53, 'y': 54, 'v': 55, 'n': 56, 'C': 57, '3': 58, 'L': 59, 'R': 60, '{': 61, '6': 62, '4': 63, 'r': 64, 'k': 65, 'c': 66, 'l': 67, '9': 68, 'g': 69, '}': 70, "'": 71, '8': 72, 'j': 73}
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f624c49e860>

In [26]:
# Save the weights after the training
# All the checkpoints can also be used as weights
model.save_weights("Model/test_weights.hdf5")

In [36]:
# Please make sure the model size is consistent
new_model = generate_model(num_encoder_tokens=DataGenerator.tokens_count, 
                           num_decoder_tokens=DataGenerator.tokens_count,
                           latent_dim=latent_dim)
new_model.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

new_model.summary()

# The trained weights are in the Model/ folder, load anything you want
# All checkkpoints are in ckpt/ folder

new_model.load_weights("Model/trained_weights.hdf5")
# new_model.load_weights("ckpt/ckpt-0005.ckpt")

Model: "functional_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, 256), (None, 338944      input_7[0][0]                    
__________________________________________________________________________________________________
lstm_7 (LSTM)                   [(None, None, 256),  338944      input_8[0][0]                    
                                                                 lstm_6[0][1]         

In [37]:
encoder_inputs = new_model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = new_model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = new_model.input[1]  # input_2

# If there's any issue with conflict of names, change the name of the following two lines
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_5")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_6")

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = new_model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = new_model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

reverse_input_char_index = dict((i, char) for char, i in DataGenerator.token_index.items())
reverse_target_char_index = dict((i, char) for char, i in DataGenerator.token_index.items())

In [38]:
def decode_sequence(DataGenerator: data_generator, input_seq):
    """function for decoding the generated sequence"""
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, DataGenerator.tokens_count))
    target_seq[0, 0, DataGenerator.token_index["\t"]] = 1.0

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == "\n" or len(decoded_sentence) > DataGenerator.max_sequence_len:
            stop_condition = True

        target_seq = np.zeros((1, 1, DataGenerator.tokens_count))
        target_seq[0, 0, sampled_token_index] = 1.0

        states_value = [h, c]
    return decoded_sentence

In [40]:
# Predict and output the result
for seq_index in range(1):
    print(seq_index)
    DataGenerator.__getitem__(0)
    input_seq = DataGenerator.encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(DataGenerator, input_seq)
    print("-")
    print("Input sentence:", DataGenerator.input_texts[seq_index])

    print("Decoded sentence:", decoded_sentence)


0
-
Input sentence: 	If you work in a corporate environment you may not be able to upgrade your browser to IE 7 or IE8.

Decoded sentence: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
