In [21]:
import random
import re
import os
import sys
import tqdm
import xml.etree.ElementTree as ET

import numpy as np
import tensorflow as tf
from tensorflow import keras
from nltk import tokenize
from tensorflow.keras.utils import Sequence

import string

In [9]:
class Debug:
    def __init__(self, debug_mode=True):
        self.debug_mode = debug_mode
        self.flag = {}

    def log(self, target, flag=None):
        if self.debug_mode:
            if flag is None:
                print(target)
            else:
                if flag in self.flag.keys():
                    if self.flag[flag]:
                        print(target)

    def set_flag(self, flag: str, val: bool):
        self.flag[flag] = val

debug = Debug(True)

In [10]:
class GeneratorExceptions(Exception):
    """
    The Exception class for tracking all exceptions raised in data generator
    Param
        text: the displayed text
    """
    def __init__(self, text: str):
        self.text = text

class data_generator(Sequence):
    def __init__(self, 
                 dataset_file_path : str="data/dataset/nysk.xml", 
                 processed_dataset_path: str ="data/processed_dataset/",
                 batch_size = 1,
                 shuffle = True):
        self.dataset_file_path = dataset_file_path
        self.processed_dataset_path = processed_dataset_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_list = []
        self.encoder_input_data = []
        self.decoder_input_data = []
        self.decoder_target_data = []
        self.char_list = []
        self.characters_set = set()
        self.tokens_count = 0
        self.max_sequence_len = 0
        self.token_index = None
        self.input_texts = None
        self.model = None
        
        self.validation_split=0.1
    
    def generate_char_dict(self):
        
        for i in string.ascii_letters:
            self.characters_set.add(i)
        
        for i in "1234567890\n\t.,!(){}\"\' ":
            self.characters_set.add(i)
        
        self.char_list = sorted(list(self.characters_set))
        self.tokens_count = len(self.char_list)
        self.token_index = dict([(char, i) for i, char in enumerate(self.characters_set)])

        
    def preprocess_data(self, override=False, max:int=3):
        self.file_list = []
        if os.path.isfile(self.dataset_file_path):
            if not os.path.isdir(self.processed_dataset_path):
                os.mkdir(self.processed_dataset_path)

            with open(self.dataset_file_path, "r", encoding="utf-8") as f:
                doc = ET.ElementTree(file=f)

            root = doc.getroot()
            
            pos = 0
            
            for item in tqdm.tqdm(root):
                if max != -1:
                    if pos > max:
                        break
                    pos += 1
                news_id = item.findtext('docid')
                source = item.findtext('source')
                url = item.findtext('url')
                title = item.findtext('title')
                summary = item.findtext('summary')
                raw_text = item.findtext('text')

                title = re.sub(r"<.*>", "", title)
                title = re.sub(r"\W", "_", title)
                title = f"{news_id}_{title[:10]}"
                
                res = tokenize.sent_tokenize(raw_text)
                sentences_count = len(res)
                       
                text = ""
                
                
                for s in res:
                    
                    if len(s) > 200:
                        continue
                    else:
                        if len(s) > self.max_sequence_len:
                            self.max_sequence_len = len(s)
                        t_sentence = ""
                        for c in s:
                            if c in self.characters_set:
                                t_sentence += c
                        t_sentence += "\t"
                        text += t_sentence
                
                fp = f"{self.processed_dataset_path}{title}_{sentences_count}.txt"
                
                if not os.path.isfile(fp) or override:
                    with open(fp, 'w', encoding='utf-8') as f:
                        f.write(text)
            self.max_sequence_len += 3
            
        else:
            raise GeneratorExceptions("Path doesn't exist")
    
    def generate_file_list(self, length:int=-1):
        temp = os.listdir(self.processed_dataset_path)
        self.file_list = []
        t_list = []
        for i in temp:
            t_list.append(f"{self.processed_dataset_path}{i}")
        self.file_list = t_list[1:length]
    
    def process_data(self, text):
        input_texts = []
        target_texts = []
        
        for i in range(0, len(text)-1):
            input_t = f"\t{text[i]}\n"
            target_t = f"\t{text[i+1]}\n"
            input_texts.append(input_t)
            target_texts.append(target_t)
        
        temp_encoder_input_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )
        temp_decoder_input_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )
        temp_decoder_target_data = np.zeros(
            (len(input_texts), self.max_sequence_len, self.tokens_count), dtype="float32"
        )

        for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
            for t, char in enumerate(input_text):
                temp_encoder_input_data[i, t, self.token_index[char]] = 1.0
                temp_encoder_input_data[i, t + 1 :, self.token_index[" "]] = 1.0
            for t, char in enumerate(target_text):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                temp_decoder_input_data[i, t, self.token_index[char]] = 1.0
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    temp_decoder_target_data[i, t - 1, self.token_index[char]] = 1.0
                    temp_decoder_input_data[i, t + 1 :, self.token_index[" "]] = 1.0
                    temp_decoder_target_data[i, t:, self.token_index[" "]] = 1.0
        
        self.encoder_input_data = temp_encoder_input_data
        self.decoder_input_data = temp_decoder_input_data
        self.decoder_target_data = temp_decoder_target_data
        
        self.input_texts = input_texts
        
        return
    
    def __len__(self):
        return len(self.file_list) // self.batch_size
    
    def __getitem__(self, index):
        files = self.file_list[index * self.batch_size:(index + 1) * self.batch_size]
        res = []
        for fl in files:
            with open(fl, 'r', encoding='utf-8') as dt:
                text = dt.read()
                temp_list = text.split("\t")
                res.extend(temp_list)
        
        self.process_data(res)
        print(sys.getsizeof(self.model))
        return [self.encoder_input_data, self.decoder_input_data], self.decoder_target_data,
    
    def on_epoch_end(self):
        if self.shuffle:
            random.shuffle(self.file_list)
        return
    

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Testing data generator
DataGenerator = data_generator()
DataGenerator.generate_char_dict()
print(DataGenerator.token_index)

{'J': 0, '1': 1, '{': 2, '"': 3, 'M': 4, 'w': 5, 'x': 6, 'A': 7, 'N': 8, 'O': 9, 'Y': 10, '0': 11, '(': 12, 'l': 13, 'q': 14, 'V': 15, 'S': 16, 'X': 17, 'o': 18, 'T': 19, 'g': 20, 't': 21, '3': 22, 'c': 23, 's': 24, 'B': 25, '8': 26, 'W': 27, 'e': 28, '5': 29, 'd': 30, 'i': 31, 'D': 32, 'L': 33, 'm': 34, 'b': 35, 'p': 36, '!': 37, ' ': 38, 'Q': 39, 'u': 40, 'r': 41, "'": 42, ')': 43, 'h': 44, 'H': 45, '9': 46, '\n': 47, 'R': 48, 'P': 49, 'v': 50, 'f': 51, 'C': 52, 'G': 53, '\t': 54, 'y': 55, '2': 56, '6': 57, 'E': 58, 'I': 59, 'K': 60, '.': 61, ',': 62, '}': 63, 'Z': 64, 'z': 65, '4': 66, 'U': 67, 'a': 68, 'k': 69, 'j': 70, '7': 71, 'F': 72, 'n': 73}


In [13]:
DataGenerator.preprocess_data(override=True, max=1000)

 10%|▉         | 1001/10421 [00:02<00:20, 468.67it/s]


In [14]:
print(DataGenerator.tokens_count)
print(DataGenerator.max_sequence_len)

74
203


In [15]:
DataGenerator.generate_file_list()
print(len(DataGenerator))
print(len(DataGenerator.file_list))
# print(temp_dg.__getitem__(1))

1000
1000


In [16]:
def generate_model(num_encoder_tokens, num_decoder_tokens, latent_dim=256):
      
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder = keras.layers.LSTM(latent_dim, return_state=True)
    encoder_outputs_, state_h, state_c = encoder(encoder_inputs)
    
    encoder_states = [state_h, state_c]
    
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
    
    decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)
    
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    model.summary()
    
    return model

In [18]:
latent_dim = 256

model = generate_model(num_encoder_tokens=DataGenerator.tokens_count, 
                       num_decoder_tokens=DataGenerator.tokens_count,
                       latent_dim=latent_dim)

model_name = "Model\SeqToSeq_Model"

model.compile(
    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 338944      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  338944      input_2[0][0]                    
                                                                 lstm[0][1]            

In [19]:
print(sys.getsizeof(model))
model.load_weights("ckpt/ckpt-0005.ckpt")

64


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7feb0d26b090>

In [None]:
# Don't execute this one for now
import nltk
nltk.download('punkt')

epochs = 5

print(DataGenerator.token_index)
DataGenerator.model = model

checkpoint_path = "ckpt/ckpt-{epoch:04d}.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=DataGenerator.batch_size)

model.fit(DataGenerator, 
          epochs=epochs, 
          callbacks=[cp_callback],
          verbose=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'J': 0, '1': 1, '{': 2, '"': 3, 'M': 4, 'w': 5, 'x': 6, 'A': 7, 'N': 8, 'O': 9, 'Y': 10, '0': 11, '(': 12, 'l': 13, 'q': 14, 'V': 15, 'S': 16, 'X': 17, 'o': 18, 'T': 19, 'g': 20, 't': 21, '3': 22, 'c': 23, 's': 24, 'B': 25, '8': 26, 'W': 27, 'e': 28, '5': 29, 'd': 30, 'i': 31, 'D': 32, 'L': 33, 'm': 34, 'b': 35, 'p': 36, '!': 37, ' ': 38, 'Q': 39, 'u': 40, 'r': 41, "'": 42, ')': 43, 'h': 44, 'H': 45, '9': 46, '\n': 47, 'R': 48, 'P': 49, 'v': 50, 'f': 51, 'C': 52, 'G': 53, '\t': 54, 'y': 55, '2': 56, '6': 57, 'E': 58, 'I': 59, 'K': 60, '.': 61, ',': 62, '}': 63, 'Z': 64, 'z': 65, '4': 66, 'U': 67, 'a': 68, 'k': 69, 'j': 70, '7': 71, 'F': 72, 'n': 73}
64
Epoch 1/5
64
64

Epoch 00001: saving model to ckpt/ckpt-0001.ckpt
   1/1000 [..............................] - ETA: 0s - loss: 8.0180 - accuracy: 5.1854e-0464

Epoch 00001: saving model to ckpt/ckpt-0001.ckpt
   2/1000 [..............................] - ETA: 5:47 - loss: 5.2889 - accuracy: 0.4575  64

Epoch 00001: saving model to ckpt/c

In [None]:
batch_size = 16
epochs = 1  

DataGenerator.__getitem__(1)

print(len(DataGenerator.encoder_input_data))
print(len(DataGenerator.encoder_input_data[0]))
print(len(DataGenerator.encoder_input_data[0][0]))

model.fit(
    [DataGenerator.encoder_input_data, DataGenerator.decoder_input_data],
    DataGenerator.decoder_target_data,
    batch_size=16,
    epochs=epochs,
    validation_split=0.2,
)

In [27]:
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 256), (None, 338944      input_3[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  338944      input_4[0][0]                    
                                                                 lstm_2[0][1]          

In [19]:
model.save_weights("Model/test.weights.hdf5")
# keras.models.save_model(model, model_name)

In [24]:
def decode_sequence(DataGenerator: data_generator, input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, DataGenerator.tokens_count))
    target_seq[0, 0, DataGenerator.token_index["\t"]] = 1.0

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        print(output_tokens)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == "\n" or len(decoded_sentence) > DataGenerator.max_sequence_len:
            stop_condition = True

        target_seq = np.zeros((1, 1, DataGenerator.tokens_count))
        target_seq[0, 0, sampled_token_index] = 1.0

        states_value = [h, c]
    return decoded_sentence

In [20]:
new_model = generate_model(num_encoder_tokens=DataGenerator.tokens_count, 
                           num_decoder_tokens=DataGenerator.tokens_count,
                           latent_dim=latent_dim)
new_model.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

new_model.summary()

new_model.load_weights("Model/test.weights.hdf5")
# new_model.load_weights("ckpt/ckpt-0005.ckpt")

Model: "functional_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, None, 74)]   0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 256), (None, 338944      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 256),  338944      input_6[0][0]                    
                                                                 lstm_4[0][1]         

In [22]:
encoder_inputs = new_model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = new_model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = new_model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = new_model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = new_model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

reverse_input_char_index = dict((i, char) for char, i in DataGenerator.token_index.items())
reverse_target_char_index = dict((i, char) for char, i in DataGenerator.token_index.items())

In [25]:
for seq_index in range(1):
    print(seq_index)
    DataGenerator.__getitem__(0)
    input_seq = DataGenerator.encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(DataGenerator, input_seq)
    print("-")
    print("Input sentence:", DataGenerator.input_texts[seq_index])
    for i in decoded_sentence:
        print(f"{i}, {ord(i)}")
    print("Decoded sentence:", decoded_sentence)


0
[[[1.54057780e-05 3.93492868e-03 1.12173110e-02 6.94556348e-03
   1.92048233e-02 8.49220203e-04 6.96478062e-04 1.18780925e-04
   2.71444034e-04 1.45018119e-02 4.06283827e-04 1.49002150e-02
   2.98226881e-03 3.98946814e-02 2.43264600e-04 2.85676506e-04
   2.80498294e-04 9.95060782e-06 1.81179717e-02 8.17182939e-04
   3.95574098e-05 2.10439437e-04 6.03581034e-02 7.91613385e-02
   1.59234870e-02 2.50103403e-05 1.18636075e-04 2.64085531e-02
   1.57416314e-02 7.43334764e-04 6.60320220e-04 8.05069358e-05
   9.87938493e-02 1.37178271e-04 1.88003015e-02 5.51329285e-04
   2.06402987e-02 2.52800155e-02 3.08531191e-04 1.51838109e-01
   9.22344800e-04 3.47868120e-03 2.88651441e-04 5.10072045e-04
   5.39908651e-05 2.33395302e-04 4.06308798e-04 2.65971292e-04
   7.41540571e-04 5.95427118e-02 3.51684634e-04 9.98393516e-04
   3.44118249e-04 2.03208474e-04 4.75200545e-03 4.79630289e-05
   8.97950726e-04 8.66891292e-04 4.20652737e-04 2.38476358e-02
   1.34288659e-03 9.09083610e-05 4.26027342e-04 4.540