In [5]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate
import numpy as np
from sklearn.model_selection import train_test_split




In [11]:
import random

def train_sandhi_split(dtrain, dtest):
    batch_size = 64  # Batch size for training.
    epochs = 1  # Number of epochs to train for.
    latent_dim = 128  # Latent dimensionality of the encoding space.

    # Vectorize the data.
    input_texts = []
    target_texts = []
    X_tests = []
    Y_tests = []
    characters = set()

    for data in dtrain:
        [input_text, target_text] = data.split(',')
    
        # We use "&" as the "start sequence" character for the targets, and "$" as "end sequence" character.
        target_text = '&' + target_text + '$'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in characters:
                characters.add(char)
        for char in target_text:
            if char not in characters:
                characters.add(char)
    
    for data in dtest:
        [input_text, target_text] = data.split(',')

        # We use "&" as the "start sequence" character for the targets, and "$" as "end sequence" character.
        target_text = '&' + target_text + '$'
        X_tests.append(input_text)
        Y_tests.append(target_text)
        for char in input_text:
            if char not in characters:
                characters.add(char)
        for char in target_text:
            if char not in characters:
                characters.add(char)
    
    # Using '*' for padding 
    characters.add('*')
    
    characters = sorted(list(characters))
    num_tokens = len(characters)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])
    
    print('Number of samples:', len(input_texts))
    print('Number of unique tokens:', num_tokens)
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)
    
    token_index = dict([(char, i) for i, char in enumerate(characters)])
    
    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_tokens), dtype='float32')
    decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_tokens), dtype='float32')
    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_tokens), dtype='float32')
    
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, token_index[char]] = 1.
        encoder_input_data[i, t + 1:, token_index['*']] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, token_index[char]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, token_index[char]] = 1.
        decoder_input_data[i, t + 1:, token_index['*']] = 1.
        decoder_target_data[i, t:, token_index['*']] = 1.
    
    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_tokens))
    encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, dropout=0.5))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    
    encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, forward_h, forward_c, backward_h, backward_c])
    
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]
    
    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_tokens))
    
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True, dropout=0.5)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(num_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = Model(inputs=decoder_inputs, outputs=[encoder_outputs, decoder_outputs, _, _])
    
    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # Run training
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1)

    return (encoder_model, decoder_model, token_index, max_encoder_seq_length, max_decoder_seq_length, num_tokens)

with open("samasa_split_train_data.csv", 'r', encoding='utf-8') as f:
    dl = f.readlines()
random.shuffle(dl)
dtrain, dtest = train_test_split(dl, test_size=0.2, random_state=1)
(encoder_model, decoder_model, token_index, max_encoder_seq_length, max_decoder_seq_length, num_tokens) = train_sandhi_split(dtrain, dtest[0:1])

Number of samples: 179895
Number of unique tokens: 52
Max sequence length for inputs: 6
Max sequence length for outputs: 11


ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, None, 52), dtype=tf.float32, name='input_7'), name='input_7', description="created by layer 'input_7'") at layer "bidirectional_3". The following previous layers were accessed without issue: []

In [10]:
from tqdm import tqdm

def decode_sequence(input_seq, encoder_model, decoder_model, reverse_target_char_index):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, token_index['&']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '$' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

def infer_sandhi_split(dtest, encoder_model, decoder_model, token_index, max_encoder_seq_length, max_decoder_seq_length, num_tokens):
    input_texts = []
    target_texts = []

    for data in dtest:
        [input_text, target_text] = data.split(',')
        input_texts.append(input_text)
        target_texts.append(target_text)

    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_tokens), dtype='float32')
    
    for i, input_text in enumerate(input_texts):
        for t, char in enumerate(input_text):
            if char not in token_index:
                continue
            encoder_input_data[i, t, token_index[char]] = 1.
        encoder_input_data[i, t + 1:, token_index['*']] = 1.
    
    # Reverse-lookup token index to decode sequences back to something readable.
    reverse_input_char_index = dict((i, char) for char, i in token_index.items())
    reverse_target_char_index = dict((i, char) for char, i in token_index.items())
    
    total = len(encoder_input_data)
    passed = 0
    results = []
    for seq_index in tqdm(range(len(encoder_input_data))):
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq, encoder_model, decoder_model, reverse_target_char_index)
        decoded_sentence = decoded_sentence.strip()
        decoded_sentence = decoded_sentence.strip('$')
        results.append(decoded_sentence)
        if decoded_sentence == target_texts[seq_index]:
            passed = passed + 1

    print("Passed: "+str(passed)+'/'+str(total)+', '+str(passed*100/total))

infer_sandhi_split(dtest[0:100], encoder_model, decoder_model, token_index, max_encoder_seq_length, max_decoder_seq_length, num_tokens)

  0%|          | 0/100 [00:00<?, ?it/s]


AttributeError: 'Bidirectional' object has no attribute 'predict'

In [9]:
encoder_model.save('stage2_encoder.h5')
decoder_model.save('stage2_decoder.h5')
fh = open('stage2_token_index.txt', 'w')
data = str(token_index)
fh.write(data)
fh.close()

AttributeError: 'Bidirectional' object has no attribute 'save'

In [4]:
import random
from sklearn.model_selection import train_test_split
with open("samasa_split_train_data.csv", 'r', encoding='utf-8') as f:
    dl = f.readlines()
random.shuffle(dl)
dtrain, dtest = train_test_split(dl, test_size=0.2, random_state=1)
fp = open("stage2_dtest.csv", "w")
for data in dtest:
    fp.write(data)
fp.close()