In [23]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Activation
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
import pickle
import codecs
import re
import json
from keras_preprocessing.text import tokenizer_from_json

In [24]:
# Read the whole private test file into memory as UTF-8 text; later cells split it on '\n'.
# The context manager closes the handle even if read() raises; the original
# left the file open for the lifetime of the kernel.
with codecs.open("private_test_stresses.txt", "r", "utf-8") as f:
    lines = f.read()

In [25]:
# Split the raw text on '\n' and discard the final piece of the split
# (the empty string that follows a trailing newline).
raw_rows = lines.split('\n')
test_words = raw_rows[:len(raw_rows) - 1]
test_words[:3]

['ааках', 'ааку', 'аал']

In [26]:
%%time
# Replace every word, in place, by its overlapping 3-character substrings
# joined with single spaces. Words shorter than three characters have no
# such substring and therefore become the empty string.
for pos, word in enumerate(test_words):
    test_words[pos] = ' '.join(
        word[start:start + 3] for start in range(len(word) - 2)
    )

CPU times: total: 938 ms
Wall time: 937 ms


In [27]:
# One-column frame holding the space-separated k-mer strings.
test_df = pd.DataFrame(dict(words=test_words))

In [28]:
# Restore the fitted tokenizer from disk.
# NOTE(review): tokenizer_from_json is handed whatever json.load returns, so the
# file presumably stores the tokenizer config as a JSON-encoded string — confirm.
with open('stress_tokenizer.json') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

# One more than the number of entries in the tokenizer's word index.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

12117

In [29]:
# Turn each k-mer string into a list of token ids, then pad / cut every list
# at its end so that all rows have exactly `max_length` entries.
max_length = 34
test_seq = tokenizer.texts_to_sequences(test_df['words'])
test_vector = tf.keras.preprocessing.sequence.pad_sequences(
    test_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [30]:
# Adam optimizer constructed with no arguments; handed to model.compile() further down in this cell.
opt = keras.optimizers.Adam()

def model(vocab_size, max_length):
    """Build the (uncompiled) sequential classifier network.

    Layer stack, in order: a 64-dimensional embedding, two bidirectional
    LSTMs with 64 units each (the first one returns the full sequence),
    Dense(128, relu), Dropout(0.5) and a final Dense(13, softmax).

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_length: Passed to the embedding layer as ``input_length``.

    Returns:
        The assembled ``tf.keras.models.Sequential`` instance.
    """
    nn = tf.keras.layers
    stack = [
        nn.Embedding(vocab_size, 64, input_length=max_length),
        nn.Bidirectional(nn.LSTM(64, return_sequences=True)),
        nn.Bidirectional(nn.LSTM(64)),
        nn.Dense(128, activation='relu'),
        nn.Dropout(0.5),
        nn.Dense(13, activation='softmax'),
    ]
    return tf.keras.models.Sequential(stack)

# From here on `model` names the network itself rather than the builder function.
model = model(vocab_size, max_length)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 34, 64)            775488    
                                                                 
 bidirectional_4 (Bidirectio  (None, 34, 128)          66048     
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 13)               

In [32]:
# Load the saved network from disk; this rebinds `model`, replacing the one built above.
# NOTE(review): the recorded run of this cell raised
# "OSError: Unable to open file (file signature not found)" and every later cell
# shows no execution count. Verify that the file exists, is intact, and is in a
# format the installed Keras version can read before relying on the cells below.
model = keras.models.load_model('stress_model_all_data_20ep.keras')

OSError: Unable to open file (file signature not found)

In [None]:
# Per-class scores for every padded test row; the predicted class of a row is
# the index of its largest score along the last axis.
probabilities = model.predict(test_vector)
predictions = probabilities.argmax(axis=-1)

In [None]:
# Store the predicted class next to each row, together with the untouched word
# (the 'words' column holds k-mer strings, so the text is split again here).
test_df['pred'] = predictions
original_words = lines.split('\n')
del original_words[-1]
test_df['original_words'] = original_words

In [None]:
# Write every test word with a '^' inserted right after its predicted stressed vowel.
# The predicted class is used as an index into the list of vowel positions of the word.
# NOTE(review): `vowel_regex` is not defined in any cell visible here (only `re` is
# imported). It has to be a compiled pattern that already exists in the kernel, so
# this cell raises NameError after a fresh restart until that definition is restored.
result = []
rows = zip(test_df['original_words'], test_df['pred'], probabilities)
for word, pred_idx, class_probs in rows:
    vowel_starts = [match.start() for match in vowel_regex.finditer(word)]

    if not vowel_starts:
        # No vowel matched, so there is nothing to stress. The previous fallback
        # loop could not succeed here: it kept stepping through the sorted classes
        # until it indexed past their end and raised IndexError. Emit the word
        # unmarked so the output still has one line per input word.
        result.append(word + '\n')
        continue

    if pred_idx >= len(vowel_starts):
        # The top class points past the last vowel of this word: fall back to the
        # most probable class that is a valid vowel index. Class 0 always
        # qualifies because the word has at least one vowel.
        ranked = np.argsort(class_probs)[::-1]
        pred_idx = next(cls for cls in ranked if cls < len(vowel_starts))

    # Position just after the chosen vowel, where the stress mark is inserted.
    stress_idx = vowel_starts[pred_idx] + 1
    result.append(word[:stress_idx] + '^' + word[stress_idx:] + '\n')
result[0:3]

In [None]:
# Every entry of `result` already ends with '\n', so plain concatenation
# yields the final text with one marked word per line.
s = ''.join(result)

In [None]:
# Write the submission text as UTF-8. The context manager flushes and closes
# the file even if write() raises; the original open/write/close sequence
# leaked the handle in that case.
with codecs.open("private_lstm_submission_all_data_20ep.txt", "w", "utf-8") as file:
    file.write(s)