In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Activation
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
import pickle
import codecs
import re
import json
import io

In [None]:
# Open a TF1-compat session whose config asks TensorFlow to log which device
# each op is placed on.
device_log_config = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=device_log_config)

In [None]:
# Colab-only: mount Google Drive so the training data under
# /content/drive is readable from this runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Read the whole labelled training file into memory as UTF-8 text.
# The context manager closes the handle as soon as the read finishes
# (previously the file object was left open for the life of the kernel).
with codecs.open("/content/drive/MyDrive/RuCode_stress/train_stresses_labels.txt", "r", "utf-8") as f:
    lines = f.read()

In [None]:
# One entry per line of the input file (split on '\n' only).
words = lines.split('\n')
# Peek at the first three entries as a sanity check.
words[:3]

In [None]:
%%time
# Lowercase Russian vowels; every match is a candidate stress position.
vowel_regex = re.compile("[уеыаоэяиюё]")

# Sentinel stored when no stressed vowel can be located for a word.
# Rows carrying this value are meant to be filtered out before training.
NO_STRESS = -2


def stress_vowel_index(marked_word):
    """Return which vowel (0-based, counting vowels only) carries the stress.

    The character immediately before the first '^' is treated as the stressed
    letter.

    Args:
        marked_word: A word that may contain a '^' stress marker.

    Returns:
        The ordinal of the stressed vowel among the vowels matched by
        ``vowel_regex``, or ``NO_STRESS`` when the word has no '^' or the
        character before it is not one of those vowels.
    """
    vowel_positions = [m.start() for m in vowel_regex.finditer(marked_word)]
    # find() yields -1 when '^' is absent, so stressed_pos becomes -2, which
    # can never be a vowel position and falls through to the sentinel below.
    stressed_pos = marked_word.find('^') - 1
    try:
        return vowel_positions.index(stressed_pos)
    except ValueError:
        # list.index raises ValueError when stressed_pos is not a vowel
        # position; nothing else is expected here, so nothing else is caught.
        return NO_STRESS


def char_trigrams(word):
    """Return the overlapping 3-character windows of ``word`` joined by spaces.

    Words shorter than three characters produce an empty string.
    """
    return ' '.join(word[left:left + 3] for left in range(len(word) - 2))


# Labels are computed from the marked words; the marker is stripped only
# afterwards, when building the trigram text.
accent_index = [stress_vowel_index(w) for w in words]

# NOTE: `words` is overwritten with the trigram text, so re-run the cell that
# builds `words` before re-running this one.
words = [char_trigrams(w.replace('^', '')) for w in words]

In [None]:
# Pair every trigram string with its stress label.
df = pd.DataFrame({'words': words, 'accent_index': accent_index})

# Drop rows whose label is the -2 "no stress found" sentinel.
has_stress_label = df['accent_index'] != -2
df = df.loc[has_stress_label]

# Work on an independent copy so later edits do not touch `df`.
X_train = df.copy()

In [None]:
# One-hot encode the stressed-vowel ordinal; the number of classes is
# inferred by to_categorical from the labels themselves.
accent_labels = X_train['accent_index']
y_train = tf.keras.utils.to_categorical(accent_labels)

In [None]:
# Fit a tokenizer on the space-separated trigram strings.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train['words'])
# One more than the number of known tokens -- presumably to leave index 0
# free for padding; confirm against the Keras Tokenizer docs.
vocab_size = 1 + len(tokenizer.word_index)

# Persist the fitted tokenizer next to the notebook. to_json() already
# returns a JSON string; it is passed through json.dumps again, so the file
# holds a JSON-encoded string and must be read back accordingly.
tokenizer_json = tokenizer.to_json()
serialized_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with io.open('stress_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(serialized_tokenizer)

In [None]:
# Convert every trigram string into its list of integer token ids.
trigram_texts = X_train['words']
train_seq = tokenizer.texts_to_sequences(trigram_texts)

In [None]:
# Pad every sequence on the right up to the length of the longest one.
max_length = max(map(len, train_seq))
train_vector = tf.keras.preprocessing.sequence.pad_sequences(
    train_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [None]:
opt = keras.optimizers.Adam()


def build_model(vocab_size, max_length, num_classes=13):
    """Build the (uncompiled) stress-position classifier.

    Architecture: token embedding -> two stacked bidirectional LSTMs ->
    dense ReLU layer with dropout -> softmax over stress positions.

    Args:
        vocab_size: Number of rows in the embedding table.
        max_length: Length of the padded input sequences.
        num_classes: Width of the softmax output layer. Defaults to 13, the
            value that used to be hard-coded.

    Returns:
        An uncompiled ``tf.keras.models.Sequential`` model.
    """
    return tf.keras.models.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length),
        # First recurrent layer returns the full sequence so the second
        # LSTM can consume it.
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])


# Size the output layer from the one-hot labels so it always matches the
# second dimension of y_train; a fixed 13 only works when the largest label
# in the data happens to be 12.
model = build_model(vocab_size, max_length, num_classes=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

In [None]:
# Train on the full padded dataset for 20 epochs (no validation split).
model.fit(x=train_vector, y=y_train, epochs=20)

In [None]:
# Write the trained model to the working directory in the .keras format.
model_path = 'stress_model_all_data_20ep.keras'
model.save(model_path)