In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
import tensorflowjs as tfjs

import wandb
from wandb.keras import WandbCallback

import dataset
import schedulers

# Stop the notebook right away when TensorFlow cannot see any GPU device
# (list_physical_devices returns an empty, falsy list in that case).
assert tf.config.list_physical_devices('GPU'), 'TensorFlow did not detect any GPU device'

In [3]:

# Masked versions of accuracy and sparse categorical cross-entropy: positions whose label is 0 are ignored.
def accuracy(real, pred):
    """Sparse categorical accuracy averaged over the non-masked positions.

    Positions whose label in ``real`` equals 0 are masked out (presumably the
    padding / "no label" class - confirm against the dataset encoding) and do
    not contribute to either the numerator or the denominator.

    Args:
        real: integer class labels.
        pred: per-class scores for every position of ``real``.

    Returns:
        A scalar tensor: the number of correct non-masked positions divided by
        the number of non-masked positions, or 0 when every position is masked
        (a plain division would yield NaN there).
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(real, pred)

    # 1.0 where the label is non-zero, 0.0 where it is masked.
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=acc.dtype)

    # divide_no_nan returns 0 instead of NaN when the mask is entirely zero.
    return tf.math.divide_no_nan(tf.reduce_sum(acc * mask), tf.reduce_sum(mask))

def sparse_categorical_crossentropy(y_true, y_pred, sample_weight=None):
    """Sparse categorical cross-entropy averaged over the non-masked positions.

    Positions whose label in ``y_true`` equals 0 are masked out and contribute
    nothing to the loss.

    Args:
        y_true: integer class labels.
        y_pred: predicted class probabilities for every position of ``y_true``.
        sample_weight: accepted for signature compatibility only; it is ignored.

    Returns:
        A scalar tensor: the summed loss of the non-masked positions divided by
        their count, or 0 when every position is masked (a plain division would
        yield a NaN loss there).
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    # 1.0 where the label is non-zero, 0.0 where it is masked.
    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=loss.dtype)

    # divide_no_nan returns 0 instead of NaN when the mask is entirely zero.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

def get_xy(d):
    """Turn a dataset object into an ``(inputs, targets)`` pair.

    Returns None when ``d`` is None. Otherwise returns a tuple of
    ``d.normalized`` and a dict that maps the output names 'N', 'D' and 'S' to
    ``d.niqqud``, ``d.dagesh`` and ``d.sin`` respectively.
    """
    if d is not None:
        inputs = d.normalized
        targets = dict(N=d.niqqud, D=d.dagesh, S=d.sin)
        return inputs, targets
    return None


In [4]:
# Corpora are kept in a dict keyed by corpus kind; start it with the 'modern' one.
corpus = {'modern': dataset.read_corpora(['hebrew_diacritized/modern'])}

In [5]:
# The 'mix' corpus is read in one call from the three directories listed here.
corpus['mix'] = dataset.read_corpora(
    [
        'hebrew_diacritized_private/poetry',
        'hebrew_diacritized_private/rabanit',
        'hebrew_diacritized_private/pre_modern',
    ]
)

In [8]:
# Sizes of the lookup tables exposed by the `dataset` module; they set the
# embedding vocabulary and the width of each output head in build_model.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = map(
    len,
    (dataset.letters_table, dataset.niqqud_table, dataset.dagesh_table, dataset.sin_table),
)
# Size of the last input axis (presumably characters per word - confirm with dataset.load_data).
WORDSIZE = 9

def build_model(units):
    """Build the two-level (character + word) tagging model.

    The input has shape ``(batch, words, WORDSIZE)``. Every entry is embedded
    into ``units // WORDSIZE`` dimensions, then two branches are computed:

    * a character branch: a GRU run independently over the WORDSIZE axis of
      each word (via TimeDistributed);
    * a word branch: the embeddings of a word are flattened to ``units``
      values, passed through Dense and a sum-merged bidirectional LSTM over the
      word axis, then reshaped back to ``(words, WORDSIZE, units // WORDSIZE)``.

    The branches are added, projected with Dense(units) and fed to three
    softmax heads.

    Args:
        units: width of the word-level layers. Must be a positive multiple of
            WORDSIZE, otherwise the flatten/unflatten reshapes would regroup
            values across word boundaries (or fail at run time).

    Returns:
        A ``keras.Model`` with outputs named 'N', 'D' and 'S' of sizes
        NIQQUD_SIZE, DAGESH_SIZE and SIN_SIZE.

    Raises:
        ValueError: if ``units`` is not a positive multiple of WORDSIZE.
    """
    if units <= 0 or units % WORDSIZE != 0:
        raise ValueError(f'units must be a positive multiple of WORDSIZE ({WORDSIZE}), got {units}')
    char_units = units // WORDSIZE

    inp = keras.Input(shape=(None, WORDSIZE), batch_size=None)
    print(f'{inp.shape=}')
    embed = layers.Embedding(LETTERS_SIZE, char_units)(inp)
    print(f'{embed.shape=}')

    # Character branch: one GRU pass per word, over that word's WORDSIZE positions.
    char_layer = layers.TimeDistributed(layers.GRU(char_units, return_sequences=True))(embed)
    print(f'{char_layer.shape=}')

    # Word branch: flatten each word to `units` values, then model the word sequence.
    word_layer = layers.Dense(units)(layers.Reshape((-1, units))(embed))
    print(f'{word_layer.shape=}')
    word_layer = layers.Bidirectional(layers.LSTM(units, return_sequences=True), merge_mode='sum')(word_layer)
    print(f'{word_layer.shape=}')
    # Unflatten so the word branch lines up element-wise with the character branch.
    word_layer = layers.Reshape((-1, WORDSIZE, char_units))(word_layer)
    print(f'{word_layer.shape=}')

    combined = layers.Dense(units)(word_layer + char_layer)

    outputs = [
        layers.Softmax(name='N')(layers.Dense(NIQQUD_SIZE)(combined)),
        layers.Softmax(name='D')(layers.Dense(DAGESH_SIZE)(combined)),
        layers.Softmax(name='S')(layers.Dense(SIN_SIZE)(combined)),
    ]
    return keras.Model(inputs=inp, outputs=outputs)

# Length of the second input axis used for the dummy batch and passed as `maxlen` to dataset.load_data.
MAXLEN = 12

In [9]:
# Smoke test: build a small model and evaluate it on an all-zero dummy batch
# to check that inputs and the three targets line up, then print the summary.
N = 2
model = build_model(270)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
x = np.zeros((N, MAXLEN, WORDSIZE))
# One all-zero target array per output head, each shaped like the input.
y = [np.zeros_like(x) for _ in range(3)]
model.evaluate(x, y)
model.summary()

inp.shape=TensorShape([None, None, 9])
embed.shape=TensorShape([None, None, 9, 30])
char_layer.shape=TensorShape([None, None, 9, 30])
word_layer.shape=TensorShape([None, None, 270])
word_layer.shape=TensorShape([None, None, 270])
word_layer.shape=TensorShape([None, None, 9, 30])
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None, 9)]    0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 9, 30)  1290        input_2[0][0]                    
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, None, 270)    0           embedding_1[0][0]                
__________

In [12]:
MAXLEN = 12
# Seed NumPy before loading so repeated runs of this cell start from the same RNG state.
np.random.seed(2)

# Each entry is what dataset.load_data returns for that corpus ('mix' is loaded first, then 'modern').
data = {
    kind: dataset.load_data(corpus[kind], validation_rate=rate, maxlen=MAXLEN, wordsize=WORDSIZE)
    for kind, rate in (('mix', 0.1), ('modern', 0.2))
}


In [19]:
%env WANDB_MODE run

def experiment(lr):
    """Build, compile and train a fresh model with Adam at learning rate ``lr``, logging to W&B.

    For every ``(kind, clr, save)`` entry of ``config['order']`` the model is
    fitted for 2 epochs on ``data[kind]``, then saved as ``<save>.h5`` in the
    W&B run directory and as weights under ``./checkpoints/<save>``.

    Args:
        lr: learning rate handed to the Adam optimizer; also part of the run name.

    Returns:
        The trained model. The ``history`` object of the last fit() call is
        local to this function and is not returned.
    """
    BATCH_SIZE = 128
    UNITS = 297 * 2
    np.random.seed(2)
    model = build_model(units=UNITS)
    model.compile(loss=sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  metrics=accuracy)

    # Keep a snapshot of the weights before any training happens.
    model.save_weights('./checkpoints/uninit')

    # Logged to W&B as the run configuration. Each 'order' entry is
    # (key into `data`, arguments for CircularLearningRate, checkpoint name).
    config = {
        'batch_size': BATCH_SIZE,
        'maxlen': MAXLEN,
        'wordsize': WORDSIZE,
        'units': UNITS,
        'model': model,
        'lr': lr,
        'order': [
             ('mix',    (1e-3, 4e-3, 3e-3), 'mix'),
#            ('modern', (1e-3, 1e-3, 1e-5), 'modern'),
#             ('modern', (40e-4, 40e-4, 1e-5), 'modern_over1'),
#             ('modern', (40e-4, 40e-4, 1e-5), 'modern_over2'),
        ],
    }
#     clr = ','.join(str(x) for x in config["order"][0][1])
    run = wandb.init(project="dotter",
                     group="brand_arch",
                     name=f'reshapes_{UNITS}_{WORDSIZE}_{lr}',
                     tags=['brand_arch', 'ordered'],
                     config=config)

    with run:
        for kind, clr, save in config['order']:
            train, validation = data[kind]

            training_data = (x, y) = get_xy(train)
            validation_data = get_xy(validation)

            wandb_callback = WandbCallback(log_batch_frequency=20,  # int(len(train.normalized) / BATCH_SIZE / 100),
                                           training_data=training_data,
                                           validation_data=validation_data,
                                           log_weights=False)

            # NOTE(review): the scheduler is configured here but never added to
            # `callbacks`, so fit() below does not use it - confirm this is intended.
            scheduler = schedulers.CircularLearningRate(*clr)
            scheduler.set_dataset(train, BATCH_SIZE)
            callbacks = [wandb_callback]
            history = model.fit(x, y, validation_data=validation_data,
                                batch_size=BATCH_SIZE, epochs=2, verbose=1, callbacks=callbacks)

            # Full model goes into the W&B run directory, weights go to the local checkpoint folder.
            model.save(os.path.join(wandb.run.dir, save + ".h5"))
            model.save_weights('./checkpoints/' + save)
    return model

# Run one experiment per learning rate; the notebook-level `model` ends up bound to the last trained one.
for lr in [3e-3]:
    model = experiment(lr)

env: WANDB_MODE=run
inp.shape=TensorShape([None, None, 9])
embed.shape=TensorShape([None, None, 9, 66])
char_layer.shape=TensorShape([None, None, 9, 66])
word_layer.shape=TensorShape([None, None, 594])
word_layer.shape=TensorShape([None, None, 594])
word_layer.shape=TensorShape([None, None, 9, 66])


Failed to connect to W&B servers after 10 seconds.                    Letting user process proceed while attempting to reconnect.


Epoch 1/2
Epoch 2/2


In [None]:
# NOTE(review): experiment() as written in this notebook only writes the 'uninit' and 'mix'
# checkpoints (the 'modern*' entries of its order list are commented out) - confirm that
# './checkpoints/modern_over' exists and matches the current architecture.
model.load_weights('./checkpoints/modern_over')

# Re-compile with no loss/metrics (presumably so the saved model carries no reference to the
# custom loss and metric functions - confirm), then export as HDF5 and in TensorFlow.js format
# into the current directory.
model.compile()
model.save('modern.h5')
tfjs.converters.save_keras_model(model, '.')

In [None]:
import matplotlib.pyplot as plt

# `history` only exists as a local inside experiment(), so it is undefined at notebook scope.
# Keras attaches the History callback of the most recent fit() call to the model itself,
# so take it from there.
history = model.history

# Row 0: accuracy, row 1: loss; one column per output head (N, D, S).
fig, ax = plt.subplots(nrows=2, ncols=3)

for n, v in enumerate(['accuracy', 'loss']):
    for n1, t in enumerate(['N', 'D', 'S']):
        p = ax[n][n1]
        p.plot(history.history[t + '_' + v])
        p.plot(history.history['val_' + t + '_' + v])
        p.set_title(t + '_' + v)
        p.legend([t + '_Train', t + '_Test'], loc='center right')

plt.tight_layout()

In [None]:
# Seed NumPy, then load the test corpus; the second value returned by load_data is discarded.
# NOTE(review): the positional 0 and MAXLEN are presumably validation_rate and maxlen, and unlike
# the training-data calls no `wordsize` is passed here - confirm load_data's default matches WORDSIZE.
np.random.seed(3)
test, _ = dataset.load_data(dataset.read_corpora(['test/modernTestCorpus/']), 0, MAXLEN)

In [None]:
# NOTE(review): build_model as defined in this notebook accepts only `units`, so the `dropout`
# argument raises TypeError; 500 is also not a multiple of WORDSIZE (9). This cell looks like it
# belongs to an earlier architecture - confirm which model the 'modern_over2' checkpoint matches.
model = build_model(units=500, dropout=0.1)
model.load_weights('./checkpoints/modern_over2')
# Inputs and per-head targets of the test set, keyed by output layer name.
x = test.normalized
y = {'N': test.niqqud, 'D': test.dagesh, 'S': test.sin }

# No optimizer is given: the model is only evaluated here, with the masked loss and metric.
model.compile(loss=sparse_categorical_crossentropy,
              metrics={'N': accuracy, 'D': accuracy, 'S': accuracy})

_ = model.evaluate(x=x, y=y, batch_size=64)

In [14]:
# Restore the weights stored under the 'mix' checkpoint name into the notebook-level `model`.
model.load_weights('./checkpoints/mix')

def real_evaluation(data, s=slice(0, None), print_comparison=True):
    """Compare the notebook-level ``model``'s predictions with the reference labels.

    Predicts on ``data.normalized[s]``, decodes the three heads with
    ``dataset.from_categorical`` and rebuilds text with ``dataset.merge`` for
    both the predicted and the reference labels.

    Letter score: over positions whose reference label is > 0, the share where
    prediction and reference agree, pooled over the three label kinds.
    Word score: over words containing more than one distinct Hebrew letter,
    the share whose predicted text equals the reference text exactly.

    Args:
        data: dataset object with ``normalized``, ``niqqud``, ``dagesh``,
            ``sin`` and ``text``.
        s: slice selecting which samples to evaluate.
        print_comparison: when true, print every mismatching word pair and the
            per-sample texts and scores.

    Prints the overall letter and word scores; returns nothing.
    """
    batch = data.normalized[s]
    prediction = model.predict(batch)
    actual_niqqud, actual_dagesh, actual_sin = (
        dataset.from_categorical(prediction[k]) for k in range(3)
    )
    expected_niqqud, expected_dagesh, expected_sin = data.niqqud[s], data.dagesh[s], data.sin[s]
    actual = dataset.merge(data.text[s], batch, actual_niqqud, actual_dagesh, actual_sin)
    expected = dataset.merge(data.text[s], batch, expected_niqqud, expected_dagesh, expected_sin)

    # (reference, predicted) arrays for each label kind, in scoring order.
    label_pairs = (
        (expected_niqqud, actual_niqqud),
        (expected_dagesh, actual_dagesh),
        (expected_sin, actual_sin),
    )
    hebrew_letters = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

    total_letters = []
    total_words = []
    for i, (_, a, e) in enumerate(zip(batch, actual, expected)):
        letters = []
        for reference, predicted in label_pairs:
            # Score only the positions that carry a reference label.
            labelled = reference[i] > 0
            letters.extend(reference[i][labelled] == predicted[i][labelled])
        total_letters += letters

        words = []
        for aw, ew in zip(a, e):
            distinct_letters = sum(1 for ch in hebrew_letters if ch in aw)
            if distinct_letters <= 1:
                continue
            matches = aw == ew
            words.append(matches)
            if print_comparison and not matches:
                print(aw, ew)
        total_words += words

        if not print_comparison:
            continue
        print('מצוי: ', ' '.join(a))
        print('רצוי: ', ' '.join(e))
        print(f'{np.mean(letters):.2%} ({len(letters)-np.sum(letters)} out of {len(letters)})')
        print(f'{np.mean(words):.2%} ({len(words)-np.sum(words)} out of {len(words)})')
        print()
    print(f'letters: {np.mean(total_letters):.2%}, words: {np.mean(total_words):.2%}')

# Evaluate the first 10 samples of the 'mix' validation split (index 1 of the loaded pair), printing each comparison.
real_evaluation(data['mix'][1], s=slice(0, 10), print_comparison=True)  # letters: 95.23%, words: 78.60%

שֶׁיִּקַבֹּל שֶׁיְּקַבֵּל
מצוי:  עֶרֶב שַׁבָּת תְּפִלַּת מִנְחָה, כְּדֵי שֶׁיִּקַבֹּל עָלָיו הַשַּׁבָּת מִבְּעוֹד יוֹם. כְּשֶׁיַּגִּיעַ עֵת
רצוי:  עֶרֶב שַׁבָּת תְּפִלַּת מִנְחָה, כְּדֵי שֶׁיְּקַבֵּל עָלָיו הַשַּׁבָּת מִבְּעוֹד יוֹם. כְּשֶׁיַּגִּיעַ עֵת
97.70% (2 out of 87)
91.67% (1 out of 12)

בְּגֹּדַר בְּגֶדֶר
לִגְנּוֹתָיו, לְגִנּוֹתָיו,
וַיִהְיוּ וַיִּהְיוּ
הַקּוצִים הַקּוֹצִים
לַזָּהָב. לְזָהָב.
וְרַד וֶרֶד
מצוי:  בְּגֹּדַר -הַקּוֹצִים אֲשֶׁר לִגְנּוֹתָיו, וַיִהְיוּ הַקּוצִים לַזָּהָב. וְכָל וְרַד חַי, אֲשֶׁר נָגְעוּ
רצוי:  בְּגֶדֶר -הַקּוֹצִים אֲשֶׁר לְגִנּוֹתָיו, וַיִּהְיוּ הַקּוֹצִים לְזָהָב. וְכָל וֶרֶד חַי, אֲשֶׁר נָגְעוּ
87.78% (11 out of 90)
50.00% (6 out of 12)

-בֶּגֶד -בֶגֶד
וּמְכֻסֶּה וּמְכַסָּה
מַה מָה
מְרֻבֶּה מַרְבֶּה
כְּסוּת כְסוּת
מצוי:  לֹא -בֶּגֶד וּמְכֻסֶּה – מַה -רַעַשׁ? – אֵין רָע! מְרֻבֶּה כְּסוּת –
רצוי:  לֹא -בֶגֶד וּמְכַסָּה – מָה -רַעַשׁ? – אֵין רָע! מַרְבֶּה כְסוּת –
85.71% (7 out of 49)
44.44% (5 out of 9)

שַׁמַּשְׁנִים שֶׁמְּשֻׁנִּים
נִרָאוּ נִרְאוּ


In [None]:
import hebrew
import dataset

In [None]:
%env WANDB_MODE run
# Run configuration recorded alongside the logged numbers.
config = dict(
    batch_size=64,
    units=500,
    order=[
        ('mix',    [(30e-4, 80e-4, 1e-4)], 'mix'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern_over'),
    ],
)
run = wandb.init(
    project="dotter",
    name='maxlen_test',
    tags=['CLR', 'ordered'],
    config=config,
)

# Log hard-coded (maxlen, letter score, word score) rows to W&B, one log call per row.
with run:
    for maxlen, letters, words in (
        (75, 0.9511, 0.7778),
        (80, 0.9531, 0.7819),
        (85, 0.9535, 0.7819),
        (90, 0.9526, 0.7841),
        (95, 0.9514, 0.7795),
    ):
        run.log(dict(maxlen=maxlen, letters=letters, words=words))


In [None]:
# Show the shape of the normalized inputs of the 'modern' training split (index 0 of the loaded pair).
data['modern'][0].normalized.shape

In [None]:
import dataset

In [None]:
# Scratch cell: stack layers one at a time in a Sequential model and print the output shape
# after each step to see how the tensor rank and sizes evolve.
# NOTE(review): this rebinds the notebook-level `model`, and WORD_MAXLEN is not used below.
WORD_MAXLEN=5
EMBED=7
UNITS=11

model = tf.keras.Sequential()
model.add(keras.Input(shape=(None, WORDSIZE), batch_size=None))
print(model.output_shape)
model.add(layers.Embedding(LETTERS_SIZE, EMBED))
print(model.output_shape)
# return_sequences=False collapses each word's WORDSIZE axis into a single UNITS-sized vector.
model.add(layers.TimeDistributed(layers.LSTM(UNITS, return_sequences=False)))
print(model.output_shape)
# Recurrent pass over the sequence of per-word vectors.
model.add(layers.LSTM(UNITS, return_sequences=True))
print('before', model.output_shape)
# NOTE(review): this regroups the flat (steps, UNITS) output into blocks of WORDSIZE steps rather
# than repeating each word vector WORDSIZE times (cf. the commented-out RepeatVector) - confirm intent.
model.add(layers.Reshape((-1, WORDSIZE, UNITS)))
print('after', model.output_shape)
# model.add(layers.TimeDistributed(layers.RepeatVector(WORDSIZE)))
# print(model.output_shape)
model.add(layers.TimeDistributed(layers.LSTM(NIQQUD_SIZE, return_sequences=True)))
print(model.output_shape)

