In [52]:
import datetime
import os
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as pltk
import seaborn as sns; sns.set(); sns.set_style('dark')

from sklearn.model_selection import train_test_split
import tensorflow as tf

In [53]:
from random import choice

# Successor table used by micro_reber_gram_gen: each key is the symbol that was
# just emitted, each value lists the symbols that may be emitted next.
# Upper- and lower-case forms of one letter are separate keys because the same
# printed letter can have different successors (e.g. 't' vs 'T', 'x' vs 'X');
# the generator upper-cases the finished string, so the case never shows up in
# its output. "E" has no entry of its own: it ends the walk.
grammar = {
    "t": ["S", "X"],
    "S": ["S", "X"],
    "T": ["T", "V"],
    "x": ["T", "V"],
    "V": ["p", "v"],
    "p": ["s", "x"],
    "P": ["T", "V"],
    "X": ["x", "s"],
    # After 's' or 'v' the only possible continuation is the terminator.
    "s": ["E"],
    "v": ["E"],
}
def micro_reber_gram_gen():
    """Generate one string by a random walk over the `grammar` table.

    The walk starts with "B", then picks 't' or 'P' at random, and keeps
    picking a random successor of the latest symbol from `grammar` until the
    symbol 'E' is produced.

    Returns:
        The emitted symbols joined into one string and converted to upper case.
    """
    current = choice(["t", "P"])
    symbols = ["B", current]

    # 'E' is the only symbol without successors in `grammar`, so it ends the walk.
    while current != 'E':
        current = choice(grammar[current])
        symbols.append(current)

    return ''.join(symbols).upper()

def reber_gram_generator():
    """Wrap a string from micro_reber_gram_gen() in an outer B ... E frame.

    A single letter, "T" or "P", is picked at random and placed both right
    after the leading "B" and right before the trailing "E", so the second
    and the second-to-last characters of the result always match.
    """
    # The wrapper letter is drawn before the inner string is generated.
    wrapper = choice(["T", "P"])
    inner = micro_reber_gram_gen()
    return "B" + wrapper + inner + wrapper + "E"


# Alphabet of the generated strings. generate_corrupted_string draws its
# replacement character from it, and string_to_ids uses a character's position
# in this string as that character's integer id.
POSSIBLE_CHARS = "BEPSTVX"
# NOTE(review): not referenced by any of the visible cells -- confirm whether
# it is still needed.
MAX_LENGTH = 50
def generate_corrupted_string():
    """Return a string from reber_gram_generator() with one character replaced.

    A position is drawn with np.random.randint, and the character at that
    position is swapped for a different character of POSSIBLE_CHARS drawn with
    np.random.choice. The result has the same length as the generated string.
    """
    original = reber_gram_generator()
    position = np.random.randint(len(original))
    replaced = original[position]
    # Exclude the current character so the result always differs from the original.
    candidates = sorted(set(POSSIBLE_CHARS) - {replaced})
    substitute = np.random.choice(candidates)
    prefix, suffix = original[:position], original[position + 1:]
    return prefix + substitute + suffix

In [70]:
# Display one freshly generated string next to one corrupted string.
reber_gram_generator(), generate_corrupted_string()

('BPBTSXXVPSEPE', 'BTBTXSEBE')

In [71]:
def string_to_ids(s, chars=POSSIBLE_CHARS):
    """Convert `s` into a list holding, for each character, its index in `chars`.

    Raises:
        ValueError: if `s` contains a character that is not in `chars`
            (raised by `chars.index`).
    """
    ids = []
    for symbol in s:
        ids.append(chars.index(symbol))
    return ids

def generate_dataset(size):
    """Build a labelled dataset of generated and corrupted strings.

    The first `size // 2` samples come from reber_gram_generator() and the
    remaining `size - size // 2` from generate_corrupted_string(); every string
    is converted to integer ids with string_to_ids().

    Returns:
        (X, y): X is a tf.RaggedTensor with one variable-length row of ids per
        string; y is a NumPy array with one single-element row per string,
        [1.] for the generated strings followed by [0.] for the corrupted ones.
        The samples are not shuffled.
    """
    n_good = size // 2
    n_bad = size - n_good

    # Generated strings are produced first, then the corrupted ones.
    good_ids = [string_to_ids(reber_gram_generator()) for _ in range(n_good)]
    bad_ids = [string_to_ids(generate_corrupted_string()) for _ in range(n_bad)]

    X = tf.ragged.constant(good_ids + bad_ids, ragged_rank=1)
    labels = [[1.]] * len(good_ids) + [[0.]] * len(bad_ids)
    y = np.array(labels)
    return X, y


In [72]:
# Seed every random source the notebook relies on so that "Restart & Run All"
# reproduces the same datasets and the same initial model weights:
#   - `random`    drives the grammar walk (random.choice),
#   - `np.random` drives the corruption position and replacement character,
#   - TensorFlow  drives weight initialisation and training-time randomness.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 10000 training samples and 2000 validation samples, each set half generated
# strings and half corrupted strings (see generate_dataset).
X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)
X_train

<tf.RaggedTensor [[0, 4, 0, 4, 3, 6, 6, 4, 4, 4, 4, 4, 5, 2, 6, 4, 5, 2, 6, 4, 5, 2, 6, 5, 2,
  3, 1, 4, 1]                                                               ,
 [0, 4, 0, 2, 4, 5, 5, 1, 4, 1], [0, 2, 0, 2, 5, 5, 1, 2, 1], ...,
 [0, 2, 0, 4, 6, 3, 1, 2, 4],
 [0, 4, 0, 2, 4, 1, 4, 5, 2, 6, 5, 2, 6, 4, 5, 5, 1, 4, 1],
 [0, 2, 1, 4, 3, 3, 3, 3, 3, 3, 3, 6, 3, 1, 2, 1]]>

In [73]:
X_train[0]

<tf.Tensor: shape=(29,), dtype=int32, numpy=
array([0, 4, 0, 4, 3, 6, 6, 4, 4, 4, 4, 4, 5, 2, 6, 4, 5, 2, 6, 4, 5, 2,
       6, 5, 2, 3, 1, 4, 1])>

In [74]:
# Binary classifier over ragged sequences of character ids:
# ids -> 5-dimensional embeddings -> GRU with 30 units -> one sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=[None], dtype=tf.int32,
                                     ragged=True))
# One embedding row per character of the alphabet.
model.add(tf.keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS),
                                    output_dim=5))
model.add(tf.keras.layers.GRU(30))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# SGD with Nesterov momentum; the sigmoid output is paired with binary cross-entropy.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.02,
    momentum=0.95,
    nesterov=True,
)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)


In [75]:
# Train for 20 epochs, evaluating on the validation set after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_valid, y_valid),
)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [149]:
# Score one corrupted string and one generated string with the trained model.
test_strings = [generate_corrupted_string(), reber_gram_generator()]
test_ids = [string_to_ids(s) for s in test_strings]
X_test = tf.ragged.constant(test_ids, ragged_rank=1)

y_proba = model.predict(X_test)
print()
print("Estimated probability that these are Reber strings:")
# Each prediction row holds the single sigmoid output for the matching string.
for string, proba in zip(test_strings, y_proba):
    print("{}: {:.2f}%".format(string, 100 * proba[0]))




Estimated probability that these are Reber strings:
BPBPTVVEXE: 0.00%
BPBTSSSSSSSSXXVVEPE: 99.97%
