Loading and splitting data

In [84]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Input, SimpleRNN, Dense
from keras.optimizers import Adam

train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('eval_data.csv')

# Vocabulary: column 0 is the '*' start-of-sequence marker, columns 1-26 are 'a'-'z'.
unique_letters = '*abcdefghijklmnopqrstuvwxyz'
char_to_int = {char: i for i, char in enumerate(unique_letters)}


def prepend_start_token(frame, token='*'):
    """Prefix every value in the second column of ``frame`` with ``token``.

    The second column holds the target words; the marker gives the decoder a
    fixed first input symbol. The column is rewritten with one vectorized
    string operation instead of a per-row ``iterrows()``/``.at`` loop, which is
    slow and writes into the frame while iterating over it.

    Args:
        frame: DataFrame with at least two columns; modified in place.
        token: String to put in front of each target value.

    Returns:
        The same ``frame`` object, for convenience.
    """
    target_column = frame.columns[1]
    # astype(str) mirrors the str(...) call the row-wise loop applied to each value.
    frame[target_column] = token + frame[target_column].astype(str)
    return frame


train_data = prepend_start_token(train_data)
test_data = prepend_start_token(test_data)

def convert_word_to_integers(word):
    """One-hot encode ``word`` character by character.

    Returns a float array of shape ``(len(word), len(unique_letters))``. Row
    ``k`` has a single 1 in the column ``char_to_int`` assigns to ``word[k]``;
    characters missing from ``char_to_int`` leave their row all zeros.
    """
    encoded = np.zeros((len(word), len(unique_letters)))
    for position, letter in enumerate(word):
        column = char_to_int.get(letter)
        if column is None:
            # Unknown symbol: keep the all-zero row.
            continue
        encoded[position, column] = 1

    return encoded

# One-hot encode every cell of both frames (source words and '*'-prefixed targets).
train_data = train_data.applymap(convert_word_to_integers)
test_data = test_data.applymap(convert_word_to_integers)

# From here on the two names refer to object arrays whose cells are 2-D matrices.
train_data, test_data = train_data.values, test_data.values

# Column 0 holds the encoder inputs, column 1 the targets; stack each column
# into one dense (samples, timesteps, vocabulary) tensor.
x_train = np.array(list(train_data[:, 0]))
y_train = np.array(list(train_data[:, 1]))
x_test = np.array(list(test_data[:, 0]))
y_test = np.array(list(test_data[:, 1]))

# Teacher forcing: the decoder input keeps the leading '*' and drops the last
# step, while the target drops the '*' so it is the input shifted by one step.
y_train_dup, y_train = y_train[:, :-1, :], y_train[:, 1:, :]
y_test_dup, y_test = y_test[:, :-1, :], y_test[:, 1:, :]

for tensor in (x_train, y_train, x_test, y_test, y_train_dup, y_test_dup):
    print(tensor.shape)

(7000, 8, 27)
(7000, 8, 27)
(2000, 8, 27)
(2000, 8, 27)
(7000, 8, 27)
(2000, 8, 27)


RNN

In [94]:
# Encoder: a 128-unit SimpleRNN over the one-hot input word. return_state=True
# makes the layer hand back its final hidden state next to its output.
enc = SimpleRNN(128, return_state=True)
# Encoder input: 8 timesteps of 27-wide one-hot rows (matches x_train's shape).
enc_inputs = Input(shape=(8, 27))
# Only enc_state is used below, to seed the decoder.
enc_outputs, enc_state = enc(enc_inputs)

# Decoder: emits an output at every timestep (return_sequences=True) and also
# exposes its state so it can later be driven one step at a time.
dec = SimpleRNN(128, return_sequences=True, return_state=True)
# The time dimension is left as None so the same layer accepts the full
# teacher-forced sequence during training and single-step inputs later on.
dec_inputs = Input(shape=(None, 27))
# The decoder starts from the encoder's final state; its own state is discarded here.
dec_outputs, _ = dec(dec_inputs, initial_state=enc_state)
# Softmax over the 27 vocabulary symbols.
dec_dense = Dense(27, activation='softmax')
dec_outputs = dec_dense(dec_outputs)  # applied to the decoder output of every timestep

Train model

In [95]:
# Assemble the end-to-end training model: (source word, shifted target) -> target.
model = Model(inputs=[enc_inputs, dec_inputs], outputs=dec_outputs)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with teacher forcing, using the eval split for per-epoch validation.
# fit() stays the last expression so the cell displays the History object.
training_inputs = [x_train, y_train_dup]
validation_set = ([x_test, y_test_dup], y_test)
model.fit(
    training_inputs,
    y_train,
    epochs=200,
    batch_size=10,
    validation_data=validation_set,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7f149c706a70>

Start token function

In [102]:
def start_token():
    """Return the one-hot start symbol as a ``(1, 1, len(unique_letters))`` array.

    Column 0 is set to 1; that is the position of '*' in ``unique_letters``.
    """
    vocabulary_size = len(unique_letters)
    token = np.zeros((1, 1, vocabulary_size))
    token[0, 0, 0] = 1
    return token

Encoder Class

In [151]:
# encoder class
class Encoder:
    """Feeds a raw word to the trained encoder model.

    The word is one-hot encoded with the same vocabulary layout used for the
    training data ('*' in column 0, 'a'-'z' in columns 1-26) before it is
    passed to ``model.predict``.
    """

    # Same symbols, in the same order, as the one-hot columns used for training.
    VOCABULARY = '*abcdefghijklmnopqrstuvwxyz'
    _COLUMN_OF = {char: column for column, char in enumerate(VOCABULARY)}

    def __init__(self, model):
        self.model = model

    def predict(self, inputs):
        """Return ``self.model.predict`` for the one-hot encoding of ``inputs``.

        Args:
            inputs: The word to encode, as a string.

        Returns:
            Whatever ``self.model.predict`` returns for a batch of shape
            ``(1, len(inputs), 27)``.

        Characters outside the vocabulary become all-zero rows, exactly as in
        the training-time encoding. The previous ``ord(x) - ord('a') + 1``
        arithmetic produced negative indices for such characters, which either
        wrapped around to a wrong letter (e.g. 'Z') or raised IndexError
        (e.g. '*').
        """
        one_hot = np.zeros((1, len(inputs), len(self.VOCABULARY)))
        for position, char in enumerate(inputs):
            column = self._COLUMN_OF.get(char)
            if column is not None:
                one_hot[0, position, column] = 1
        return self.model.predict(one_hot)

Decoder Class

In [156]:
class Decoder:
    """Greedy, one-symbol-at-a-time decoder around the inference decoder model."""

    def __init__(self, model):
        self.model = model

    def predict(self, states):
        """Decode an 8-character word starting from the state ``states``.

        Each step feeds the current one-hot symbol and state to the model,
        takes the most probable symbol, and feeds that symbol back in as the
        next input together with the state the model returned.
        """
        step_input = start_token()
        letters = []

        for _ in range(8):
            probabilities, states = self.model.predict([step_input, states])
            best = np.argmax(probabilities[0, 0, :])
            letters.append(unique_letters[best])

            # The chosen symbol, one-hot encoded, becomes the next decoder input.
            step_input = np.zeros((1, 1, len(unique_letters)))
            step_input[0, 0, best] = 1

        return ''.join(letters)

In [158]:
# Inference-time encoder: maps an input sequence straight to the encoder's final state.
enc_model = Model(enc_inputs, enc_state)

# Inference-time decoder: the initial state becomes an explicit model input
# (128 values, matching the SimpleRNN size) instead of being wired to the encoder.
dec_inputs_state = Input(shape=(128,))
# Wrapped in a list so it can be passed as initial_state and concatenated with
# [dec_inputs] when the model inputs are declared below.
dec_inputs_state = [dec_inputs_state]
# Calls the already-built `dec` and `dec_dense` layer objects again, so this
# graph uses the same layers as the training model. Note that `dec_outputs` is
# rebound here.
dec_outputs, dec_state = dec(dec_inputs, initial_state=dec_inputs_state)
decoder_output = dec_dense(dec_outputs)

# Inputs: [decoder input symbols, previous state]; outputs: [softmax output, new state].
dec_model = Model([dec_inputs] + dec_inputs_state, [decoder_output, dec_state])

Saving Model

In [161]:
# Persist both inference models, encoder first, under fixed directory names.
for inference_model, export_path in ((enc_model, 'encoder'), (dec_model, 'decoder')):
    inference_model.save(export_path)





INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: encoder/assets






INFO:tensorflow:Assets written to: decoder/assets


INFO:tensorflow:Assets written to: decoder/assets


Load Model

In [163]:
import keras

# Reload the two saved inference models, encoder first, from their export paths.
enc_model_saved, dec_model_saved = (
    keras.models.load_model(export_path) for export_path in ('encoder', 'decoder')
)









Checker script

In [55]:
def check(pred: str, true: str):
    """Score a prediction against the reference word.

    One point per position where ``pred`` and ``true`` agree (compared up to
    the shorter length), minus one point for every character by which ``pred``
    is longer than ``true``. The score never goes below zero.
    """
    matches = sum(1 for guessed, expected in zip(pred, true) if guessed == expected)

    # Over-long predictions are penalized one point per extra letter.
    overshoot = max(0, len(pred) - len(true))
    return max(0, matches - overshoot)

def evaluate(encoder, decoder):
    """Score ``encoder``/``decoder`` on ``eval_data.csv`` and print a summary.

    For every (input, target) row the prediction is
    ``decoder.predict(encoder.predict(input))``, scored with ``check``. Prints
    how many predictions reached each score from 0 to 8, then the points and
    marks derived from those counts, and writes every prediction, target and
    score to ``results_eval.csv``.
    """
    print("Obtaining metrics for eval data:")
    rows = pd.read_csv("eval_data.csv").to_numpy()

    predictions, targets, scores = [], [], []
    correct = [0] * 9  # correct[k] counts predictions with exactly k matching letters
    for x, y in rows:
        pred = decoder.predict(encoder.predict(x))
        score = check(pred, y)
        predictions.append(pred)
        targets.append(y)
        scores.append(score)
        correct[score] += 1

    print("Eval dataset results:")
    for num_chr, tally in enumerate(correct):
        print(f"Number of predictions with {num_chr} correct predictions: {tally}")

    # Scores of 4-5 earn half a point each; scores of 6 and above earn a full point.
    half_credit = sum(correct[4:6]) * 0.5
    full_credit = sum(correct[6:])
    points = half_credit + full_credit
    marks = round(min(2, points / 1400 * 2) * 2) / 2  # Rounds to the nearest 0.5
    print(f"Points: {points}")
    print(f"Marks: {marks}")

    # Save predictions and true words so they can be inspected manually if required.
    results = {"pred": predictions, "true": targets, "score": scores}
    pd.DataFrame.from_dict(results).to_csv("results_eval.csv", index=False)

Evaluate

In [164]:
# Wrap the reloaded models and run the checker on the eval split.
encoder, decoder = Encoder(enc_model_saved), Decoder(dec_model_saved)
evaluate(encoder, decoder)

Obtaining metrics for eval data:
Eval dataset results:
Number of predictions with 0 correct predictions: 14
Number of predictions with 1 correct predictions: 82
Number of predictions with 2 correct predictions: 221
Number of predictions with 3 correct predictions: 481
Number of predictions with 4 correct predictions: 543
Number of predictions with 5 correct predictions: 402
Number of predictions with 6 correct predictions: 200
Number of predictions with 7 correct predictions: 52
Number of predictions with 8 correct predictions: 5
Points: 729.5
Marks: 1.0
