In [184]:
import string
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from itertools import chain

In [275]:
def isplnoun(*line):
    """Report whether any field of a row is the tag ``N 3pl``.

    Every field is converted with ``str()`` and tested against the anchored
    pattern ``N 3pl$``, so non-string cells (for example NaN) are tolerated.
    An empty field simply does not match; indexing its first character, as an
    uppercase pre-check would, raises IndexError on ``''``.

    Args:
        *line: The fields of one row, in any order.

    Returns:
        True if at least one field matches, otherwise False (including when
        no fields are given).
    """
    # The pattern itself starts with a capital 'N', so no separate
    # "starts with an uppercase letter" test is required.
    return any(re.match('N 3pl$', str(entry)) is not None for entry in line)


In [276]:
# Read the tab-separated morphology table. The seven column names are labels
# assigned here. Judging by those names, 'pl' is the inflected form, 'sg' the
# base form and 'pos1'..'pos4' the tag fields -- TODO confirm against the
# file format.
raw = pd.read_csv('leefloag/morph_english.flat', sep='\t', lineterminator='\n',
                  names=['pl', 'blk', 'sg', 'pos1', 'pos2', 'pos3', 'pos4'])
raw = raw.values
# Drop rows whose first field starts with ';' (presumably comment lines).
# str() is required because read_csv turns empty fields, and tokens such as
# "null", "nan" or "NA", into float NaN, which has no .startswith method.
clean = raw[[not str(x).startswith(';') for x in raw[:,0]], :]
# Remove column 1 ('blk').
clean = np.delete(clean, 1, axis=1)

# Keep only the rows that isplnoun accepts.
mask = [isplnoun(*x) for x in clean]
clean = clean[mask,:]
# Keep the first two remaining columns ('pl', 'sg') and discard any pair with
# a missing value.
clean = clean[:, :2]
clean = clean[~pd.isnull(clean).any(axis=1)]
# Normalise both words: trim, lower-case, then delete every character
# outside a-z.
clean = np.array([[re.sub('[^a-z]', '', x.strip().lower()), re.sub('[^a-z]','', y.strip().lower())] for x, y in clean])

In [318]:
# Every distinct character that occurs in either column of `clean`.
charset = set(chain.from_iterable(clean.flatten()))
# Assign codes in sorted character order. Enumerating the bare set would
# follow its iteration order, which for str elements changes from one
# interpreter session to the next (hash randomization), so the same character
# would get a different code after a kernel restart.
mapping = {c: i for i, c in enumerate(sorted(charset))}
# '?' is the padding symbol used by map_encode; it takes the next free code.
mapping['?'] = len(mapping)
# Reverse table (code -> character) for decoding.
invmap = {i: c for c, i in mapping.items()}
# Length of every word in both columns; the longest one fixes the encoded width.
lens = np.vectorize(len)(clean.transpose().flatten())
maxlen = max(lens)


def map_encode(*words, code=mapping, l=maxlen):
    """Encode words as fixed-width rows of integer character codes.

    Each word is split into characters, right-padded with '?' up to ``l``
    symbols, and every symbol is looked up in ``code``.

    Args:
        *words: Strings to encode.
        code: Symbol -> integer table. It must contain every character of
            ``words``, and '?' whenever padding is needed. Defaults to
            ``mapping`` as bound when this function was defined.
        l: Number of symbols per encoded row. Defaults to ``maxlen`` as bound
            when this function was defined.

    Returns:
        ``np.ndarray`` holding one row of ``l`` codes per word.

    Raises:
        ValueError: If a word has more than ``l`` characters. Without this
            check the pad count is negative, nothing is appended, and the row
            comes out wider than ``l``.
        KeyError: If a symbol is missing from ``code``.
    """
    rows = []
    for word in words:
        symbols = list(word)
        if len(symbols) > l:
            raise ValueError(
                f'cannot encode {word!r}: {len(symbols)} characters exceeds the width {l}')
        symbols.extend(['?'] * (l - len(symbols)))
        rows.append([code[s] for s in symbols])
    return np.array(rows)
    
def map_decode(*ctext, code=invmap):
    """Turn sequences of integer codes back into strings.

    Each sequence is translated symbol by symbol through ``code``, the symbols
    are concatenated, and any '?' characters at either end of the result are
    removed.

    Args:
        *ctext: Iterables of codes, one per word.
        code: Integer -> symbol table. Defaults to ``invmap`` as bound when
            this function was defined.

    Returns:
        ``np.ndarray`` containing one decoded string per input sequence.
    """
    return np.array([
        ''.join(code[idx] for idx in seq).strip('?')
        for seq in ctext
    ])

In [395]:
# One-hot encode the word pairs: column 1 of `clean` is the model input and
# column 0 is the target. num_classes is pinned to the vocabulary size because
# to_categorical otherwise infers the width as (largest code present + 1),
# independently for each call, so the width would depend on which codes
# happen to occur in the data.
invecs = tf.keras.utils.to_categorical(map_encode(*clean.transpose()[1]), num_classes=len(mapping))
outvecs = tf.keras.utils.to_categorical(map_encode(*clean.transpose()[0]), num_classes=len(mapping))


# Hold out 20% of the pairs; the fixed random_state makes the split repeatable.
in_train, in_test, out_train, out_test = train_test_split(invecs, outvecs, test_size=.20, random_state=420)

# Model

In [493]:
# Architecture: the one-hot word matrix is flattened, fed through a single
# shared 128-unit ReLU layer, and then fanned out to one softmax head per
# character position (invecs.shape[1] heads, each invecs.shape[2] wide).
minput = tf.keras.layers.Input(shape=invecs.shape[1:3], name='main_input')
f = tf.keras.layers.Flatten()(minput)
h = tf.keras.layers.Dense(128, activation='relu')(f)

out_layers = [
    tf.keras.layers.Dense(invecs.shape[2], activation='softmax')(h)
    for _ in range(invecs.shape[1])
]

slayermodel = tf.keras.models.Model(inputs=[minput], outputs=out_layers)

# The same categorical cross-entropy loss is applied to every head.
slayermodel.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [494]:
# Build one target array per character position by slicing out_train along
# axis 1. The number of positions is read from out_train itself rather than
# written as a literal, so it follows the encoded word width if the data
# (and therefore the longest word) changes.
better_outs = [out_train[:,x,:] for x in range(out_train.shape[1])]
slayermodel.fit(in_train, better_outs, epochs=10)



Train on 37634 samples
Epoch 1/10
 4352/37634 [==>...........................] - ETA: 43s - loss: 34.2649 - dense_644_loss: 2.7699 - dense_645_loss: 2.4326 - dense_646_loss: 2.7378 - dense_647_loss: 2.8094 - dense_648_loss: 2.7428 - dense_649_loss: 2.7043 - dense_650_loss: 2.5331 - dense_651_loss: 2.4131 - dense_652_loss: 2.1376 - dense_653_loss: 1.8296 - dense_654_loss: 1.4460 - dense_655_loss: 1.0905 - dense_656_loss: 0.8920 - dense_657_loss: 0.6793 - dense_658_loss: 0.4700 - dense_659_loss: 0.4359 - dense_660_loss: 0.4366 - dense_661_loss: 0.3006 - dense_662_loss: 0.2905 - dense_663_loss: 0.2207 - dense_664_loss: 0.3137 - dense_665_loss: 0.2358 - dense_666_loss: 0.2633 - dense_667_loss: 0.1884 - dense_668_loss: 0.1874 - dense_669_loss: 0.3003 - dense_670_loss: 0.2789 - dense_671_loss: 0.2410 - dense_672_loss: 0.2550 - dense_673_loss: 0.2275 - dense_674_loss: 0.1775 - dense_675_loss: 0.2238     

KeyboardInterrupt: 

In [492]:
tests = ['coen', 'jessica', 'mouse', 'smeej', 'door', 'deer', 'moose', 'jeff', 'gerpgork']
def make_answer_set(ins, model):
    """Print a two-column '|Input|Output|' table of model predictions.

    Each word is encoded with ``map_encode``, one-hot expanded, passed to
    ``model.predict`` on its own, and the highest-scoring code at every
    position is decoded with ``map_decode``.

    Args:
        ins: Iterable of words to run through the model. It is copied to a
            list first because it is walked twice (predict, then print).
        model: Object with a ``predict`` method. The argmax over axis 2
            assumes ``predict`` returns one (1, n_symbols) array per character
            position -- TODO confirm for models other than the multi-head one
            built in this notebook.

    Returns:
        None. The table is written to standard output.
    """
    words = list(ins)
    outs = []
    for word in words:
        # Pin the one-hot width to the vocabulary size. Left to infer it,
        # to_categorical uses (largest code in this word + 1), which is too
        # narrow for a word that needs no padding.
        encoded = tf.keras.utils.to_categorical(map_encode(word), num_classes=len(mapping))
        y_test = model.predict(encoded)
        outs.append(map_decode(np.argmax(y_test, axis=2).flatten())[0])
    print('|Input|Output|')
    for i, o in zip(words, outs):
        print(f'|{i}|{o}|')

make_answer_set(tests, slayermodel)

|Input|Output|
|coen|coens|
|jessica|jessicas|
|mouse|mouses|
|smeej|smeess|
|door|doors|
|deer|deers|
|moose|mooses|
|jeff|jeffs|
|gerpgork|gerpsorks|
