### Hyperparameters

In [110]:
# Hyperparameters

# Length of the token window fed to the model; also passed as `pad` to util.load_data.
seqlen = 64
# batch_size is not set here; it is assigned in the cell that builds the training sequence.
#batch_size = 1024
# Number of index pairs per example in the model's `input_sample_indices` input.
# NOTE(review): recorded outputs further down show "sample5" and shape (?, 5, 2),
# so they appear to come from a run with sample_size = 5 — confirm.
sample_size = 10
# Passed to util.load_vocab and embedded in the checkpoint file name.
vocab_size = 10000
# Free-form suffix embedded in the checkpoint file name.
model_tag = "overfit"


### Code

In [33]:
import tensorflow.keras as K
import os.path

import sys, imp

sys.path.append("../src")
from artstat import util
import numpy as np


from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.layers import (Reshape, Embedding, CuDNNLSTM, BatchNormalization, Dense, 
                                     Concatenate, Lambda, Activation, Dropout)
import tensorflow as tf

In [34]:
# Reload artstat.util so edits to the module are picked up without restarting the kernel.
imp.reload(util)

<module 'artstat.util' from '/app/src/artstat/util.py'>

In [35]:
# Directories holding the training and test corpora (read later by util.load_data).
path_train = "/data/local/artstat/train"
path_test = "/data/local/artstat/test"

# Dimensionality of the GloVe vectors; selects the embeddings file below.
glove_dims = 300

# GloVe 6B text file matching `glove_dims`.
glove = "/data/shared/glove/glove.6B.%dd.txt" % glove_dims

#glove = "/home/pmilovanov/data/glove/glove.840B.300d.txt"
#glove = 
#glove = "/home/pmilovanov/data/glove/glove.6B.100d.txt"
#glove = "/home/pmilovanov/data/glove/glove.6B.300d.txt"


In [5]:
# Display the training-data directory.
path_train

'/data/local/artstat/train'

In [6]:
#vocab_file = "../vocab.txt"
# Vocabulary file; data is loaded with lowercase=True further down, presumably to match it — TODO confirm.
vocab_file = "../vocab_lower.txt"

# `words` is indexed by token id later on (words[idx]); `vocab` is handed to
# util.load_embeddings and util.load_data.
words, vocab = util.load_vocab(vocab_file, vocab_size)

In [7]:
# Build the embedding matrix for the vocabulary from the GloVe file.
emb_matrix = util.load_embeddings(vocab, glove_dims, glove)

# Sanity check: the recorded output shows one more matrix row than len(vocab).
print(len(vocab))
print(emb_matrix.shape)

10000
(10001, 300)


In [8]:
imp.reload(util)
# X holds token ids. Xu holds per-token auxiliary features: column 0 is reported as the
# "unknown word" flag in the next cell and column 1 drives capitalisation in the
# generation cell. Exact encoding is defined in util.load_data — TODO confirm.
X, Xu = util.load_data(path_train, vocab, pad=seqlen, lowercase=True)

100%|##########| 22860/22860 [00:43<00:00, 530.20it/s]


In [9]:
# Largest token id present in the training data.
print(np.max(X))
print("Total words:", len(X))
# Vectorised np.sum instead of the Python builtin `sum`, which would iterate over
# millions of elements one by one; this also matches the test-set statistic.
print("Unknown words: {:.2f}%".format(100 * np.sum(Xu[:,0]) / len(Xu)))

10000
Total words: 17676455
Unknown words: 7.16%


In [10]:
def sampling_layer(x, parallel_iterations=10):
    """Gather, example by example, the entries of `data` selected by its indices.

    Written to be wrapped in a Keras ``Lambda`` layer. make_model uses
    ``sampling_layer_gather_nd`` instead, so this is an alternative variant.

    Args:
        x: pair ``(data, sample_indices)``. Both tensors are unstacked along
            their first axis in lock-step by ``tf.map_fn``.
        parallel_iterations: forwarded to ``tf.map_fn``. This used to be
            computed from the notebook-global ``batch_size``, which is only
            assigned in a later cell, so calling the function on a fresh
            kernel raised NameError. The default of 10 is tf.map_fn's
            documented graph-mode default; pass ``2 * batch_size`` to get the
            previous setting.

    Returns:
        The stacked results of ``tf.gather(batch, indices)`` for each pair.
    """
    data, sample_indices = x

    def gather_per_batch(e):
        batch, indices = e
        indices = tf.cast(indices, tf.int32)
        # Without an explicit `dtype`, tf.map_fn expects the output structure to
        # mirror `elems`, hence the dummy second element.
        return (tf.gather(batch, indices), 0)

    return tf.map_fn(gather_per_batch, (data, sample_indices),
                     parallel_iterations=parallel_iterations, swap_memory=False)[0]


def sampling_layer_gather_nd(x):
    """Pick entries of the first tensor at the N-d coordinates given by the second.

    `x` is a pair ``(data, sample_indices)``; the indices are cast to int32
    and handed to ``tf.gather_nd``.
    """
    values, coords = x
    coords_int32 = tf.cast(coords, tf.int32)
    return tf.gather_nd(values, coords_int32)


In [11]:
def make_model(*, seqlen, sample_size, emb_matrix,
               lstm_sizes=(256, 256),
               dense_size=300, dense_layers=3,
               aux_dim=2, dropout_rate=0.1):
    """Build the training model: fixed embeddings -> LSTM stack -> dense stack -> sampled softmax.

    Args:
        seqlen: length of the token window fed to ``input_x`` / ``input_aux``.
        sample_size: number of index pairs per example in
            ``input_sample_indices``.
        emb_matrix: 2-D array of pretrained embeddings. Its shape gives the
            Embedding layer's (input_dim, output_dim) and it is loaded as
            non-trainable weights.
        lstm_sizes: units of each CuDNNLSTM layer, in order. Every layer but
            the last returns the full sequence.
        dense_size: units of every hidden Dense layer. This argument used to
            be ignored (the width was hard-coded to 300); models built with a
            non-default value now differ from checkpoints saved before the fix.
        dense_layers: number of hidden Dense layers.
        aux_dim: width of the auxiliary per-token input and of the sigmoid
            ``out_aux`` head.
        dropout_rate: rate of the Dropout applied after every LSTM and hidden
            Dense layer.

    Returns:
        A Keras ``Model`` with inputs ``[input_x, input_aux,
        input_sample_indices]`` whose single output is the softmax over the
        sampled ``out_linear`` entries concatenated with the ``out_aux``
        sigmoid outputs.

    Note:
        The width of ``out_linear`` is taken from the notebook-global
        ``vocab``, not from an argument.
    """
    input_x = Input((seqlen,), dtype="int32", name="input_x")
    input_aux = Input((seqlen, aux_dim), dtype="float32", name="input_aux")
    input_sample_indices = Input((sample_size, 2), dtype="int32", name="input_sample_indices")

    emb_layer = Embedding(*emb_matrix.shape, input_length=seqlen,
                          trainable=False, weights=[emb_matrix],
                          name="embedding")
    emb_x = emb_layer(input_x)
    # Append the auxiliary features to every token's embedding vector.
    concat_x = Concatenate(name="concat_x")([emb_x, input_aux])

    yhat = concat_x

    for i, layer_size in enumerate(lstm_sizes):
        # Only the final LSTM collapses the sequence into a single vector.
        ret_sequences = (i < len(lstm_sizes) - 1)
        layerno = i + 1
        yhat = CuDNNLSTM(layer_size, return_sequences=ret_sequences, name=('lstm%d' % layerno))(yhat)
        yhat = BatchNormalization()(yhat)
        yhat = Dropout(dropout_rate)(yhat)

    for layer in range(1, dense_layers + 1):
        yhat = Dense(dense_size, activation="relu", name=("dense%d" % layer))(yhat)
        yhat = BatchNormalization()(yhat)
        yhat = Dropout(dropout_rate)(yhat)

    # These two layers are special: given the model returned by this function,
    # we can make a model for prediction by taking input_x, input_aux as inputs,
    # and constructing the output by putting softmax on top of out_linear
    # and concatenating it with out_aux.
    yhat_aux = Dense(aux_dim, activation="sigmoid", name="out_aux")(yhat)
    # len(vocab)+2 is because the zeroth word is for padding
    # and last word is for "unknown"
    yhat = Dense(len(vocab) + 2, activation="linear", name="out_linear")(yhat)

    # Softmax is taken only over the entries selected by input_sample_indices.
    out_train = Lambda(sampling_layer_gather_nd, name="sampling")([yhat, input_sample_indices])
    out_train = Activation('softmax')(out_train)
    out_train = Concatenate(name="concat_out_train")([out_train, yhat_aux])

    model_train = Model([input_x, input_aux, input_sample_indices], [out_train])

    return model_train


def make_predict_model(model_train):
    """Derive a full-vocabulary prediction model from the model built by make_model().

    The prediction model takes only the token and auxiliary inputs. Its output
    is a softmax over the complete ``out_linear`` layer concatenated with the
    ``out_aux`` layer's output. It is wired from the training model's own
    layer outputs.
    """
    # The third training input (sample indices) is not needed for prediction.
    token_input, aux_input, _ = model_train.inputs

    linear_out = model_train.get_layer(name="out_linear").output
    aux_out = model_train.get_layer(name="out_aux").output

    full_softmax = Activation('softmax')(linear_out)
    merged = Concatenate(name="concat_out_predict")([full_softmax, aux_out])

    return Model([token_input, aux_input], [merged])


In [80]:
# Width of every LSTM layer; also embedded in the checkpoint file name, so the
# model is built from this value rather than from a separate literal.
lstm_size = 256

mtrain = make_model(seqlen=seqlen, sample_size=sample_size,
                    emb_matrix=emb_matrix,
                    lstm_sizes=[lstm_size, lstm_size],
                    dense_size=256, dense_layers=5,
                    dropout_rate=0.01)

# Prediction model wired from mtrain's out_linear / out_aux layer outputs.
mpredict = make_predict_model(mtrain)

In [81]:
# Adam starting at lr=0.01; the LearningRateScheduler callback used during training
# sets the learning rate again at every epoch.
opt = K.optimizers.Adam(lr=0.01)
# NOTE(review): the model output concatenates the sampled softmax with the sigmoid
# aux outputs, and this single loss is applied to the whole vector — confirm that the
# targets produced by the generator are laid out accordingly.
mtrain.compile(opt, loss='categorical_crossentropy', metrics=["accuracy"])

In [102]:
imp.reload(util)
# Training batch size; also embedded in the checkpoint file name and used to
# compute the number of steps per epoch.
batch_size=128
# NOTE(review): the model's out_linear layer has len(vocab)+2 units while the sampler
# is given vocab_size=len(vocab)+1 — confirm how the sequence class interprets it.
train_seq = util.NegativeSamplingPermutedSequence(data_x=X, data_xu=Xu,
                                                 seqlen=seqlen, batch_size=batch_size,
                                                 sample_size=sample_size,
                                                 vocab_size=len(vocab)+1)

In [103]:
# The file name encodes the main hyperparameters so different configurations
# write to different checkpoint files.
checkpoint_filepath = "checkpoints/weights.lstm%d.batch%d.glove%d.sample%d.vocab%d.%s.hdf5" % (
    lstm_size, batch_size, glove_dims, sample_size, vocab_size, model_tag)

# Create the target directory up front so the first checkpoint save does not
# fail on a missing "checkpoints/" folder.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

print(checkpoint_filepath)
# save_best_only + monitor='loss': overwrite the file only when the training loss improves.
checkpoint = K.callbacks.ModelCheckpoint(checkpoint_filepath,
                                         verbose=2,
                                         save_best_only=True,
                                         monitor='loss')

checkpoints/weights.lstm256.batch128.glove300.sample5.vocab10000.overfit.hdf5


In [104]:
import math

def decay(epoch):
    """Step-wise exponential learning-rate schedule.

    Starts at 0.01 and is multiplied by 0.97 once for every 10 completed
    epochs, e.g. epochs 0-9 -> 0.01, epochs 10-19 -> 0.0097.
    """
    initial_rate, step_epochs, shrink = 0.01, 10, 0.97
    completed_steps = math.floor(epoch / step_epochs)
    return initial_rate * math.pow(shrink, completed_steps)

# Preview the schedule at every 10th epoch.
for i in range(0,100,10):
    print("epoch {:d}: {:.6f}".format(i,decay(i)))

    
# Keras callback that applies decay(epoch) as the learning rate for each epoch.
decay_scheduler = LearningRateScheduler(decay, verbose=1)

epoch 0: 0.010000
epoch 10: 0.009700
epoch 20: 0.009409
epoch 30: 0.009127
epoch 40: 0.008853
epoch 50: 0.008587
epoch 60: 0.008330
epoch 70: 0.008080
epoch 80: 0.007837
epoch 90: 0.007602


In [107]:

# Steps per epoch: four times the number of (batch_size x seqlen) token blocks that fit in X.
numbatches = 4* X.shape[0] // (batch_size*seqlen)

# epochs=5000000 effectively means "train until interrupted" (the recorded run ends in
# KeyboardInterrupt).
# NOTE(review): initial_epoch=150 looks like a manual resume of an earlier run in the same
# kernel; on a fresh kernel this starts the schedule at epoch 150 — confirm or reset to 0.
mtrain.fit_generator(train_seq, steps_per_epoch=numbatches, epochs=5000000,
                    callbacks=[checkpoint, decay_scheduler],
                    initial_epoch=150, verbose=1)



Epoch 00151: LearningRateScheduler reducing learning rate to 0.006332511891367892.
Epoch 151/5000000
 138/8631 [..............................] - ETA: 21:17 - loss: 0.9593 - acc: 0.8856

KeyboardInterrupt: 

### Test

In [94]:
# Load the held-out corpus with the same vocabulary, padding and lowercasing as the training data.
X_test, Xu_test = util.load_data(path_test, vocab, pad=seqlen, lowercase=True)
#X_test, Xu_test = util.load_data(path_misc, vocab, pad=seqlen, lowercase=True)

100%|##########| 472/472 [00:00<00:00, 564.11it/s]


In [95]:
# Share of test tokens flagged as unknown (column 0 of Xu_test).
# A bare `X_test` expression used to precede this line; it was not the last
# statement of the cell, so it displayed nothing and has been dropped.
print("Unknown words: {:.2f}%".format(100*np.sum(Xu_test[:,0]) / len(Xu_test)))


Unknown words: 7.75%


In [96]:
# Inspect the training model's input tensors (names, shapes, dtypes).
mtrain.inputs

[<tf.Tensor 'input_x_7:0' shape=(?, 64) dtype=int32>,
 <tf.Tensor 'input_aux_7:0' shape=(?, 64, 2) dtype=float32>,
 <tf.Tensor 'input_sample_indices_7:0' shape=(?, 5, 2) dtype=int32>]

In [108]:
# Compile the prediction model; it reuses the optimizer instance created for mtrain.
# In this notebook mpredict is only used through predict().
mpredict.compile(opt, loss="categorical_crossentropy")

In [98]:
# Number of entries in the id -> word list (used as UNK_IDX in the generation cell).
len(words)

10001

In [99]:
def groom_probs(scores, alpha):
    """Re-weight a score vector by raising every entry to `alpha` and renormalising.

    The result sums to 1. For inputs in [0, 1], alpha > 1 concentrates mass on
    the largest entries and alpha < 1 spreads it out.
    """
    powered = np.power(scores, alpha)
    return powered / np.sum(powered)


# Quick check: cubing sharpens [0.6, 0.3, 0.1] and the result still sums to 1.
# Note that `z` is overwritten later by the prediction output in the generation cell.
z = groom_probs(np.array([0.6, 0.3, 0.1]),
                  3.0)

print(z)
print(np.sum(z))

[0.8852459  0.11065574 0.00409836]
0.9999999999999999


In [109]:
def capitalize(s):
    """Upper-case the first character of `s` and leave the rest untouched.

    Unlike ``str.capitalize`` this does not lower-case the remaining
    characters. An empty input is returned as is.
    """
    return s if len(s) == 0 else s[0].upper() + s[1:]



# Seed the generator with a random seqlen-token window from the test set.
i = np.random.randint(len(X_test) - seqlen)
gen = X_test[i:i+seqlen].tolist()
genu = Xu_test[i:i+seqlen].tolist()


# Single-example input buffers, refilled on every generation step.
tX = np.zeros((1, seqlen), dtype="int32")
tXu = np.zeros((1, seqlen, 2), dtype="float32")
results = []

# Echo the seed text: column 0 of the aux features flags unknown words,
# column 1 flags capitalisation.
for i, idx in enumerate(gen):
    word = "<UNK>"
    if genu[i][0] < 0.1:
        word = words[idx]
    if genu[i][1] > 0.9:
        word = capitalize(word)
    sys.stdout.write(word + " ")
    sys.stdout.flush()

print("=" * 100)

# Class id that is rejected when sampling (the last output unit; per the comment
# in make_model the last word stands for "unknown").
UNK_IDX = len(words)


# Tokens in `punct` get no space before them; tokens in `punct2` get no space after them.
punct = ":-;.,!?'\")"
punct2 = "-/'(\""

iterations = 512

prev_word=words[gen[-1]]
word=""

# Characters written on the current output line (lines are wrapped at 80).
chars=0
for j in range(iterations):
    # Feed the most recent seqlen tokens and their aux features.
    tX[0] = np.array(gen[-seqlen:], "int32")
    tXu[0] = np.array(genu[-seqlen:], "float32")

    z = mpredict.predict([tX, tXu])
    # The last two outputs are the aux (sigmoid) head; everything before is the
    # word distribution.
    scores=z[0][:-2]
    aux=z[0][-2:]

    # Rejection sampling: redraw until something other than UNK comes out.
    # NOTE(review): index 0 (padding) is not excluded and shows up as "<#PAD#>"
    # in the recorded output — confirm whether that is intended.
    idx = UNK_IDX
    while idx == UNK_IDX:
        idx = np.random.choice(range(len(vocab)+2), p=scores)

    # The loop above only exits with idx != UNK_IDX, so the former
    # `else` branch that appended "<UNK>" could never run and has been removed.
    gen.append(idx)
    genu.append([0.0, aux[1]])
    word=words[idx]
    if aux[1] > 0.5:
        word = capitalize(word)
    results.append(word)

    # Wrap the line when the next word (plus a space) would not fit in 80 columns.
    if 80-chars < len(word)+1:
        sys.stdout.write("\n")
        chars=0
    if punct.find(word) < 0 and punct2.find(prev_word) < 0:
        sys.stdout.write(" ")
        chars+=1
    sys.stdout.write(word)
    chars+=len(word)
    sys.stdout.flush()

    prev_word=word

 profit nationally of history archive, dean and the Same words lecture. The
 spaces-of field of and still. This is connected down in the past decades, the
 line. The boundaries with a modern encounters, ultimately. At exhibitions in
 russia, a part opportunity which straight curators of a further--dimensional
 upon recently"on am does? at the museum, working s, the royal, the central
 point in terms, without both dimensions"-included he s existing only or-a key
 the life must be stop-, what what a artistic. A project, the potential. By the
 job are erased between her substances narrative a whole reference whole
 traditional, and fantasies. Using exhibition are of modernity on understanding
 in the mid-at the Three-january-2016 <#PAD#>, seoul and its the band the
 international, as facility to be an 130'not a space of artists? applied-20th
 space and a es few works, the first of his generation s museums prize since
 year is their than s be gradually in"in a cold our. This are a strong. 

KeyboardInterrupt: 

In [39]:
# Evaluation batches drawn from the held-out data, with a larger batch size than training.
# A bare `len(X_test)` used to precede this statement; it was not the last
# expression of the cell, so it displayed nothing and has been dropped.
test_seq = util.NegativeSamplingPermutedSequence(data_x=X_test, data_xu=Xu_test,
                                                 batch_size=1024, seqlen=seqlen,
                                                 vocab_size=len(vocab)+1,
                                                 sample_size=sample_size)

In [40]:
# Loss and accuracy (the metric set in compile) over 100 batches of held-out data.
mtrain.evaluate_generator(test_seq, steps=100, verbose=1)



[0.9795894759893418, 0.853818359375]

In [41]:
# Same evaluation on 100 training batches, for comparison with the held-out numbers.
mtrain.evaluate_generator(train_seq, steps=100, verbose=1)



[0.9828013104200363, 0.85705078125]

In [92]:
from sklearn import preprocessing

In [93]:
# Normalise every embedding row (axis=1) so dot products between rows act as cosine similarities.
norm_emb_matrix =  preprocessing.normalize(emb_matrix, axis=1)

In [94]:
# NOTE(review): if `z` is the (1, N) prediction left over from the generation cell, z[0,:]
# is 1-D, whereas normalize(axis=1) and the 2-D indexing of `zn` in the next cell need a
# 2-D array. The execution counts are out of order, so this cell was probably run against
# a different `z` — confirm before re-running.
zn = preprocessing.normalize(z[0,:], axis=1)


In [95]:
# First 300 columns of the last row of `zn`; 300 matches glove_dims, presumably a
# predicted word embedding — TODO confirm.
we = zn[-1, :300]

In [96]:
# Dot product of every normalised embedding row with `we`.
# This overwrites the `scores` variable set in the generation cell.
scores = np.matmul(norm_emb_matrix, we)

In [97]:
# Row with the highest score and the vocabulary word stored at that index.
idx = np.argmax(scores)
print(idx)
word = words[idx]

23


In [98]:
# Display the word selected in the previous cell.
word

'at'

In [84]:
# Spot-check the id -> word list at index 2.
words[2]

'.'