In [3]:
import os
import json
import argparse

import numpy as np

from model import build_model, save_weights
import numpy as np

from model import build_model, load_weights

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, TimeDistributed, Dense, Activation, Embedding


DATA_DIR = './data'
LOG_DIR = './logs'

BATCH_SIZE = 16
SEQ_LENGTH = 64

In [4]:
class TrainLogger(object):
    def __init__(self, file):
        self.file = os.path.join(LOG_DIR, file)
        self.epochs = 0
        with open(self.file, 'w') as f:
            f.write('epoch,loss,acc\n')

    def add_entry(self, loss, acc):
        self.epochs += 1
        s = '{},{},{}\n'.format(self.epochs, loss, acc)
        with open(self.file, 'a') as f:
            f.write(s)


In [5]:

def read_batches(T, vocab_size):
    length = T.shape[0]; #129,665
    batch_chars = int(length / BATCH_SIZE); # 8,104

    for start in range(0, batch_chars - SEQ_LENGTH, SEQ_LENGTH): # (0, 8040, 64)
        X = np.zeros((BATCH_SIZE, SEQ_LENGTH)) # 16X64
        Y = np.zeros((BATCH_SIZE, SEQ_LENGTH, vocab_size)) # 16X64X86
        for batch_idx in range(0, BATCH_SIZE): # (0,16)
            for i in range(0, SEQ_LENGTH): #(0,64)
                X[batch_idx, i] = T[batch_chars * batch_idx + start + i] # 
                Y[batch_idx, i, T[batch_chars * batch_idx + start + i + 1]] = 1
        yield X, Y

In [6]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train the model on some text.')
    parser.add_argument('--input', default='input.txt', help='name of the text file to train from')
    parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train for')
    parser.add_argument('--freq', type=int, default=10, help='checkpoint save frequency')
    args, unknown = parser.parse_known_args()

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

epochs = args.epochs
save_freq = args.freq
text = open(os.path.join(DATA_DIR, args.input)).read()

print("processing")
# character to index and vice-versa mappings
char_to_idx = { ch: i for (i, ch) in enumerate(sorted(list(set(text)))) }
print("Number of unique characters: " + str(len(char_to_idx))) #86

idx_to_char = { i: ch for (ch, i) in char_to_idx.items() }
vocab_size = len(char_to_idx)
print("processing done")

print("creating model")
#model_architecture    
model = Sequential()
model.add(Embedding(vocab_size, 512, batch_input_shape=(BATCH_SIZE, SEQ_LENGTH)))
for i in range(3):
    model.add(LSTM(256, return_sequences=True, stateful=True))
    model.add(Dropout(0.2))

model.add(TimeDistributed(Dense(vocab_size))) 
model.add(Activation('softmax'))
print("model created")
    
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


#Train data generation
print("training data")
T = np.asarray([char_to_idx[c] for c in text], dtype=np.int32) #convert complete text into numerical indices

print("Length of text:" + str(T.size)) #129,665

steps_per_epoch = (len(text) / BATCH_SIZE - 1) / SEQ_LENGTH  

log = TrainLogger('training_log.csv')

for epoch in range(epochs):
    print('\nEpoch {}/{}'.format(epoch + 1, epochs))
        
    losses, accs = [], []

    for i, (X, Y) in enumerate(read_batches(T, vocab_size)):
            
        print(X);

        loss, acc = model.train_on_batch(X, Y)
        print('Batch {}: loss = {}, acc = {}'.format(i + 1, loss, acc))
        losses.append(loss)
        accs.append(acc)

    log.add_entry(np.average(losses), np.average(accs))
    
    if (epoch + 1) % save_freq == 0:
            save_weights(epoch + 1, model)
            print('Saved checkpoint to', 'weights.{}.h5'.format(epoch + 1))

print("training done...........")

processing
Number of unique characters: 86
processing done
creating model
model created
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (16, 64, 512)             44032     
_________________________________________________________________
lstm (LSTM)                  (16, 64, 256)             787456    
_________________________________________________________________
dropout (Dropout)            (16, 64, 256)             0         
_________________________________________________________________
lstm_1 (LSTM)                (16, 64, 256)             525312    
_________________________________________________________________
dropout_1 (Dropout)          (16, 64, 256)             0         
_________________________________________________________________
lstm_2 (LSTM)                (16, 64, 256)             525312    
________________________________

Batch 27: loss = 3.14103102684021, acc = 0.193359375
[[29.  1.  3. ... 28. 84.  3.]
 [63. 84.  3. ... 18.  1.  3.]
 [ 3. 34.  3. ... 31.  3. 63.]
 ...
 [28. 33. 31. ...  3. 34. 18.]
 [61. 60. 84. ... 60. 84.  3.]
 [84.  3. 30. ... 34. 70.  3.]]
Batch 28: loss = 3.301115036010742, acc = 0.1728515625
[[28.  3. 62. ...  3. 28.  3.]
 [34.  3. 61. ...  3. 31.  3.]
 [61. 61.  1. ... 25. 40. 66.]
 ...
 [ 1.  3. 29. ...  3. 29. 60.]
 [31.  3. 61. ...  1.  3. 28.]
 [64. 17. 34. ... 61. 60. 29.]]
Batch 29: loss = 3.397311210632324, acc = 0.1494140625
[[58. 62. 60. ... 25. 33. 72.]
 [63. 17. 64. ...  1. 46. 72.]
 [68. 62.  1. ... 28. 84.  3.]
 ...
 [28.  1.  3. ...  0.  3. 32.]
 [22.  3. 29. ... 17. 33. 84.]
 [ 1. 28. 34. ... 65. 58. 70.]]
Batch 30: loss = 3.347227096557617, acc = 0.1435546875
[[75. 70. 58. ... 17. 29. 14.]
 [ 1. 29. 69. ... 33.  0. 40.]
 [34.  3. 29. ...  3. 34.  3.]
 ...
 [70.  3. 29. ... 34. 84.  3.]
 [ 3. 34.  3. ... 28.  3. 32.]
 [ 1. 40. 78. ... 84.  3. 28.]]
Batch 31: loss

Batch 61: loss = 3.1540369987487793, acc = 0.1650390625
[[ 3. 34.  3. ... 31. 22.  3.]
 [14. 23.  0. ... 26. 29. 28.]
 [ 3. 31.  3. ...  1. 61. 62.]
 ...
 [84.  0.  3. ... 23. 22.  0.]
 [ 7. 58.  1. ... 51. 25.  1.]
 [29. 34.  1. ... 28.  1. 62.]]
Batch 62: loss = 3.0938234329223633, acc = 0.181640625
[[28. 29. 28. ... 64.  0.  5.]
 [84.  0.  3. ... 25.  0. 58.]
 [63. 84.  3. ... 61.  0.  5.]
 ...
 [47. 25. 47. ... 65. 58. 70.]
 [18. 15. 24. ... 45. 72. 80.]
 [28. 62. 84. ... 84. 62. 17.]]
Batch 63: loss = 3.2291955947875977, acc = 0.158203125
[[ 1. 41. 72. ...  3. 61. 17.]
 [84.  3. 31. ...  1. 59. 58.]
 [ 1. 41. 72. ... 31. 33. 28.]
 ...
 [ 1. 40. 78. ... 34.  3. 64.]
 [62.  0. 40. ... 61. 60. 84.]
 [61.  1. 64. ...  3. 28. 70.]]
Batch 64: loss = 3.213397979736328, acc = 0.146484375
[[59.  1. 59. ... 64.  1. 64.]
 [64. 84.  3. ... 66. 64.  0.]
 [84.  3. 31. ...  3. 31.  3.]
 ...
 [17. 34.  1. ... 30.  3. 32.]
 [ 0.  3. 34. ...  3. 30.  3.]
 [ 3. 62. 17. ...  0.  0. 51.]]
Batch 65: lo

Batch 95: loss = 2.8708009719848633, acc = 0.2158203125
[[34.  1.  3. ... 17. 61.  1.]
 [63. 84.  3. ... 28. 17. 63.]
 [72. 80. 75. ...  1. 45. 72.]
 ...
 [58. 17. 28. ... 18.  0. 47.]
 [ 3. 31.  3. ... 25.  1. 18.]
 [28. 61. 63. ... 71. 68. 76.]]
Batch 96: loss = 2.889787197113037, acc = 0.205078125
[[ 3. 28. 70. ... 62. 63. 84.]
 [84.  3. 28. ... 56. 61. 62.]
 [80. 62.  0. ... 84.  3. 34.]
 ...
 [25. 46. 70. ... 14. 23.  0.]
 [16. 21.  0. ...  1. 73.  1.]
 [ 1. 72. 63. ... 14. 35. 37.]]
Batch 97: loss = 3.033266067504883, acc = 0.1708984375
[[64. 17. 64. ...  3. 31. 22.]
 [ 1. 58. 17. ... 17. 63.  1.]
 [70.  3. 61. ... 61. 17. 60.]
 ...
 [38. 25. 31. ...  1.  3. 28.]
 [16. 18. 19. ...  1. 32. 17.]
 [33.  0. 40. ...  3. 29. 61.]]
Batch 98: loss = 3.0093603134155273, acc = 0.1845703125
[[ 3. 28. 33. ... 32. 84.  3.]
 [63. 62. 63. ... 61. 18.  1.]
 [84.  3. 34. ...  1. 63. 62.]
 ...
 [22.  3. 34. ... 29.  0. 84.]
 [60. 84.  3. ...  3. 29. 18.]
 [61.  1.  3. ...  3. 33. 32.]]
Batch 99: l

In [10]:
MODEL_DIR = './model'
model2 = Sequential()
model2.add(Embedding(vocab_size, 512, batch_input_shape=(1,1)))
for i in range(3):
    model2.add(LSTM(256, return_sequences=True, stateful=True))
    model2.add(Dropout(0.2))

model2.add(TimeDistributed(Dense(vocab_size))) 
model2.add(Activation('softmax'))

model2.load_weights(os.path.join(MODEL_DIR, 'weights.100.h5'.format(epoch)))
model2.summary()



sampled = []
for i in range(1024):
    batch = np.zeros((1, 1))
    if sampled:
        batch[0, 0] = sampled[-1]
    else:
        batch[0, 0] = np.random.randint(vocab_size)
    result = model2.predict_on_batch(batch).ravel()
    sample = np.random.choice(range(vocab_size), p=result)
    sampled.append(sample)

print("sampled")
print(sampled)
print(''.join(idx_to_char[c] for c in sampled))


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, 1, 512)               44032     
_________________________________________________________________
lstm_12 (LSTM)               (1, 1, 256)               787456    
_________________________________________________________________
dropout_12 (Dropout)         (1, 1, 256)               0         
_________________________________________________________________
lstm_13 (LSTM)               (1, 1, 256)               525312    
_________________________________________________________________
dropout_13 (Dropout)         (1, 1, 256)               0         
_________________________________________________________________
lstm_14 (LSTM)               (1, 1, 256)               525312    
_________________________________________________________________
dropout_14 (Dropout)         (1, 1, 256)              