In [8]:
#importing all reqired Libraries

import os
import json
import argparse
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Activation, Embedding,TimeDistributed

In [11]:
DATA_DIR = './data'
LOG_DIR = './logs'
MODEL_DIR = './model'

BATCH_SIZE = 16
SEQ_LENGTH = 64

In [13]:
if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

In [14]:
class TrainLogger(object):
    def __init__(self, file):
        self.file = os.path.join(LOG_DIR, file)
        self.epochs = 0
        with open(self.file, 'w') as f:
            f.write('epoch,loss,acc\n')

    def add_entry(self, loss, acc):
        self.epochs += 1
        s = '{},{},{}\n'.format(self.epochs, loss, acc)
        with open(self.file, 'a') as f:
            f.write(s)


In [15]:
def read_batches(T, vocab_size):
    length = T.shape[0]; 
    batch_chars = int(length / BATCH_SIZE); 

    for start in range(0, batch_chars - SEQ_LENGTH, SEQ_LENGTH): 
        X = np.zeros((BATCH_SIZE, SEQ_LENGTH)) 
        Y = np.zeros((BATCH_SIZE, SEQ_LENGTH, vocab_size)) 
        for batch_idx in range(0, BATCH_SIZE): 
            for i in range(0, SEQ_LENGTH): 
                X[batch_idx, i] = T[batch_chars * batch_idx + start + i] 
                Y[batch_idx, i, T[batch_chars * batch_idx + start + i + 1]] = 1
        yield X, Y

In [16]:
if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

In [17]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train the model on some text.')
    parser.add_argument('--input', default='input.txt', help='name of the text file to train from')
    parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train for')
    parser.add_argument('--freq', type=int, default=10, help='checkpoint save of  frequency')
    args, unknown = parser.parse_known_args()

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

epochs = args.epochs
save_freq = args.freq
text = open(os.path.join(DATA_DIR, args.input)).read()

print("processing")
# character to index and vice-versa mappings

char_to_idx = { ch: i for (i, ch) in enumerate(sorted(list(set(text)))) }
print("Number of unique characters: " + str(len(char_to_idx))) 

idx_to_char = { i: ch for (ch, i) in char_to_idx.items() }
vocab_size = len(char_to_idx)
print("processing done")

print("creating model")


#model_architecture 


model = Sequential()
model.add(Embedding(vocab_size, 512, batch_input_shape=(BATCH_SIZE, SEQ_LENGTH)))
for i in range(3):
    model.add(LSTM(256, return_sequences=True, stateful=True))
    model.add(Dropout(0.2))

model.add(TimeDistributed(Dense(vocab_size))) 
model.add(Activation('softmax'))
print("model created")
    
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


#Train data generation

print("training data")
T = np.asarray([char_to_idx[c] for c in text], dtype=np.int32) #convert complete text into numerical indices

print("Length of text:" + str(T.size)) #129,665

steps_per_epoch = (len(text) / BATCH_SIZE - 1) / SEQ_LENGTH  

log = TrainLogger('training_log.csv')

for epoch in range(epochs):
    print('\nEpoch {}/{}'.format(epoch + 1, epochs))
        
    losses, accs = [], []

    for i, (X, Y) in enumerate(read_batches(T, vocab_size)):
            
        print(X);

        loss, acc = model.train_on_batch(X, Y)
        print('Batch {}: loss = {}, acc = {}'.format(i + 1, loss, acc))
        losses.append(loss)
        accs.append(acc)

    log.add_entry(np.average(losses), np.average(accs))
    
    if (epoch + 1) % save_freq == 0:
            model.save_weights(str('model/weights_{}.h5'.format(epoch + 1)),model,save_format='h5')
            print('Saved checkpoint to', 'weights.{}.h5'.format(epoch + 1))

print("training done...........")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [62. 63. 64. ... 63. 17. 62.]
 ...
 [35.  1.  3. ... 35.  3. 35.]
 [25. 85.  0. ... 47. 25. 39.]
 [ 1.  3. 35. ... 35.  3. 32.]]
Batch 7: loss = 0.25009945034980774, acc = 0.919921875
[[ 3. 29. 30. ... 18. 25. 25.]
 [14. 17. 65. ... 14. 17. 85.]
 [85.  3. 31. ... 61.  1. 32.]
 ...
 [18. 85. 85. ... 71.  1. 41.]
 [31. 31.  1. ...  3. 35. 22.]
 [17. 32.  1. ... 25. 51. 59.]]
Batch 8: loss = 0.2527606785297394, acc = 0.916015625
[[ 0.  3. 32. ... 63. 26. 64.]
 [ 3. 32.  3. ... 17. 30.  1.]
 [59. 78. 59. ... 14. 64. 10.]
 ...
 [79. 77. 67. ... 63. 85.  3.]
 [ 3. 62. 61. ... 30. 35. 30.]
 [70. 69. 67. ... 20. 11.  1.]]
Batch 9: loss = 0.23276610672473907, acc = 0.9296875
[[85.  3. 33. ... 71.  3. 33.]
 [ 3. 29. 22. ... 66. 73. 76.]
 [ 3. 29. 62. ... 32.  3. 29.]
 ...
 [31.  3. 62. ... 55.  0. 85.]
 [ 1. 62. 30. ...  3. 29. 61.]
 [80. 67. 59. ...  3. 33. 17.]]
Batch 10: loss = 0.2589469850063324, acc = 0.9130859375
[[17. 33.  

In [18]:
MODEL_DIR = './model'
model2 = Sequential()
model2.add(Embedding(vocab_size, 512, batch_input_shape=(1,1)))
for i in range(3):
    model2.add(LSTM(256, return_sequences=True, stateful=True))
    model2.add(Dropout(0.2))

model2.add(TimeDistributed(Dense(vocab_size))) 
model2.add(Activation('softmax'))

model2.load_weights(os.path.join(MODEL_DIR, 'weights_100.h5'.format(epoch)))
model2.summary()



sampled = []
for i in range(1024):
    batch = np.zeros((1, 1))
    if sampled:
        batch[0, 0] = sampled[-1]
    else:
        batch[0, 0] = np.random.randint(vocab_size)
    result = model2.predict_on_batch(batch).ravel()
    sample = np.random.choice(range(vocab_size), p=result)
    sampled.append(sample)

print("sampled")
print(sampled)
print(''.join(idx_to_char[c] for c in sampled))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, 1, 512)               44544     
_________________________________________________________________
lstm_3 (LSTM)                (1, 1, 256)               787456    
_________________________________________________________________
dropout_3 (Dropout)          (1, 1, 256)               0         
_________________________________________________________________
lstm_4 (LSTM)                (1, 1, 256)               525312    
_________________________________________________________________
dropout_4 (Dropout)          (1, 1, 256)               0         
_________________________________________________________________
lstm_5 (LSTM)                (1, 1, 256)               525312    
_________________________________________________________________
dropout_5 (Dropout)          (1, 1, 256)              