In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [2]:
print("Num GPUs Available: ", 
      len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
drive_nb_dir = '/content/drive/My Drive/Colab_Notebooks'

# 1. Generative Models for Text

### 1.(c)-i Concatenate text files to create corpus.

In [5]:
def createCorpus(data_dir):
  print(data_dir)
  corpus = []
  for root, _, files in os.walk(data_dir):
    for f in files:
      with open(root + f, encoding='ascii', errors='ignore') as book:
        cur_corpus = book.read().lower()
        corpus.append(cur_corpus)
      print('Read text {}, string length {}'.format(f, len(corpus[-1])))
  corpus = sorted(corpus, key=lambda x : len(x))
  return corpus

In [6]:
concatCorpus = createCorpus(drive_nb_dir + '/data/books/')
print([len(book) for book in concatCorpus])

/content/drive/My Drive/Colab_Notebooks/data/books/
Read text TAM.txt, string length 514652
Read text OKEWFSMP.txt, string length 405741
Read text TAMatter.txt, string length 766542
Read text THWP.txt, string length 2005566
Read text TPP.txt, string length 244306
Read text AIIMAT.txt, string length 746219
Read text MLOE.txt, string length 412226
[244306, 405741, 412226, 514652, 746219, 766542, 2005566]


### 1.(c)-ii Use char-level representation

In [7]:
def charRepresent(corpus, num_books):
  chars = set([])
  for book in corpus[:num_books]:
    cur = list(set(book))
    chars.update(cur)
  chars = sorted(list(chars))
  char2int = dict((c, i) for i, c in enumerate(chars))
  int2char = dict((i, c) for i, c in enumerate(chars))
  return char2int, int2char

In [8]:
char2int, int2char = charRepresent(concatCorpus, 5)
print(char2int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '<': 29, '=': 30, '>': 31, '?': 32, '[': 33, '\\': 34, ']': 35, '^': 36, '_': 37, 'a': 38, 'b': 39, 'c': 40, 'd': 41, 'e': 42, 'f': 43, 'g': 44, 'h': 45, 'i': 46, 'j': 47, 'k': 48, 'l': 49, 'm': 50, 'n': 51, 'o': 52, 'p': 53, 'q': 54, 'r': 55, 's': 56, 't': 57, 'u': 58, 'v': 59, 'w': 60, 'x': 61, 'y': 62, 'z': 63, '{': 64, '|': 65, '}': 66, '~': 67}


### 1.(c)-iv Window the corpus

In [9]:
def windowCorpus(corpus, win_size, char2int):
    input = []
    output = []
    for w in range(0, len(corpus)-win_size+1, 1):
        seqIn = corpus[w : w + win_size - 1]
        seqOut = corpus[w + win_size - 1]
        #print(seqIn)
        #print(seqOut)
        input.append([char2int[c] for c in seqIn])
        output.append(char2int[seqOut])
    #print(len(output))
    return input, output

def dataGenerate(corpus, num_books):
  win_size = 100
  inSeq, outChar = [], []
  for book in corpus[:num_books]:
    cur_in, cur_out = windowCorpus(book, win_size, char2int)
    inSeq.extend(cur_in)
    outChar.extend(cur_out)
  return inSeq, outChar

In [10]:
inSeq, outChar = dataGenerate(concatCorpus, 5)
print(len(inSeq))

2322649


### 1.(v) One-hot code the output and normalize input

In [11]:
# reshape
lstm_input = np.reshape(inSeq, (len(inSeq), 100 - 1, 1))
print(lstm_input.shape)
# normalize
lstm_input = lstm_input / float(len(char2int))
#print(lstm_input[0])

(2322649, 99, 1)


In [12]:
lstm_output = np_utils.to_categorical(outChar)
print(lstm_output.shape)

(2322649, 68)


### 1.(c)-vi~x Build single layer LSTM

In [13]:
# build LSTM model
LSTMmodel = Sequential()
#len(char2int)
LSTMmodel.add(LSTM(256,
                   input_shape=(lstm_input.shape[1], lstm_input.shape[2])))
LSTMmodel.add(Dropout(0.2))
LSTMmodel.add(Dense(lstm_output.shape[1], activation='softmax'))
print(LSTMmodel.summary())

LSTMmodel.compile(loss='categorical_crossentropy', optimizer='adam')

filepath = drive_nb_dir + "./LSTMweights/weights-improvement-{epoch:02d}-{loss:.2f}-bigger.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               264192    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 68)                17476     
Total params: 281,668
Trainable params: 281,668
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# fit the model
LSTMmodel.fit(lstm_input, lstm_output, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.62615, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-01-2.63-bigger.hdf5
Epoch 2/20
Epoch 00002: loss improved from 2.62615 to 2.26316, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-02-2.26-bigger.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.26316 to 2.10294, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-03-2.10-bigger.hdf5
Epoch 4/20
Epoch 00004: loss did not improve from 2.10294
Epoch 5/20
Epoch 00005: loss did not improve from 2.10294
Epoch 6/20
Epoch 00006: loss did not improve from 2.10294
Epoch 7/20
Epoch 00007: loss improved from 2.10294 to 2.06570, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-07-2.07-bigger.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.06570 to 1.95381, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-im

<tensorflow.python.keras.callbacks.History at 0x7f5106a3aa90>

### 1.(c)-vi Use the network with the best weights to generate 1000 characters

In [15]:
init = 'There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object.'

write = [char2int[c] for c in init[-99:].lower()]

for i in range(1000):
  # convert to numpy array and normalize it
  seq = np.reshape(write, (1, len(write), 1))
  seq = seq / float(len(char2int))
  # predict the next character
  predictChar = LSTMmodel.predict(seq, verbose=0)
  predictIdx = np.argmax(predictChar)
  init += int2char[predictIdx]
  # make new input sequence
  write.append(predictIdx)
  write = write[1:len(write)]

print(init)

There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object. the soace of the soace of the soace ase not the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace of the soace

In [16]:
# fit the model for 10 epoch more
LSTMmodel.fit(lstm_input, lstm_output, epochs=10, batch_size=128, callbacks=callbacks_list)

Epoch 1/10
Epoch 00001: loss did not improve from 1.66869
Epoch 2/10
Epoch 00002: loss did not improve from 1.66869
Epoch 3/10
Epoch 00003: loss improved from 1.66869 to 1.66108, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-03-1.66-bigger.hdf5
Epoch 4/10
Epoch 00004: loss did not improve from 1.66108
Epoch 5/10
Epoch 00005: loss did not improve from 1.66108
Epoch 6/10
Epoch 00006: loss improved from 1.66108 to 1.65162, saving model to /content/drive/My Drive/Colab_Notebooks./LSTMweights/weights-improvement-06-1.65-bigger.hdf5
Epoch 7/10

In [17]:
init = 'There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object.'

write = [char2int[c] for c in init[-99:].lower()]

for i in range(1000):
  # convert to numpy array and normalize it
  seq = np.reshape(write, (1, len(write), 1))
  seq = seq / float(len(char2int))
  # predict the next character
  predictChar = LSTMmodel.predict(seq, verbose=0)
  predictIdx = np.argmax(predictChar)
  init += int2char[predictIdx]
  # make new input sequence
  write.append(predictIdx)
  write = write[1:len(write)]

print(init)

There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object. the sraae of the srace of the srace of the srace of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of the sraae of the srace of t