In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras as K
import random
import sqlite3
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Dense, concatenate, Embedding
from tensorflow.keras.layers import Flatten, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from keras.utils import np_utils

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback

import warnings
warnings.filterwarnings('ignore')
import os
print(os.listdir("datasets"))

['SW_EpisodeIV.txt', 'SW_EpisodeV.txt', 'SW_EpisodeVI.txt', 'wordcloud_masks']


Using TensorFlow backend.


In [2]:
All_SW_Scripts = ""

def TextToString(txt):
    with open (txt, "r") as file:
        data=file.readlines()
        script = ""
        for x in data[1:-1]:
            x = x.lower().replace('"','').replace("\n"," \n ").split(' ')
            x[1] += ":"
            script += " ".join(x[1:-1]).replace("\n"," \n ")
        return script
    
All_SW_Scripts += TextToString("datasets/SW_EpisodeIV.txt")
All_SW_Scripts += TextToString("datasets/SW_EpisodeV.txt")
All_SW_Scripts += TextToString("datasets/SW_EpisodeVI.txt")

In [3]:
print(All_SW_Scripts[:1000])

threepio: did you hear that?  they've shut down the main reactor.  we'll be destroyed for sure.  this is madness!  
 threepio: we're doomed!  
 threepio: there'll be no escape for the princess this time.  
 threepio: what's that?  
 threepio: i should have known better than to trust the logic of a half-sized thermocapsulary dehousing assister...  
 luke: hurry up!  come with me!  what are you waiting for?!  get in gear!  
 threepio: artoo! artoo-detoo, where are you?  
 threepio: at last!  where have you been?  
 threepio: they're heading in this direction. what are we going to do?  we'll be sent to the spice mines of kessel or smashed into who knows what!  
 threepio: wait a minute, where are you going?  
 imperial: officer the death star plans are not in the main computer.  
 vader: where are those transmissions you intercepted?  
 rebel: officer we intercepted no transmissions. aaah...  this is a consular ship. were on a diplomatic mission.  
 vader: if this is a consular ship... wh

In [4]:
text_file = open("All_SW_Scripts.txt", "w")
text_file.write(All_SW_Scripts)
text_file.close()

In [5]:
Text_Data = All_SW_Scripts

charindex = list(set(Text_Data))
charindex.sort() 
print(charindex)

np.save("charindex.npy", charindex)

print(len(Text_Data))

['\n', ' ', '!', '#', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '6', '7', '8', ':', ';', '?', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
166969


In [6]:
%%time
CHARS_SIZE = len(charindex)
SEQUENCE_LENGTH = 100
X_train = []
Y_train = []
for i in range(0, len(Text_Data)-SEQUENCE_LENGTH, 1 ): 
    X = Text_Data[i:i + SEQUENCE_LENGTH]
    Y = Text_Data[i + SEQUENCE_LENGTH]
    X_train.append([charindex.index(x) for x in X])
    Y_train.append(charindex.index(Y))

X_train = np.reshape(X_train, (len(X_train), SEQUENCE_LENGTH))

Y_train = np_utils.to_categorical(Y_train)

Wall time: 11.9 s


In [7]:
X_train.shape

(166869, 100)

In [8]:
Y_train.shape

(166869, 47)

In [9]:
def get_model():
    model = Sequential()
    inp = Input(shape=(SEQUENCE_LENGTH, ))
    x = Embedding(CHARS_SIZE, 100, trainable=False)(inp)
    x = CuDNNLSTM(512, return_sequences=True,)(x)
    x = CuDNNLSTM(512, return_sequences=True,)(x)
    x = CuDNNLSTM(512,)(x)
    x = Dense(256, activation="elu")(x)
    x = Dense(128, activation="elu")(x)
    outp = Dense(CHARS_SIZE, activation='softmax')(x)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'],
                 )

    return model

model = get_model()

model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          4700      
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 100, 512)          1257472   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 100, 512)          2101248   
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 512)               2101248   
____________________________________________________________

In [10]:
class TextSample(Callback):

    def __init__(self):
       super(Callback, self).__init__() 

    def on_epoch_end(self, epoch, logs={}):
        pattern = X_train[700]
        outp = []
        seed = [charindex[x] for x in pattern]
        sample = 'TextSample:' +''.join(seed)+'|'
        for t in range(100):
          x = np.reshape(pattern, (1, len(pattern)))
          pred = self.model.predict(x)
          result = np.argmax(pred)
          outp.append(result)
          pattern = np.append(pattern,result)
          pattern = pattern[1:len(pattern)]
        outp = [charindex[x] for x in outp]
        outp = ''.join(outp)
        sample += outp
        print(sample)

textsample = TextSample()

In [11]:
filepath="model_checkpoint.hdf5"

checkpoint = ModelCheckpoint(filepath,
                             monitor='loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min')

early = EarlyStopping(monitor="loss",
                      mode="min",
                      patience=1)

In [14]:
model_callbacks = [checkpoint, early]
model.fit(X_train, Y_train,
          batch_size=50,
          epochs=40,
          verbose=2,
          callbacks = model_callbacks)

Train on 166869 samples
Epoch 1/40

Epoch 00001: loss improved from inf to 1.98613, saving model to model_checkpoint.hdf5
166869/166869 - 230s - loss: 1.9861 - acc: 0.4306
Epoch 2/40

Epoch 00002: loss improved from 1.98613 to 1.38387, saving model to model_checkpoint.hdf5
166869/166869 - 235s - loss: 1.3839 - acc: 0.5846
Epoch 3/40

Epoch 00003: loss improved from 1.38387 to 1.18989, saving model to model_checkpoint.hdf5
166869/166869 - 236s - loss: 1.1899 - acc: 0.6351
Epoch 4/40

Epoch 00004: loss improved from 1.18989 to 1.07270, saving model to model_checkpoint.hdf5
166869/166869 - 235s - loss: 1.0727 - acc: 0.6656
Epoch 5/40

Epoch 00005: loss improved from 1.07270 to 0.98221, saving model to model_checkpoint.hdf5
166869/166869 - 235s - loss: 0.9822 - acc: 0.6903
Epoch 6/40

Epoch 00006: loss improved from 0.98221 to 0.89610, saving model to model_checkpoint.hdf5
166869/166869 - 238s - loss: 0.8961 - acc: 0.7122
Epoch 7/40

Epoch 00007: loss improved from 0.89610 to 0.81747, savi

KeyboardInterrupt: 

In [None]:
# model = load_model(filepath)
model.save_weights("full_train_weights.hdf5")
model.save("full_train_model.hdf5")