# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import json

# Show full cell contents when rendering DataFrames (None removes the column-width limit),
# so long quotes are not truncated in the previews below.
pd.options.display.max_colwidth = None

In [2]:
# Load the quotes dataset from 'Quotes.csv' (path is relative to the working directory).
data = pd.read_csv('Quotes.csv')
# Print (rows, columns) as a quick sanity check of the load.
print(data.shape)
# Preview the first rows; being the last expression, this is rendered as the cell output.
data.head()

(79988, 1)


Unnamed: 0,Quote
0,i love deadlines i love the whooshing noise they make as they go by
1,there is no greater agony than bearing an untold story inside you
2,what really knocks me out is a book that when youre all done reading it you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it that doesnt happen much though
3,if theres a book that you want to read but it hasnt been written yet then you must write it
4,there is nothing to writing all you do is sit down at a typewriter and bleed


In [3]:
# Build the character vocabulary: every distinct character that occurs in the
# quotes (joined with single spaces), in sorted order.
all_text = ' '.join(data['Quote'].values.reshape(-1,))
vocab = sorted(set(all_text))
# '~' is appended last; the encoding step uses it as the padding symbol.
vocab.append('~')
print(vocab)
# Write through a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file handle.
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file)

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~']


In [4]:
# Map each vocabulary character to its position in `vocab`.
char_to_ind = {char: index for index, char in enumerate(vocab)}
print(char_to_ind)
# Write through a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file handle.
with open('vocab_mapping.json', 'w') as mapping_file:
    json.dump(char_to_ind, mapping_file)

{' ': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36, '~': 37}


In [5]:
# Extract the 'Quote' column as a NumPy array (one entry per row of `data`).
text = data['Quote'].values

In [6]:
def encode(text, max_len=608, pad_char='~'):
    """Convert a quote into a fixed-length array of character indices.

    Every character is looked up in the module-level ``char_to_ind`` mapping and
    the result is right-padded with the index of ``pad_char`` up to ``max_len``.

    Parameters
    ----------
    text : str
        Quote to encode; each character must be a key of ``char_to_ind``.
    max_len : int, default 608
        Length of the returned array (608 was picked to be divisible by 16).
    pad_char : str, default '~'
        Character whose index is used as padding; must be a key of ``char_to_ind``.

    Returns
    -------
    numpy.ndarray
        1-D array holding ``max_len`` integer indices.

    Raises
    ------
    KeyError
        If ``pad_char`` or any character of ``text`` is missing from ``char_to_ind``.
    ValueError
        If ``text`` is longer than ``max_len``. Without this check the result would
        come back unpadded and over-long, producing ragged rows downstream.
    """
    if len(text) > max_len:
        raise ValueError(
            f"text has {len(text)} characters, which exceeds max_len={max_len}"
        )
    # Look the padding index up instead of hard-coding it, so it stays correct
    # if the vocabulary ever changes.
    pad_id = char_to_ind[pad_char]
    indices = [char_to_ind[c] for c in text]
    indices.extend([pad_id] * (max_len - len(indices)))
    return np.array(indices)

In [7]:
# Encode every quote into a fixed-length row, stack the rows into one 2-D matrix
# and store it as int8 (the character indices are small enough to fit).
encoded = np.array([encode(quote) for quote in text]).astype('int8')
encoded

array([[19,  0, 22, ..., 37, 37, 37],
       [30, 18, 15, ..., 37, 37, 37],
       [33, 18, 11, ..., 37, 37, 37],
       ...,
       [24, 15, 32, ..., 37, 37, 37],
       [29, 30, 25, ..., 37, 37, 37],
       [14, 25, 24, ..., 37, 37, 37]], dtype=int8)

In [8]:
# Inspect the (rows, columns) shape of the encoded matrix.
encoded.shape

(79988, 608)

In [9]:
# Persist the encoded matrix to 'encoded.npy' so it can be reloaded without re-encoding.
np.save('encoded.npy', encoded)

In [10]:
import tensorflow as tf

In [11]:
# Number of input characters per training example; each chunk holds one extra
# character so inputs and targets can be offset by one step.
seq_len = 15
# Flatten the encoded matrix into a single character stream. reshape(-1) infers
# the length, so this keeps working if the number of quotes or the pad length changes.
# NOTE(review): the stream includes each row's '~' padding, and chunks can span
# the boundary between two quotes — confirm this is intended.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded.reshape(-1))
# Cut the stream into fixed-size chunks, dropping the incomplete tail.
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [12]:
def create_seq_targets(seq):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return seq[:-1], seq[1:]

In [13]:
# Turn every chunk into an (input, target) pair via create_seq_targets.
dataset = sequences.map(create_seq_targets)

In [14]:
batch_size = 128   # examples per training batch
buffer_size = 10000  # number of elements held in the shuffle buffer

# Build the pipeline from `sequences` rather than re-assigning `dataset` from itself:
# re-running this cell then yields the same pipeline instead of shuffling and
# batching an already batched dataset.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

# Modelling

In [15]:
# Number of distinct symbols the model reads and predicts (includes the '~' entry of vocab).
vocab_size = len(vocab)
# Size of the character embedding vectors.
embed_dim = 64
# Number of units in the GRU layer.
rnn_neurons = 1024

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [17]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy that treats ``y_pred`` as raw logits.

    ``y_true`` holds integer class indices; ``y_pred`` holds unnormalised
    scores, so ``from_logits=True`` is passed to the Keras loss.
    """
    return sparse_categorical_crossentropy(
        y_true=y_true,
        y_pred=y_pred,
        from_logits=True,
    )

In [18]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the character-level GRU model.

    Architecture: Embedding -> stateful GRU (returns the full sequence) ->
    Dense with ``vocab_size`` outputs and no activation, i.e. one logit per
    character at every time step.

    Parameters
    ----------
    vocab_size : int
        Number of distinct characters (embedding input size and output width).
    embed_dim : int
        Dimension of the embedding vectors.
    rnn_neurons : int
        Number of GRU units.
    batch_size : int
        Fixed batch size baked into the input shape; the sequence length is
        left unspecified (``None``).

    Returns
    -------
    A compiled ``Sequential`` model using the Adam optimizer and
    ``sparse_cat_loss``.
    """
    layers = [
        Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        Dense(vocab_size),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss=sparse_cat_loss)
    return network

In [19]:
# Instantiate the network with the hyperparameters chosen above
# (arguments follow create_model's order: vocab size, embedding dim, GRU units, batch size).
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size)

# Print the layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           2432      
_________________________________________________________________
gru (GRU)                    (128, None, 1024)         3348480   
_________________________________________________________________
dense (Dense)                (128, None, 38)           38950     
Total params: 3,389,862
Trainable params: 3,389,862
Non-trainable params: 0
_________________________________________________________________


In [20]:
# All callbacks watch the training loss ('loss'), lower being better.
filepath = "model.h5"

# Stop training once the loss has not improved for 7 consecutive epochs.
es = EarlyStopping(
    monitor='loss',
    mode='min',
    patience=7,
    verbose=1,
)

# Save the model to `filepath`, overwriting it only when the loss improves.
ckpt = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 3 epochs without improvement.
rlp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
)

In [None]:
# Train for up to 30 epochs on the batched dataset; the callbacks handle early
# stopping, checkpointing and learning-rate reduction.
training_callbacks = [es, ckpt, rlp]
history = model.fit(dataset, epochs=30, callbacks=training_callbacks)