In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,Dense
import matplotlib.pyplot as plt

In [2]:
!gdown --id 108jAePKK4R3BVYBbYJZ32JWUwxeMg20K

Downloading...
From: https://drive.google.com/uc?id=108jAePKK4R3BVYBbYJZ32JWUwxeMg20K
To: f:\Code\NLP\04_Week\sonnets.txt

  0%|          | 0.00/93.6k [00:00<?, ?B/s]
100%|██████████| 93.6k/93.6k [00:00<00:00, 612kB/s]
100%|██████████| 93.6k/93.6k [00:00<00:00, 612kB/s]


In [3]:
# Read the corpus from the working directory (gdown saves there by default);
# a relative path keeps the notebook runnable on machines without an F: drive.
data_path='sonnets.txt'
with open(data_path) as f:
    data=f.read()

In [4]:
# Lowercase the whole text and split it into one string per line.
corpus=data.lower().split('\n')

In [5]:
# Number of lines, followed by a preview of the first five.
print(len(corpus))
# Slicing (rather than indexing 0..4) cannot raise IndexError on a short corpus.
print('\n'.join(corpus[:5]))

2159
from fairest creatures we desire increase,
that thereby beauty's rose might never die,
but as the riper should by time decease,
his tender heir might bear his memory:
but thou, contracted to thine own bright eyes,


Tokenizing the texts

In [6]:
# Build the word -> index vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words: index 0 is not assigned to any
# word (it appears as the padding value in the padded sequences).
total_words = 1 + len(tokenizer.word_index)
total_words

3211

In [7]:
# First line of the corpus, used as the running example in the next cells.
corpus[0]

'from fairest creatures we desire increase,'

In [8]:
# Pitfall demo: a bare string is iterated character by character, so the
# result holds one list per character (empty unless that character happens
# to have an index in the vocabulary).
tokenizer.texts_to_sequences(corpus[0])

[[],
 [],
 [58],
 [],
 [],
 [],
 [17],
 [6],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [17],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [6],
 [],
 [],
 [],
 [6],
 [],
 [],
 [],
 [],
 [17],
 [],
 [],
 []]

In [9]:
# Wrapping the line in a list yields a single sequence for the whole line.
tokenizer.texts_to_sequences([corpus[0]])

[[34, 417, 877, 166, 213, 517]]

In [10]:
# [0] unwraps the one-element batch to get the flat list of token ids.
tokenizer.texts_to_sequences([corpus[0]])[0]

[34, 417, 877, 166, 213, 517]

Generating n-grams

In [11]:
def n_gram_seqs(corpus, tokenizer):
    """
    Generates a list of n-gram sequences

    Every line is converted to token ids and each prefix of at least two
    tokens becomes one sequence, e.g. [a, b, c] -> [a, b], [a, b, c].
    Lines that yield fewer than two tokens contribute nothing.

    Args:
        corpus (list of string): lines of texts to generate n-grams for;
            a single string is treated as a one-line corpus
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary

    Returns:
        input_sequences (list of list of int): the n-gram sequences for each line in the corpus
    """
    # Iterating over a bare string would hand the tokenizer one character at
    # a time, which cannot produce a two-token prefix; wrap it in a list so a
    # single line behaves like a one-line corpus.
    if isinstance(corpus, str):
        corpus = [corpus]

    input_sequences = []

    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2 .. len(token_list), shortest first.
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    return input_sequences

In [12]:
# Sanity check: n-grams for the first line only (passed as a one-element list).
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)

print("n_gram sequences for first example look like this:\n")
first_example_sequence

n_gram sequences for first example look like this:



[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517]]

In [13]:
# Same input expressed as a slice: corpus[:1] is already a one-element list.
n_gram_1=n_gram_seqs(corpus[:1], tokenizer)
n_gram_1

[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517]]

In [14]:
# n-grams for corpus lines 1..3; sequences of consecutive lines are
# appended one after another in a single flat list.
next_3_examples=n_gram_seqs(corpus[1:4],tokenizer)
next_3_examples

[[8, 878],
 [8, 878, 134],
 [8, 878, 134, 351],
 [8, 878, 134, 351, 102],
 [8, 878, 134, 351, 102, 156],
 [8, 878, 134, 351, 102, 156, 199],
 [16, 22],
 [16, 22, 2],
 [16, 22, 2, 879],
 [16, 22, 2, 879, 61],
 [16, 22, 2, 879, 61, 30],
 [16, 22, 2, 879, 61, 30, 48],
 [16, 22, 2, 879, 61, 30, 48, 634],
 [25, 311],
 [25, 311, 635],
 [25, 311, 635, 102],
 [25, 311, 635, 102, 200],
 [25, 311, 635, 102, 200, 25],
 [25, 311, 635, 102, 200, 25, 278]]

In [15]:
# Total number of n-grams produced by the three lines.
len(next_3_examples)

19

In [16]:
# Build the n-gram sequences for the whole corpus and print how many there are.
input_sequences=n_gram_seqs(corpus,tokenizer)
print(len(input_sequences))

15462


In [17]:
# Length of the longest n-gram sequence in the corpus.
max_len = max(map(len, input_sequences))
max_len

11

Padded Sequences

In [18]:
# Pad the first example's n-grams to the length of its longest sequence.
longest_in_first_example = max(map(len, first_example_sequence))
padded_seq = pad_sequences(first_example_sequence, maxlen=longest_in_first_example)
padded_seq

array([[  0,   0,   0,   0,  34, 417],
       [  0,   0,   0,  34, 417, 877],
       [  0,   0,  34, 417, 877, 166],
       [  0,  34, 417, 877, 166, 213],
       [ 34, 417, 877, 166, 213, 517]])

In [19]:
# Same padding for the next three lines; the target length is the longest
# n-gram across all three of them.
longest_in_next_3 = max(map(len, next_3_examples))
padded_seq_next_3 = pad_sequences(next_3_examples, maxlen=longest_in_next_3)
padded_seq_next_3

array([[  0,   0,   0,   0,   0,   0,   8, 878],
       [  0,   0,   0,   0,   0,   8, 878, 134],
       [  0,   0,   0,   0,   8, 878, 134, 351],
       [  0,   0,   0,   8, 878, 134, 351, 102],
       [  0,   0,   8, 878, 134, 351, 102, 156],
       [  0,   8, 878, 134, 351, 102, 156, 199],
       [  0,   0,   0,   0,   0,   0,  16,  22],
       [  0,   0,   0,   0,   0,  16,  22,   2],
       [  0,   0,   0,   0,  16,  22,   2, 879],
       [  0,   0,   0,  16,  22,   2, 879,  61],
       [  0,   0,  16,  22,   2, 879,  61,  30],
       [  0,  16,  22,   2, 879,  61,  30,  48],
       [ 16,  22,   2, 879,  61,  30,  48, 634],
       [  0,   0,   0,   0,   0,   0,  25, 311],
       [  0,   0,   0,   0,   0,  25, 311, 635],
       [  0,   0,   0,   0,  25, 311, 635, 102],
       [  0,   0,   0,  25, 311, 635, 102, 200],
       [  0,   0,  25, 311, 635, 102, 200,  25],
       [  0,  25, 311, 635, 102, 200,  25, 278]])

In [20]:
# Pad every n-gram of the corpus to the length of the longest one.
max_sequence_length = max(map(len, input_sequences))
padded_whole_corpus = pad_sequences(input_sequences, maxlen=max_sequence_length)
padded_whole_corpus.shape

(15462, 11)

Splitting into features and labels

In [21]:
# Features and labels for example 1: the last token of each padded row is the
# label, everything before it is the feature vector.
features_for_1, labels_for_1 = padded_seq[:, :-1], padded_seq[:, -1]
one_hot_encode_for_1 = to_categorical(labels_for_1, num_classes=total_words)
print(one_hot_encode_for_1.shape)
features_for_1


(5, 3211)


array([[  0,   0,   0,   0,  34],
       [  0,   0,   0,  34, 417],
       [  0,   0,  34, 417, 877],
       [  0,  34, 417, 877, 166],
       [ 34, 417, 877, 166, 213]])

In [22]:
# Same split for the whole corpus: all but the last column are features,
# the last column is the label, one-hot encoded over the vocabulary.
features, labels = padded_whole_corpus[:, :-1], padded_whole_corpus[:, -1]
one_hot_encode = to_categorical(labels, num_classes=total_words)
for array in (features, one_hot_encode):
    print(array.shape)

(15462, 10)
(15462, 3211)


Creating the model

In [23]:
# Embedding -> bidirectional LSTM -> softmax over the vocabulary.
# The input length is one less than the padded length because the last token
# of every sequence was split off as the label.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length - 1),
    Bidirectional(LSTM(150)),
    Dense(total_words, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [24]:
# Train for 50 epochs on the feature/one-hot-label pairs; the returned
# object is kept because its .history dict is read by the plotting cell.
history=model.fit(features,one_hot_encode,epochs=50,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [26]:
# Per-epoch training metrics recorded by model.fit.
acc=history.history['accuracy']
loss=history.history['loss']
epochs=range(len(acc))

# One figure per metric. plt.figure() must come BEFORE the plot call it
# belongs to; calling it afterwards only opens an extra, empty figure.
plt.figure()
plt.plot(epochs,acc,'b',label='Training Accuracy')
plt.title('Training accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.figure()
plt.plot(epochs,loss,'r',label='Training Loss')
plt.title('Training loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

: 