In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os

from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD
# sys.version

2024-07-22 21:29:33.992967: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Loading the data

In [2]:
input_texts = []
target_texts = []

In [3]:
for line in open("../poetry_generator/data/robert_frost.txt"):
    line = line.rstrip()
    if not line:
        continue
        
    input_line = '<sos>' + line
    target_line = line + '<eos>'
    
    input_texts.append(input_line)
    target_texts.append(target_line)

In [4]:
all_lines = input_texts + target_texts
all_lines[:15]

['<sos>Two roads diverged in a yellow wood,',
 '<sos>And sorry I could not travel both',
 '<sos>And be one traveler, long I stood',
 '<sos>And looked down one as far as I could',
 '<sos>To where it bent in the undergrowth;',
 '<sos>Then took the other, as just as fair,',
 '<sos>And having perhaps the better claim',
 '<sos>Because it was grassy and wanted wear,',
 '<sos>Though as for that the passing there',
 '<sos>Had worn them really about the same,',
 '<sos>And both that morning equally lay',
 '<sos>In leaves no step had trodden black.',
 '<sos>Oh, I kept the first for another day!',
 '<sos>Yet knowing how way leads on to way',
 '<sos>I doubted if I should ever come back.']

In [5]:
len(all_lines)

2872

### Converting sentences into integers and word to integer mapping
#### Tokenizer 1) Splits into individual tokens/words 2) each word is converted into an integer index for mapping 

In [6]:
from keras.preprocessing.text import Tokenizer

MAX_VOCAB_SIZE = 3000
MAX_SEQUENCE_LENGTH = 100

In [7]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer.fit_on_texts(all_lines)
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

target_sequences[:15]

[[133, 571, 572, 7, 3, 573],
 [4, 574, 5, 66, 28, 984],
 [4, 24, 25, 985, 163, 5],
 [4, 181, 69, 25, 14, 137, 14, 5],
 [2, 39, 8, 986, 7, 1],
 [130, 197, 1, 575, 14, 73, 14],
 [4, 145, 305, 1, 260],
 [440, 8, 12, 576, 4, 244],
 [155, 14, 13, 11, 1, 577, 2280],
 [21, 579, 50, 415, 122, 1],
 [4, 153, 11, 987, 988, 2281],
 [7, 989, 44, 581, 21, 990, 2282],
 [941, 5, 261, 1, 262, 13, 323],
 [191, 992, 106, 80, 993, 15, 2],
 [5, 994, 29, 5, 126, 154, 86, 2283]]

In [8]:
print(input_sequences[:15])

max_seq_len_from_data = max(len(s) for s in input_sequences)
print('Maximum sequence length', max_seq_len_from_data)


[[413, 571, 572, 7, 3, 573, 808], [16, 574, 5, 66, 28, 984, 153], [16, 24, 25, 985, 163, 5, 258], [16, 181, 69, 25, 14, 137, 14, 5, 66], [59, 39, 8, 986, 7, 1, 2408], [259, 197, 1, 575, 14, 73, 14, 2409], [16, 145, 305, 1, 260, 2410], [2411, 8, 12, 576, 4, 244, 2412], [414, 14, 13, 11, 1, 577, 58], [578, 579, 50, 415, 122, 1, 809], [16, 153, 11, 987, 988, 580], [102, 989, 44, 581, 21, 990, 991], [2413, 5, 261, 1, 262, 13, 323, 2414], [810, 992, 106, 80, 993, 15, 2, 80], [30, 994, 29, 5, 126, 154, 86, 324]]
Maximum sequence length 11


#### word to integer mapping

In [9]:
word_to_index = tokenizer.word_index
print('unique tokens =',len(word_to_index))
# assert('<sos>' in word_to_index)
# assert('<eos>' in word_to_index)

unique tokens = 4614


### Padding sequences --> by adding padding tokens, all sequences of varying lengths have same length so that they can be effectively processed by the model

In [10]:
max_seq_len = min(max_seq_len_from_data, MAX_SEQUENCE_LENGTH)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_len, padding='post')

In [11]:
print('Input sequence is {} with the size of {}' .format(input_sequences[0], input_sequences.shape[1]))
print('Target sequence is {} with the size of {}' .format(target_sequences[0], target_sequences.shape[1]))

Input sequence is [413 571 572   7   3 573 808   0   0   0   0] with the size of 11
Target sequence is [133 571 572   7   3 573   0   0   0   0   0] with the size of 11


### Load pre-trained word vectors with GloVe and word --> vec mapping

In [14]:
path2glove = os.getcwd() + '/glove/glove.6B.50d.txt'

In [34]:
word2vec = {}
with open(path2glove) as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        
    print('Found {} word vectors.' .format(len(word2vec)))

Found 400000 word vectors.


In [45]:
from itertools import islice

dict(islice(word2vec.items(), 3))

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
        -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
         2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
         1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
        -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
        -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
         4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
         7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
        -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
         1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
       dtype=float32),
 ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
        -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
        -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
        -0.4

In [40]:
from itertools import islice

word2vec_first_10 = dict(islice(word2vec.items(), 3))

print(word2vec_first_10)

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32), ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
       -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
       -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
       -0.41634 , -0.1542