In [1]:
import re
import nltk
import spacy
import emoji
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

from keras import Sequential
from keras.layers import InputLayer, Dense, SimpleRNN, Embedding
from sklearn.model_selection import train_test_split

import wordcloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

nlp= spacy.load('en_core_web_sm')




In [2]:
data= pd.read_csv(r"D:\Datasets\Poems_text_NWP_NLP\poems-100.csv")

In [3]:
data.head()

Unnamed: 0,text
0,"O my Luve's like a red, red rose\nThat’s newly..."
1,"The rose is red,\nThe violet's blue,\nSugar is..."
2,How do I love thee? Let me count the ways.\nI ...
3,"Had I the heavens' embroidered cloths,\nEnwrou..."
4,"I.\n Enough! we're tired, my heart and I.\n..."


In [4]:
data.shape

(100, 1)

In [5]:
# preprocess the data
def data_preprocess(text):
    
    # normalize the data
    text= text.lower()

    # remove links
    text= re.sub(r'https?://\S+', ' ', text)

    # remove special characters
    text= re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # replace emoji with there meaning 
    text= emoji.demojize(text, delimiters= ',')
        
    return text

In [6]:
# ! pip install emoji

In [7]:
data['cleaned_text']= data['text'].apply(data_preprocess)

In [8]:
data

Unnamed: 0,text,cleaned_text
0,"O my Luve's like a red, red rose\nThat’s newly...",o my luve s like a red red rose that s newly s...
1,"The rose is red,\nThe violet's blue,\nSugar is...",the rose is red the violet s blue sugar is swe...
2,How do I love thee? Let me count the ways.\nI ...,how do i love thee let me count the ways i lov...
3,"Had I the heavens' embroidered cloths,\nEnwrou...",had i the heavens embroidered cloths enwrought...
4,"I.\n Enough! we're tired, my heart and I.\n...",i enough we re tired my heart and i we sit bes...
...,...,...
95,The city had withdrawn into itself\nAnd left a...,the city had withdrawn into itself and left at...
96,O gift of God! O perfect day:\n Whereon...,o gift of god o perfect day whereon shall no ...
97,"The world is too much with us; late and soon,\...",the world is too much with us late and soon ge...
98,To him who in the love of Nature holds\nCo...,to him who in the love of nature holds commun...


In [9]:
# Helper for tokenizing and stemming 

def stem_lemmatize(text):
    
    # for tokenizing the word
    text= text.split()
    
    # lemmatize the word
    lemma= WordNetLemmatizer()
    text= [lemma.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    #concatenating the tokens into sentence
    text= ' '.join(text)
    
    return text

In [10]:
data['cleaned_text']= data['cleaned_text'].apply(stem_lemmatize)

In [11]:
data

Unnamed: 0,text,cleaned_text
0,"O my Luve's like a red, red rose\nThat’s newly...",luve like red red rose newly sprung june luve ...
1,"The rose is red,\nThe violet's blue,\nSugar is...",rose red violet blue sugar sweet
2,How do I love thee? Let me count the ways.\nI ...,love thee let count way love thee depth breadt...
3,"Had I the heavens' embroidered cloths,\nEnwrou...",heaven embroidered cloth enwrought golden silv...
4,"I.\n Enough! we're tired, my heart and I.\n...",enough tired heart sit beside headstone thus w...
...,...,...
95,The city had withdrawn into itself\nAnd left a...,city withdrawn left last country country whirl...
96,O gift of God! O perfect day:\n Whereon...,gift god perfect day whereon shall man work pl...
97,"The world is too much with us; late and soon,\...",world much u late soon getting spending lay wa...
98,To him who in the love of Nature holds\nCo...,love nature hold communion visible form speaks...


In [12]:
data['text'][2]

"How do I love thee? Let me count the ways.\nI love thee to the depth and breadth and height\nMy soul can reach, when feeling out of sight\nFor the ends of being and ideal grace.\nI love thee to the level of every day's\nMost quiet need, by sun and candle-light.\nI love thee freely, as men strive for right.\nI love thee purely, as they turn from praise.\nI love thee with the passion put to use\nIn my old griefs, and with my childhood's faith.\nI love thee with a love I seemed to lose\nWith my lost saints. I love with the breath,\nSmiles, tears, of all my life; and, if God choose,\nI shall but love thee better after death.\n"

In [13]:
data['cleaned_text'][2]

'love thee let count way love thee depth breadth height soul reach feeling sight end ideal grace love thee level every day quiet need sun candle light love thee freely men strive right love thee purely turn praise love thee passion put use old grief childhood faith love thee love seemed lose lost saint love breath smile tear life god choose shall love thee better death'

In [14]:
# converting text to numerical format

# define the tokenizer object
tokenizer= Tokenizer()

# fitting tokenizer on the data --> get unique index for each word in the corpus 
tokenizer.fit_on_texts(data['cleaned_text'])


# check the dictionary 
print(len(set(tokenizer.index_word)))
tokenizer.index_word

4484


{1: 'one',
 2: 'love',
 3: 'shall',
 4: 'know',
 5: 'night',
 6: 'like',
 7: 'day',
 8: 'heart',
 9: 'see',
 10: 'life',
 11: 'thee',
 12: 'yet',
 13: 'come',
 14: 'long',
 15: 'man',
 16: 'old',
 17: 'thing',
 18: 'time',
 19: 'go',
 20: 'would',
 21: 'earth',
 22: 'eye',
 23: 'never',
 24: 'sea',
 25: 'men',
 26: 'look',
 27: 'hand',
 28: 'thou',
 29: 'well',
 30: 'every',
 31: 'ever',
 32: 'sun',
 33: 'thy',
 34: 'little',
 35: 'tree',
 36: 'light',
 37: 'god',
 38: 'u',
 39: 'voice',
 40: 'good',
 41: 'head',
 42: 'let',
 43: 'upon',
 44: 'heaven',
 45: 'take',
 46: 'nothing',
 47: 'stand',
 48: 'death',
 49: 'could',
 50: 'young',
 51: 'may',
 52: 'star',
 53: 'make',
 54: 'woman',
 55: 'sound',
 56: 'wood',
 57: 'still',
 58: 'till',
 59: 'sleep',
 60: 'must',
 61: 'say',
 62: 'air',
 63: 'sweet',
 64: 'soul',
 65: 'world',
 66: 'child',
 67: 'give',
 68: 'word',
 69: 'side',
 70: 'far',
 71: 'pas',
 72: 'first',
 73: 'back',
 74: 'wind',
 75: 'year',
 76: 'white',
 77: 'house',


In [15]:
# convert the text sentences into sequences of words using the vocabulary created the code 
# for converting the sentences into the sequence of text is as given.

sentences= tokenizer.texts_to_sequences(data['cleaned_text'])
print(sentences[0])

[298, 6, 128, 128, 226, 1078, 1785, 760, 298, 6, 1786, 1787, 181, 553, 203, 227, 28, 1788, 1789, 161, 298, 298, 11, 57, 182, 58, 24, 761, 253, 58, 24, 761, 253, 182, 149, 1790, 1791, 32, 298, 11, 57, 182, 427, 10, 3, 352, 428, 11, 29, 298, 428, 11, 29, 13, 298, 1079, 299, 100, 353]


In [16]:
data['cleaned_text'][0]

'luve like red red rose newly sprung june luve like melodie sweetly play tune fair art thou bonnie lass deep luve luve thee still dear till sea gang dry till sea gang dry dear rock melt wi sun luve thee still dear sand life shall run fare thee well luve fare thee well come luve tho ten thousand mile'

In [17]:
# determine the vocabulary size
vocab_size= len(tokenizer.index_word)+1   # 0 is reserved for padding so that’s why we added 1

In [18]:
vocab_size

4485

In [19]:
data= []

for sent in sentences:
    for i in range(1, len(sent)):
        
        seq= sent[i-1:i+1]
        data.append(seq)
                

In [20]:
# the data is a list of lists where the inner list has the first entry x and the second entry y.
data[0:10]

[[298, 6],
 [6, 128],
 [128, 128],
 [128, 226],
 [226, 1078],
 [1078, 1785],
 [1785, 760],
 [760, 298],
 [298, 6],
 [6, 1786]]

In [21]:
len(data)

12486

In [22]:
# splitting the data into X and y

# convert list into array
data_array= np.array(data)

In [23]:
print(data_array.shape)
data_array

(12486, 2)


array([[ 298,    6],
       [   6,  128],
       [ 128,  128],
       ...,
       [ 143,  151],
       [ 151, 1784],
       [1784, 1075]])

In [24]:
# convert it into X and y

X, y= data_array[:, 0], data_array[:, 1]

In [25]:
X

array([ 298,    6,  128, ...,  143,  151, 1784])

In [26]:
y

array([   6,  128,  128, ...,  151, 1784, 1075])

In [27]:
#convert the ouputs into the one hot vector over all the unique words
y= to_categorical(y, vocab_size)

In [28]:
# Build the model

model = Sequential()
model.add(Embedding(vocab_size, 2, input_length= 1))
# this line of code mean creating LSTM with 100 cell/neuron with each cell/neuron has its 
# own forget gate input gate and output gate
model.add(keras.layers.LSTM(100))
model.add(keras.layers.Dropout(0.2))
model.add(Dense(vocab_size, activation= 'softmax'))


model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 2)              8970      
                                                                 
 lstm (LSTM)                 (None, 100)               41200     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 4485)              452985    
                                                                 
Total params: 503155 (1.92 MB)
Trainable params: 503155 (1.92 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [30]:
# training and evaluation
model.fit(X, y, epochs= 45)

Epoch 1/45


Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<keras.src.callbacks.History at 0x16536174110>

In [31]:
def generate_text(model, tokenizer, enter_text, n_pred):
    
    input_text, result= enter_text, enter_text
#     print(input_text, result)
    
    for i in range(n_pred):
        
        encoded= tokenizer.texts_to_sequences([input_text])[0]
        encoded= np.array(encoded)
        
        predict= model.predict(encoded, verbose= 0)
        y_pred= np.argmax(predict, axis= 1)
        out_word= ''
        
        for word, index in tokenizer.word_index.items():
            if index==y_pred:
                out_word= word
                break
                
    # append to input
    input_text, result= out_word, result+' '+out_word
            
    return result
                

In [46]:
print(generate_text(model, tokenizer, 'love', 13))

love love


In [55]:
encoder_inputs = keras.layers.Input(shape=(None, 256))
encoder = keras.layers.LSTM(100, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

In [56]:
encoder_outputs, state_h, state_c

(<KerasTensor: shape=(None, 100) dtype=float32 (created by layer 'lstm_4')>,
 <KerasTensor: shape=(None, 100) dtype=float32 (created by layer 'lstm_4')>,
 <KerasTensor: shape=(None, 100) dtype=float32 (created by layer 'lstm_4')>)