## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding
from tensorflow.keras.optimizers import Adam
import nltk
import string



## Importing the file

In [2]:
text=""
with open('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt', 'r') as file:
    text=file.readlines()
# text

In [3]:
df=pd.DataFrame({"text":text})
df

Unnamed: 0,text
0,CHAPTER 1\n
1,\n
2,Loomings.\n
3,\n
4,\n
...,...
22418,"sheathed beaks. On the second day, a sail dre..."
22419,picked me up at last. It was the devious-crui...
22420,her retracing search after her missing childre...
22421,orphan.\n


## Preprocessing

In [4]:
def process_text(text):
    text=text.lower()
    text=re.sub(r'\n+', " ", text)
    for i in string.punctuation:
        text=text.replace(i, "")
    return text

In [5]:
df['cleaned_text']=df['text'].apply(process_text)

In [6]:
df=df[((df != ' ').all(1))].reset_index()
df['n_words']=df['cleaned_text'].str.split(' ').apply(lambda x: len(x))
df=df.loc[df['n_words']>7].reset_index()

In [7]:
df

Unnamed: 0,level_0,index,text,cleaned_text,n_words
0,2,5,Call me Ishmael. Some years ago--never mind h...,call me ishmael some years agonever mind how ...,11
1,3,6,precisely--having little or no money in my pur...,preciselyhaving little or no money in my purse...,11
2,4,7,"particular to interest me on shore, I thought ...",particular to interest me on shore i thought i...,14
3,5,8,little and see the watery part of the world. ...,little and see the watery part of the world i...,18
4,6,9,driving off the spleen and regulating the circ...,driving off the spleen and regulating the circ...,12
...,...,...,...,...,...
17555,19243,22416,soft and dirgelike main. The unharming sharks...,soft and dirgelike main the unharming sharks ...,14
17556,19244,22417,with padlocks on their mouths; the savage sea-...,with padlocks on their mouths the savage seaha...,11
17557,19245,22418,"sheathed beaks. On the second day, a sail dre...",sheathed beaks on the second day a sail drew ...,14
17558,19246,22419,picked me up at last. It was the devious-crui...,picked me up at last it was the deviouscruisi...,14


## Tokenization

In [8]:
tokenizer=Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(df['cleaned_text'])
total_words=len(tokenizer.word_index)+1

In [9]:
print(f"Total Words: {total_words}")

Total Words: 19116


In [10]:
input_sequences=[]

for line in df['cleaned_text']:
    token_list=tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
        n_gram=token_list[:i+1]
        input_sequences.append(n_gram)
        
print("Total number of sequences: ", len(input_sequences))

Total number of sequences:  185497


In [11]:
max_sequence_len=max([len(x) for x in input_sequences])
print("Max sequence length:", max_sequence_len)
input_sequences=np.array(pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len))

Max sequence length: 18


In [12]:
input_sequences[45]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 2003,  130,    2], dtype=int32)

## X and y

In [13]:
X, labels=input_sequences[:, :-1], input_sequences[:, -1]

In [14]:
ind=np.random.choice(X.shape[0], size=20000)
X=X[ind]

In [15]:
X.shape

(20000, 17)

In [16]:
y=tf.keras.utils.to_categorical(labels, num_classes=total_words)
y=y[ind]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

## Modelling

In [18]:
tf.keras.backend.clear_session()
model=Sequential()
model.add(Embedding(total_words, 200, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Bidirectional(LSTM(50)))
model.add(Dense(total_words, activation='softmax'))

adam=Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [19]:
history=model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Testing

In [22]:
reversed_wi={value:key for key, value in tokenizer.word_index.items()}

seed_text="picked me up at last"
next_words = 10

for _ in range(next_words):
    token_list=tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], padding='pre', maxlen=max_sequence_len-1)
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word=reversed_wi[predicted[0]]
    seed_text+=" "+output_word

print(seed_text)

picked me up at last the whale and have the whale and have have the


## Inference

Here it is pretty visible that the model is not performing very well.
This is due to the following reasons:
- Better model training for more epochs
- Improving model architexture to include more layers and more units per layers
- Using transformer architecture over RNN family because Transformers better understands the contexts and is even quicker because of parallel processing.
- Also, I had reduced the size of the input X because of out of memory issues so adding more rows can also be helpful