In [None]:
import spacy
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from pickle import dump,load

In [None]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components switched off; the notebook only reads token.text from it.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'tagger', 'ner'],
)

In [None]:
def read_book(path, encoding="utf-8"):
    """Return the full contents of a text file as a single string.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    # An explicit encoding keeps behaviour identical across machines.
    with open(path, 'r', encoding=encoding) as f:
        return f.read()


In [None]:
# Tokenize text with spaCy, dropping punctuation/whitespace tokens and
# lowercasing everything that is kept.
def seperate_punctation(text):
    """Return the lowercased spaCy tokens of ``text`` minus special characters.

    A token is dropped when its text occurs as a substring of the filter
    string below (punctuation marks, runs of newlines, tabs, dashes).
    """
    ignored = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    kept = []
    for token in nlp(text):
        word = token.text
        if word in ignored:
            continue
        kept.append(word.lower())
    return kept


In [None]:
# Raise spaCy's limit on input length so the whole book can be passed to
# nlp() in a single call; adjust this value to fit the text being processed.
nlp.max_length = 1_198_623

In [None]:
# Load the raw text of the book, then turn it into a list of cleaned tokens
book = read_book("/content/drive/MyDrive/DataSets/moby_dick_four_chapters.txt")
tokens = seperate_punctation(book)

In [None]:
# Build overlapping training windows: 25 context tokens plus the 1 token to predict
len_sequences = 25 + 1
# Slide the window from the very first token up to (and including) the last
# position where a full window still fits, so no text at either end is skipped.
# If the text is shorter than one window the result is simply an empty list.
text_sequences = [
    tokens[start:start + len_sequences]
    for start in range(len(tokens) - len_sequences + 1)
]

In [None]:
# Fit a Keras Tokenizer on the token windows so it learns the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [None]:
# Convert each window of words into the tokenizer's integer ids
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Number of distinct words the tokenizer has counted
vocabulary_size = len(tokenizer.word_counts)
# Turn the list of id sequences into a NumPy array so it can be sliced by column
sequences = np.array(sequences)

In [None]:
# Inputs: every column except the last one of each row
x = sequences[:, :-1]
# Targets: the last column, one-hot encoded; the extra class covers id 0
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size + 1)
# Length of each input row, reused when building the model and padding seeds
seq_len = x.shape[1]

In [None]:

def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=50, dense_units=50):
    """Build and compile the next-word prediction network.

    Args:
        vocabulary_size: Number of classes; used both as the embedding's input
            dimension and as the width of the softmax output layer.
        seq_len: Number of token ids in each input sequence. Declared as the
            model's input shape so the model is built immediately and
            ``model.summary()`` can report output shapes and parameter counts.
        embedding_dim: Size of each embedding vector (default 25).
        lstm_units: Units in each of the two stacked LSTM layers (default 50).
        dense_units: Units in the hidden dense layer (default 50).

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential([
        # Explicit input spec: previously seq_len was accepted but never used.
        tf.keras.Input(shape=(seq_len,)),
        Embedding(vocabulary_size, embedding_dim),
        # The first LSTM returns the full sequence so a second LSTM can consume it.
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(dense_units, activation="relu"),
        Dense(vocabulary_size, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Requires vocabulary_size and seq_len from the cells above. One extra class
# is added so the output layer matches the one-hot width used for y.
model = create_model(
    vocabulary_size=vocabulary_size + 1,
    seq_len=seq_len,
)
model.summary()


In [None]:
# Train on the full data set: 300 epochs, mini-batches of 128, with progress output
model.fit(
    x,
    y,
    batch_size=128,
    epochs=300,
    verbose=True,
)

Epoch 1/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 80ms/step - accuracy: 0.0414 - loss: 7.4409
Epoch 2/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 59ms/step - accuracy: 0.0518 - loss: 6.3847
Epoch 3/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 81ms/step - accuracy: 0.0515 - loss: 6.3526
Epoch 4/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - accuracy: 0.0498 - loss: 6.2462
Epoch 5/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 82ms/step - accuracy: 0.0512 - loss: 6.1353
Epoch 6/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - accuracy: 0.0536 - loss: 6.0178
Epoch 7/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 114ms/step - accuracy: 0.0550 - loss: 5.9114
Epoch 8/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - accuracy: 0.0597 - loss: 5.8526
Epoch 9/300
[1m89/89[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x7ee96d4d6c20>

In [None]:
# Persist the trained model to Google Drive as an .h5 file
model.save(
    "/content/drive/MyDrive/DataSets/chatbot_model.h5"
)



In [None]:
# Pickle the fitted tokenizer next to the model. The context manager makes
# sure the file is flushed and closed even if pickling fails part-way.
with open("/content/drive/MyDrive/DataSets/tokenizer_chatbot", 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Greedily extend ``seed_text`` by ``num_gen_words`` predicted words.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-class scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from class index to word.
        seq_len: Number of token ids fed to the model at each step.
        seed_text: Text used as the starting context.
        num_gen_words: How many words to generate.

    Returns:
        The generated words joined by single spaces; the seed text itself is
        not included. An empty string when ``num_gen_words`` is 0.
    """
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Keep only the most recent seq_len ids (older ones are cut from the front).
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        class_scores = model.predict(pad_encoded, verbose=0)[0]
        # Class 0 is the padding id and has no entry in tokenizer.index_word,
        # so choose the best-scoring class among indices >= 1 to avoid a KeyError.
        pred_word_ind = int(np.argmax(class_scores[1:], axis=-1)) + 1
        pred_word = tokenizer.index_word[pred_word_ind]
        # Feed the prediction back in as context for the next step.
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)



In [None]:
import random
# randrange excludes its upper bound, so the pick is always a valid index
# (randint(0, len(...)) could return len(...) and raise IndexError).
random_pick = random.randrange(len(text_sequences))
random_seed_text = text_sequences[random_pick]
# Join the chosen window of tokens back into a single space-separated string
seed_text = " ".join(random_seed_text)
seed_text

'frost is all on the outside or whether thou observest it from that sashless window where the frost is on both sides and of which the'

In [None]:
# Continue the current seed text with ten generated words
generate_text(
    model,
    tokenizer,
    seq_len,
    seed_text=seed_text,
    num_gen_words=10,
)

In [None]:
# Ask the user for the text the model should continue
seed_text = input(
    "The story line Which needs to be continued:"
)

In [None]:
# Continue the user-supplied seed text with ten generated words
generate_text(
    model,
    tokenizer,
    seq_len,
    seed_text=seed_text,
    num_gen_words=10,
)

'it with the middle of the dreary street shouldering i'