In [97]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [67]:
## open the dataset chatbot
with open('chatbot.txt', 'r', encoding='utf-8') as file:
    text_file = file.read()

In [68]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_file])

In [99]:
total_words = len(tokenizer.word_index) + 1
total_words

2737

In [70]:
tokenizer.word_index

{'i': 1,
 'you': 2,
 'the': 3,
 'to': 4,
 'a': 5,
 'it': 6,
 'that': 7,
 'do': 8,
 'what': 9,
 'of': 10,
 'and': 11,
 'is': 12,
 'have': 13,
 'are': 14,
 'in': 15,
 'so': 16,
 'like': 17,
 'they': 18,
 'was': 19,
 "it's": 20,
 'did': 21,
 'yes': 22,
 'for': 23,
 'my': 24,
 'about': 25,
 'but': 26,
 "don't": 27,
 'on': 28,
 'be': 29,
 'no': 30,
 'he': 31,
 "i'm": 32,
 'me': 33,
 "that's": 34,
 'we': 35,
 'how': 36,
 'your': 37,
 'too': 38,
 'go': 39,
 'not': 40,
 'good': 41,
 'think': 42,
 'going': 43,
 'why': 44,
 'will': 45,
 'with': 46,
 'really': 47,
 'well': 48,
 'at': 49,
 'want': 50,
 'get': 51,
 'know': 52,
 'just': 53,
 'all': 54,
 'there': 55,
 'one': 56,
 "i'll": 57,
 'can': 58,
 'this': 59,
 'would': 60,
 'see': 61,
 'if': 62,
 "you're": 63,
 'people': 64,
 'nice': 65,
 'out': 66,
 'then': 67,
 'great': 68,
 'right': 69,
 'time': 70,
 'she': 71,
 'should': 72,
 'day': 73,
 "what's": 74,
 "didn't": 75,
 'new': 76,
 'up': 77,
 'oh': 78,
 'maybe': 79,
 'need': 80,
 'her': 81,
 

In [77]:
input_sequences = []
for line in text_file.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [78]:
input_sequences

[[1049, 36],
 [1049, 36, 14],
 [1049, 36, 14, 2],
 [1049, 36, 14, 2, 159],
 [1049, 36, 14, 2, 159, 32],
 [1049, 36, 14, 2, 159, 32, 666],
 [1049, 36, 14, 2, 159, 32, 666, 36],
 [1049, 36, 14, 2, 159, 32, 666, 36, 25],
 [1049, 36, 14, 2, 159, 32, 666, 36, 25, 596],
 [1049, 24],
 [1049, 24, 1629],
 [1049, 24, 1629, 12],
 [1049, 24, 1629, 12, 2559],
 [1049, 24, 1629, 12, 2559, 2560],
 [32, 666],
 [32, 666, 36],
 [32, 666, 36, 25],
 [32, 666, 36, 25, 596],
 [32, 666, 36, 25, 596, 32],
 [32, 666, 36, 25, 596, 32, 151],
 [32, 666, 36, 25, 596, 32, 151, 41],
 [32, 666, 36, 25, 596, 32, 151, 41, 133],
 [32, 666, 36, 25, 596, 32, 151, 41, 133, 23],
 [32, 666, 36, 25, 596, 32, 151, 41, 133, 23, 473],
 [32, 151],
 [32, 151, 41],
 [32, 151, 41, 133],
 [32, 151, 41, 133, 23],
 [32, 151, 41, 133, 23, 473],
 [32, 151, 41, 133, 23, 473, 30],
 [32, 151, 41, 133, 23, 473, 30, 166],
 [32, 151, 41, 133, 23, 473, 30, 166, 16],
 [32, 151, 41, 133, 23, 473, 30, 166, 16, 36],
 [32, 151, 41, 133, 23, 473, 30, 

In [81]:
## maximum sentence length / pad sequence
max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len

32

In [82]:
input_sequence_padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequence_padded

array([[   0,    0,    0, ...,    0, 1049,   36],
       [   0,    0,    0, ..., 1049,   36,   14],
       [   0,    0,    0, ...,   36,   14,    2],
       ...,
       [   0,    0,    0, ...,    2,   38,  507],
       [   0,    0,    0, ...,    0,  507,  595],
       [   0,    0,    0, ...,  507,  595,  222]])

In [95]:
x,y = input_sequence_padded[:, :-1], input_sequence_padded[:, -1]

In [100]:
y = tf.keras.utils.to_categorical(y, num_classes = total_words)

In [109]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [101]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length = max_sequence_len))
model.add(GRU(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(200))
model.add(Dense(total_words, activation='softmax'))
model.build(input_shape=(None, max_sequence_len))

In [102]:
model.summary()

In [104]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [106]:
from tensorflow.keras.callbacks import EarlyStopping
stops = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
history = model.fit(x_train, y_train,batch_size=32, epochs=200, validation_data=(x_test, y_test))

Epoch 1/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 41ms/step - accuracy: 0.0352 - loss: 6.4652 - val_accuracy: 0.0622 - val_loss: 5.8749
Epoch 2/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 53ms/step - accuracy: 0.0748 - loss: 5.6355 - val_accuracy: 0.1024 - val_loss: 5.4275
Epoch 3/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 94ms/step - accuracy: 0.1139 - loss: 5.0800 - val_accuracy: 0.1254 - val_loss: 5.1628
Epoch 4/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 96ms/step - accuracy: 0.1432 - loss: 4.6369 - val_accuracy: 0.1359 - val_loss: 4.9795
Epoch 5/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 96ms/step - accuracy: 0.1797 - loss: 4.2038 - val_accuracy: 0.1549 - val_loss: 4.8242
Epoch 6/200
[1m1205/1205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 96ms/step - accuracy: 0.2265 - loss: 3.8023 - val_accuracy: 0.1724 - val_loss: 4