In [1]:
import keras
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Dense, Flatten, Embedding
import numpy as np
import re
from bpe import BPE

In [2]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 2000  # passed to BPE(...) and used as the Embedding input_dim
max_length = 20    # maxlen for pad_sequences and the Embedding input_length

In [3]:
# Create the BPE tokenizer, sized by the vocab_size hyperparameter; it is
# fitted on the cleaned questions further down in this cell.
bpe = BPE(vocab_size)

def tokenize(text):
    """Clean ``text`` and return its NLTK word tokens joined by single spaces.

    Every character other than ASCII letters, digits, ``?``, ``!``, ``.``,
    ``'``, ``,`` and the space is removed before tokenizing. Non-string
    input is converted with ``str()`` first.
    """
    # The raw-string prefix must sit outside the quotes; inside them it is a
    # literal "r", and the pattern would then only strip "r" + one disallowed
    # character. The range is spelled A-Za-z because A-z also spans the ASCII
    # punctuation between "Z" and "a" ([, \, ], ^, _ and `).
    clean_text = re.sub(r"[^A-Za-z0-9?!.', ]", '', str(text))
    return ' '.join(word_tokenize(clean_text))


x = []
y = []

# dialogs.txt is expected to hold one exchange per line: "question<TAB>answer".
# An explicit encoding keeps the read independent of the platform default.
with open("dialogs.txt", 'r', encoding="utf-8") as file:
    # Iterate the file lazily instead of materialising every line first.
    for line_number, line in enumerate(file, start=1):
        line = line.rstrip("\n")
        if not line:
            # An empty line (e.g. a trailing newline at end of file) has no pair.
            continue
        # Split on the first tab only, so a tab inside the answer stays in it.
        question, separator, answer = line.partition('\t')
        if not separator:
            raise ValueError(
                f"dialogs.txt line {line_number}: expected 'question<TAB>answer'"
            )
        x.append(tokenize(question))
        y.append(answer)


# Fit the BPE tokenizer on the cleaned questions.
bpe.fit(x)

# NOTE(review): despite its name, bpe.decode is what turns each cleaned text
# into the sequence handed to pad_sequences here — confirm against the BPE
# implementation that this is the text -> ids direction.
x = [bpe.decode(i) for i in x]
# Pad or truncate every sequence to max_length entries.
x = pad_sequences(x, maxlen=max_length)

In [4]:
# Save the tokenizer as a file
# NOTE(review): no path is passed, so the destination is whatever BPE.save
# uses by default — confirm where the file ends up.
bpe.save()

# Load the tokenizer from a file
# NOTE(review): the return value is discarded; presumably load() restores the
# state in place on `bpe` — verify against the BPE implementation.
bpe.load()

In [5]:
# Map each distinct answer string to an integer class id, then one-hot encode
# the ids so they match the softmax / categorical_crossentropy model output.
label_encoder = LabelEncoder()
label_encoder.fit(y)
output_size = len(label_encoder.classes_)  # one output unit per distinct answer
y = to_categorical(label_encoder.transform(y))

In [6]:
# Evaluation subset: the first 100 encoded question/answer rows.
# These rows are slices of x and y, which model.fit later trains on in full,
# so metrics computed on them are training-set metrics, not held-out ones.
test_x = x[:100]
test_y = y[:100]

In [7]:
# Model
# Seed Python, NumPy and the backend RNGs before any weights are created so
# that weight initialisation (and the shuffling later done by model.fit) is
# repeatable on a fresh Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

model = Sequential([
    # Look up a 32-dimensional vector for each of the max_length token ids.
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),
    # Concatenate the per-token vectors into one max_length * 32 feature vector.
    Flatten(),
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=16, activation='relu'),
    # One probability per distinct answer known to label_encoder.
    Dense(units=output_size, activation='softmax')
])

# categorical_crossentropy expects the one-hot targets built with to_categorical.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            64000     
                                                                 
 flatten (Flatten)           (None, 640)               0         
                                                                 
 dense (Dense)               (None, 64)                41024     
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 3608)              61336     
                                                                 
Total params: 168968 (660.03 KB)
Trainable params: 168968 (660.03 KB)

In [8]:
# Train for 50 epochs on every row of x / y.
model.fit(x, y, epochs=50)

# test_x / test_y are the first 100 rows of that same data, so these figures
# measure fit on already-seen examples rather than generalisation.
val_loss, val_acc = model.evaluate(test_x, test_y)
print(val_loss, val_acc, sep="\n")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
0.09635287523269653
0.9700000286102295


In [9]:
# Persist the trained network to disk, read it back, and repeat the
# evaluation to confirm the reloaded model yields the same loss and accuracy.
model.save("ChatBot.hdf5")
model = keras.models.load_model("ChatBot.hdf5")

val_loss, val_acc = model.evaluate(test_x, test_y)
print(val_loss, val_acc, sep="\n")

  saving_api.save_model(


0.09635287523269653
0.9700000286102295


In [10]:
def preprocess_input(text):
    """Convert one raw user message into a padded model input.

    The message is lower-cased, cleaned with ``tokenize``, converted with
    ``bpe.decode`` and padded to ``max_length``. The result is a batch
    holding that single sequence.
    """
    cleaned = tokenize(text.lower())
    sequence = bpe.decode(cleaned)
    # Wrap in a list: pad_sequences works on a batch of sequences.
    return pad_sequences([sequence], maxlen=max_length)


# Interactive chat loop. Type "quit" or "exit" to stop; interrupting the
# kernel or sending EOF at the prompt also ends the loop cleanly instead of
# surfacing a traceback.
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.strip().lower() in {"quit", "exit"}:
        break
    print(f"You: {user_input}")
    preprocessed_input = preprocess_input(user_input)
    # verbose=0 keeps Keras' per-call progress bar out of the conversation.
    prediction = model.predict(preprocessed_input, verbose=0)
    # The highest-probability class index maps back to its answer string.
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    print(f"Bot: {predicted_label[0]}")
    print()

Bot: i'm fine. how about yourself?

Bot: i haven't been better. how about yourself?

Bot: i'm going to pcc.

Bot:  bye

Bot:  bye

Bot:  bye

