# 1. Download The Data From Kaggle

In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the cleaned movie-dialogue dataset archive with the Kaggle CLI.
# Relies on the API token copied into ~/.kaggle by the previous cell.
! kaggle datasets download -d hijest/cleaned-data-for-the-chatbot-collected-from-movies

Downloading cleaned-data-for-the-chatbot-collected-from-movies.zip to /content
 98% 299M/305M [00:03<00:00, 77.3MB/s]
100% 305M/305M [00:03<00:00, 87.1MB/s]


In [3]:
!unzip '/content/cleaned-data-for-the-chatbot-collected-from-movies.zip'

Archive:  /content/cleaned-data-for-the-chatbot-collected-from-movies.zip
  inflating: dialogs_expanded.csv    
  inflating: input3.csv              
  inflating: model_att29iter_expanded.data-00000-of-00001  
  inflating: model_att29iter_expanded.index  
  inflating: target3.csv             


# 2. Import Libraries

In [4]:
import re
from tqdm import tqdm
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
import tensorflow as tf

# 3. Read Data

In [5]:
# Load the expanded dialogue CSV extracted above and preview its first rows.
data_frame = pd.read_csv(
    filepath_or_buffer="/content/dialogs_expanded.csv",
    index_col=False,
)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,question,answer,question_as_int,answer_as_int,question_len,answer_len
0,1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....,"[54, 67, 74, 74, 12, 1, 40, 1, 82, 70, 77, 83,...","[45, 77, 82, 1, 82, 70, 67, 1, 70, 63, 65, 73,...",71,55
1,2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,"[45, 77, 82, 1, 82, 70, 67, 1, 70, 63, 65, 73,...","[46, 73, 63, 87, 14, 14, 14, 1, 82, 70, 67, 76...",55,73
2,3,You're asking me out. That's so cute. What's ...,Forget it.,"[56, 77, 83, 8, 80, 67, 1, 63, 81, 73, 71, 76,...","[37, 77, 80, 69, 67, 82, 1, 71, 82, 14]",62,10
3,4,"No, no, it's my fault -- we didn't have a prop...",Cameron.,"[45, 77, 12, 1, 76, 77, 12, 1, 71, 82, 8, 81, ...","[34, 63, 75, 67, 80, 77, 76, 14]",65,8
4,9,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.,"[38, 77, 81, 70, 12, 1, 71, 68, 1, 77, 76, 74,...","[43, 67, 82, 1, 75, 67, 1, 81, 67, 67, 1, 85, ...",46,25


#### take only question & answer columns

In [6]:
# Keep only the raw text columns; the pre-encoded/length columns are not used.
# `.copy()` makes this an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
data_frame = data_frame[['question', 'answer']].copy()
data_frame.head()

Unnamed: 0,question,answer
0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
1,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
2,You're asking me out. That's so cute. What's ...,Forget it.
3,"No, no, it's my fault -- we didn't have a prop...",Cameron.
4,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.


#### cleaning text function

In [7]:
def cleaning_text(text):
    """Normalize a sentence to lowercase words separated by single spaces.

    Steps, in order: lowercase; drop URLs; drop ``<...>`` tags; delete newline
    characters; turn every non-word character into a space; drop any token that
    contains a digit; collapse runs of whitespace.

    Args:
        text: The raw sentence (``str``).

    Returns:
        The cleaned sentence. May be the empty string.
    """
    text = text.lower()
    # Raw strings throughout: "\S", "\w" and "\d" are invalid escape sequences
    # in normal string literals and make Python emit a warning.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    # Newlines are deleted (not replaced by a space), as before.
    text = re.sub(r"\n", "", text)
    text = re.sub(r"[^\w]", " ", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # Every non-word character (including "@") is already a space at this point,
    # so splitting and re-joining is all that is left to do.
    return " ".join(text.split())

# Clean both sides of every dialogue pair with the same normalizer.
data_frame["question"] = data_frame["question"].map(cleaning_text)
data_frame["answer"] = data_frame["answer"].map(cleaning_text)

In [8]:
data_frame.head()

Unnamed: 0,question,answer
0,well i thought we d start with pronunciation i...,not the hacking and gagging and spitting part ...
1,not the hacking and gagging and spitting part ...,okay then how bout we try out some french cuis...
2,you re asking me out that s so cute what s you...,forget it
3,no no it s my fault we didn t have a proper in...,cameron
4,gosh if only we could find kat a boyfriend,let me see what i can do


#### Write a function that adds `<start>` at the beginning of the text and `<end>` at the end.

In [9]:
def add_start_end(text):
    """Wrap ``text`` in the ``<start>`` / ``<end>`` sentence markers.

    A single space separates each marker from the text.
    """
    return "<start> {} <end>".format(text)

# Mark sentence boundaries on both questions and answers.
data_frame["question"] = data_frame["question"].map(add_start_end)
data_frame["answer"] = data_frame["answer"].map(add_start_end)

In [10]:
data_frame.head()

Unnamed: 0,question,answer
0,<start> well i thought we d start with pronunc...,<start> not the hacking and gagging and spitti...
1,<start> not the hacking and gagging and spitti...,<start> okay then how bout we try out some fre...
2,<start> you re asking me out that s so cute wh...,<start> forget it <end>
3,<start> no no it s my fault we didn t have a p...,<start> cameron <end>
4,<start> gosh if only we could find kat a boyfr...,<start> let me see what i can do <end>


#### Add a tokenizer function that converts text into padded sequences of integer token IDs

In [11]:
def tokenize(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode it as padded id sequences.

    The filter list passed to the tokenizer does not contain ``<`` or ``>``.
    The out-of-vocabulary token is the empty string.

    Args:
        lang: Collection of sentences to fit on and encode.

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: the encoded sentences padded at the
        end ("post") to a common length, and the fitted tokenizer.
    """
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='!"#$%&()*+,-./:;=?@[\]^_`{|}~\t\n', oov_token=""
    )
    lang_tokenizer.fit_on_texts(lang)

    # Encode first, then pad every sequence at its tail.
    encoded = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, padding="post")

    return padded, lang_tokenizer

#### tokenize the data



In [12]:
# Questions and answers each get their own tokenizer and vocabulary.
question_sequence, question_tokenizer = tokenize(data_frame["question"])
answer_sequence, answer_tokenizer = tokenize(data_frame["answer"])

In [13]:
# Model / training hyperparameters.
embedding_dim = 256  # width of each token embedding vector
units = 1024  # GRU units, shared by the encoder and the decoder
batch_size = 32

# Vocabulary sizes: one more than the number of known words, leaving room for
# id 0, which this notebook treats as padding (see the mask in loss_function).
vocab_inp_size = len(question_tokenizer.word_index) + 1
vocab_tar_size = len(answer_tokenizer.word_index) + 1

#### Split the data into training and test sets

In [14]:
# Hold out 20% of the pairs for evaluation. A fixed random_state makes the
# split (and therefore the reported losses) reproducible across re-runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    question_sequence, answer_sequence, test_size=0.2, random_state=42
)

In [15]:
def convert(token, tensor):
    """Print one ``id -> word`` line for every non-zero id in ``tensor``.

    Args:
        token: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of token ids; ids equal to 0 are skipped.
    """
    non_zero_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in non_zero_ids:
        print("%d -> %s" % (token_id, token.index_word[token_id]))

In [16]:
def data_pipline(x, y, batch_size=32):
    """Build a shuffled, batched and prefetched ``tf.data`` pipeline over ``(x, y)``.

    Args:
        x: Input examples, sliced along the first axis.
        y: Targets aligned with ``x``.
        batch_size: Examples per batch; the final partial batch is dropped.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs.
    """
    return (
        tf.data.Dataset.from_tensor_slices((x, y))
        .shuffle(1028)  # shuffle buffer size
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )


# Pass batch_size explicitly: the encoder's initial hidden state is built from
# the same value, so the pipelines must not silently fall back to the
# function's own default if batch_size is changed.
train_dataset = data_pipline(x_train, y_train, batch_size=batch_size)

test_dataset = data_pipline(x_test, y_test, batch_size=batch_size)

# 4. Modeling

## Encoder

In [17]:
class Encoder(tf.keras.Model):
    """Sequence encoder: an embedding layer followed by a single GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        encoder_units: Number of GRU units.
        batch_size: Batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super().__init__()

        self.encoder_units = encoder_units
        self.batch_size = batch_size

        # mask_zero=True: id 0 is treated as padding by the embedding.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True
        )

        # The GRU returns both the per-step outputs and its final state.
        self.gru = tf.keras.layers.GRU(
            units=self.encoder_units,
            recurrent_initializer="glorot_uniform",
            return_sequences=True,
            return_state=True,
        )

    def call(self, x, hidden):
        """Encode ``x`` starting from ``hidden``; return ``(output, state)``."""
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, encoder_units)``."""
        state_shape = (self.batch_size, self.encoder_units)
        return tf.zeros(state_shape)

## Decoder

In [18]:
class Decoder(tf.keras.Model):
    """Sequence decoder: embedding, a single GRU, and a dense vocabulary layer.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output distribution.
        embedding_dim: Width of each embedding vector.
        decoder_units: Number of GRU units.
        batch_size: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super().__init__()

        self.decoder_units = decoder_units
        self.batch_size = batch_size

        # mask_zero=True: id 0 is treated as padding by the embedding.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True
        )

        self.gru = tf.keras.layers.GRU(
            units=self.decoder_units,
            recurrent_initializer="glorot_uniform",
            return_sequences=True,
            return_state=True,
        )

        # Projects each GRU output onto the vocabulary.
        self.fc = tf.keras.layers.Dense(units=vocab_size)

    def call(self, x, hidden):
        """Run the decoder on ``x`` from state ``hidden``.

        Returns:
            ``(probabilities, hidden)``: the softmax over the vocabulary for
            every (batch, step) row of the GRU output, and the new GRU state.
        """
        embedded = self.embedding(x)

        sequence_out, new_hidden = self.gru(embedded, hidden)

        # Merge the batch and time axes: (batch, steps, units) -> (batch * steps, units).
        flattened = tf.reshape(sequence_out, (-1, sequence_out.shape[2]))

        # Softmax is applied here, so the loss is configured with from_logits=False.
        probabilities = tf.nn.softmax(self.fc(flattened))

        return probabilities, new_hidden

In [19]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)

# Sanity-check the encoder on one batch of questions.
# The slice uses batch_size (not a literal 32) because the initial hidden state
# is created with shape (batch_size, units) and the two must agree.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(question_sequence[:batch_size], sample_hidden)
print(
    "Encoder output shape: (batch size, sequence length, units) {}".format(
        sample_output.shape
    )
)
print("Encoder Hidden state shape: (batch size, units) {}".format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (32, 29, 1024)
Encoder Hidden state shape: (batch size, units) (32, 1024)


In [20]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

# Sanity-check the decoder on a single-step dummy input, starting from the
# encoder's hidden state, whose shape is (batch_size, units).
dummy_step = tf.random.uniform((batch_size, 1))
sample_decoder_output, _ = decoder(dummy_step, sample_hidden)

shape_message = "Decoder output shape: (batch_size, vocab size) {}".format(
    sample_decoder_output.shape
)
print(shape_message)

Decoder output shape: (batch_size, vocab size) (32, 27873)


## Optimizer & Loss Function

In [21]:
optimizer = tf.keras.optimizers.Adam()

# reduction="none" keeps one loss value per example so padding can be masked
# out below; from_logits=False because the decoder already applies softmax.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction="none"
)


def loss_function(real, pred):
    """Cross-entropy with padding positions (``real == 0``) zeroed out.

    Args:
        real: Target token ids.
        pred: Predicted probability distribution for each target.

    Returns:
        The mean of the masked per-example losses. Masked entries contribute
        zero but still count toward the mean's denominator.
    """
    per_example = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

    return tf.reduce_mean(per_example * keep)

In [22]:
# Running means of the per-batch losses; updated by train_step / test_step and
# reset at the start of every epoch in the training loop.
train_loss = tf.metrics.Mean(name="train loss")

test_loss = tf.metrics.Mean(name="test loss")

## Model Training

In [23]:
@tf.function

def train_step(inputs, target, enc_hidden):
    """Run one teacher-forced training step and update the model weights.

    Encodes ``inputs``, then decodes one position at a time, feeding the true
    target token of the current position as the next decoder input. Gradients
    of the summed loss are applied to the encoder and decoder variables, and
    the ``train_loss`` metric is updated.

    Args:
        inputs: Batch of encoded questions.
        target: Batch of encoded answers.
        enc_hidden: Initial hidden state for the encoder.

    Returns:
        The summed step loss divided by ``target.shape[1]``.
    """

    loss = 0

    with tf.GradientTape() as tape:
        
        enc_output, enc_hidden = encoder(inputs, enc_hidden)

        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden

        # First decoder input: one copy of the same token id per batch row.
        # NOTE(review): the lookup key is the empty string, which is the
        # tokenizer's oov_token, not the "<start>" marker added by
        # add_start_end -- confirm which was intended. Inference (chatbot)
        # must use the same key as training.
        dec_input = tf.expand_dims(
            [answer_tokenizer.word_index[""]] * inputs.shape[0], 1
        )

        # Position 0 of the target is never scored; decoding starts at position 1.
        for t in range(1, target.shape[1]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            loss += loss_function(target[:, t], predictions)

            # Teacher forcing: feed the ground-truth token, not the prediction.
            dec_input = tf.expand_dims(target[:, t], 1)

    # Divides by the full target length, one more than the number of scored steps.
    batch_loss = loss / int(target.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients are taken of the summed loss, not of batch_loss.
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(batch_loss)
    return batch_loss

In [24]:
@tf.function

def test_step(inputs, target, enc_hidden):
    """Evaluate one batch with teacher forcing; no weights are updated.

    Mirrors ``train_step`` without the gradient tape or optimizer update, and
    records the result in the ``test_loss`` metric. Returns ``None``.

    Args:
        inputs: Batch of encoded questions.
        target: Batch of encoded answers.
        enc_hidden: Initial hidden state for the encoder.
    """
    loss = 0
    
    enc_output, enc_hidden = encoder(inputs, enc_hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # NOTE(review): the lookup key is the empty string (the tokenizer's
    # oov_token), not "<start>" -- confirm; must match train_step and chatbot.
    dec_input = tf.expand_dims(
        [answer_tokenizer.word_index[""]] * inputs.shape[0], 1
    )
    # Position 0 of the target is never scored; decoding starts at position 1.
    for t in range(1, target.shape[1]):

        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        loss += loss_function(target[:, t], predictions)

        # Teacher forcing: feed the ground-truth token, not the prediction.
        dec_input = tf.expand_dims(target[:, t], 1)

    # Divides by the full target length, one more than the number of scored steps.
    batch_loss = loss / int(target.shape[1])

    test_loss(batch_loss)

## Training Loop

In [25]:
EPOCHS = 10

# Number of batches actually processed per epoch (training + testing). Both
# pipelines drop their last partial batch, so the count is read from the
# datasets themselves instead of being estimated from the total sample count;
# this way the progress bar reaches its target exactly. It does not change
# between epochs, so it is computed once.
steps_per_epoch = len(train_dataset) + len(test_dataset)

for epoch in range(EPOCHS):

    # Start every epoch with empty loss accumulators.
    train_loss.reset_states()
    test_loss.reset_states()

    # initialize the hidden state of the encoder to zeros
    enc_hidden = encoder.initialize_hidden_state()

    # One progress bar per epoch, covering the training and the testing batches.
    bar = tf.keras.utils.Progbar(target=steps_per_epoch)
    count = 0

    # iterate over the training dataset
    for inputs, target in train_dataset:
        batch_loss = train_step(inputs, target, enc_hidden)
        count += 1
        bar.update(count)  # manually update the progress bar

    # iterate over the testing dataset (metrics only; test_step returns nothing)
    for inputs, target in test_dataset:
        test_step(inputs, target, enc_hidden)
        count += 1
        bar.update(count)

    print("#" * 100)
    print(f"Epoch #{epoch + 1}")
    print(f"Training Loss {train_loss.result()}")
    print(f"Testing Loss {test_loss.result()}")
    print("#" * 100)

####################################################################################################
Epoch #1
Training Loss 1.4109516143798828
Testing Loss 1.2998616695404053
####################################################################################################
####################################################################################################
Epoch #2
Training Loss 1.199803113937378
Testing Loss 1.292590618133545
####################################################################################################
####################################################################################################
Epoch #3
Training Loss 1.046667456626892
Testing Loss 1.340939998626709
####################################################################################################
####################################################################################################
Epoch #4
Training Loss 0.901482105255127
Testing Loss 1.4044268131256104
##

# 5. Model Testing

In [26]:
def chatbot(sentence, max_input_len=29, max_reply_len=32):
    """Generate a reply to ``sentence`` by greedy decoding.

    The sentence is cleaned and wrapped exactly like the training data, encoded
    with the question tokenizer, and passed through the encoder. The decoder is
    then run one token at a time, always feeding back its most probable token,
    until it emits ``<end>`` or ``max_reply_len`` tokens have been produced.

    Args:
        sentence: Raw user message.
        max_input_len: Length the encoded question is padded/truncated to.
            Defaults to 29, the value previously hard-coded here.
        max_reply_len: Maximum number of decoding steps. Defaults to 32, the
            value previously hard-coded here.

    Returns:
        The reply as a single string of space-separated words, without the
        ``<end>`` marker. Always a ``str``, whether decoding stopped at
        ``<end>`` or ran for ``max_reply_len`` steps.
    """
    # Same preprocessing as the training questions.
    sentence = add_start_end(cleaning_text(sentence))

    inputs = question_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=max_input_len, padding="post"
    )

    # Batch of one: zero initial encoder state of shape (1, units).
    hidden = [tf.zeros((1, units))]
    _, enc_hidden = encoder(inputs, hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input, shape (1, 1).
    # NOTE(review): the key is the empty string (the tokenizer's oov_token),
    # matching what train_step/test_step feed as the first decoder input.
    # "<start>" may have been intended -- if so, change all three together.
    dec_input = tf.expand_dims([answer_tokenizer.word_index[""]], 0)

    reply_words = []
    for _ in range(max_reply_len):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        # Greedy choice: the most probable token id.
        predicted_id = int(tf.argmax(predictions[0]).numpy())

        # .get(): id 0 (padding) has no entry in index_word and must not raise.
        word = answer_tokenizer.index_word.get(predicted_id, "")

        # Stop at the end-of-sentence marker added by add_start_end.
        if word == "<end>":
            break

        # Skip empty words (padding id, or the empty-string OOV token) so they
        # do not leave stray spaces in the reply.
        if word:
            reply_words.append(word)

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return " ".join(reply_words)


In [31]:
# Interactive chat: keep answering messages until the user types "q".
while True:
    message = input('Enter Message or type q to quit : \n')
    if message != 'q':
        output = chatbot(message)
        print('Bot : ', output)
        continue
    break

Enter Message or type q to quit : 
hello
Bot :  who is this  you got no problem here you re going with the list                  
Enter Message or type q to quit : 
how are you?
Bot :  verb roger this is edie finneran edie this is roger kint he was killed                   
Enter Message or type q to quit : 
what is the weather today?
Bot :  it is within hours   when she comes to the jungle                     
Enter Message or type q to quit : 
good bye
Bot :  good bye                               
Enter Message or type q to quit : 
q
