### Implementation of a Seq2Seq model

#### Encoder-Decoder (without attention)

##### Task: train a basic chatbot on a dataset of movie dialogues.

In [None]:
# Select TensorFlow 2.x for this runtime.
# NOTE: the %tensorflow_version magic only works in Colab.

%tensorflow_version 2.x

In [None]:
# Loading libraries
# TensorFlow must have at least version 2.0

import codecs
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

import matplotlib.pyplot as plt
%matplotlib inline

2.2.0


In [None]:
# Loading and reading data
# Movie dialogues are used as the training dataset.

# Connect Google Drive; it is mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# working directory; create a directory at the first launch (if it doesn't exist yet),
# otherwise, replace True with False
if False:
    !mkdir "/content/drive/My Drive/Seq2Seq_light"
%cd "/content/drive/My Drive/Seq2Seq_light"

/content/drive/My Drive/Seq2Seq_light


In [None]:
# Unpack the dataset archive (ChatBot_Dataset.7z); it must already be uploaded
# to the current working directory (Seq2Seq_light).
# Extraction is switched off here: change False to True to run it
# (needed once, while the archive has not been unpacked yet).
if False:
    !7z x ChatBot_Dataset.7z


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 6741321 bytes (6584 KiB)

Extracting archive: ChatBot_Dataset.7z
--
Path = ChatBot_Dataset.7z
Type = 7z
Physical Size = 6741321
Headers Size = 237
Method = LZMA2:25
Solid = +
Blocks = 1

  0%     40% 2 - ChatBot_Dataset/movie_lines.txt                                         79% 2 - ChatBot_Dataset/movie_lines.txt                                        Everything is Ok

Folders: 1
Files: 2
Size:       41402849
Compressed: 6741321


In [None]:
# Look at the conversations file: read all of its lines into a list.
# The context manager closes the file as soon as the lines have been read
# (previously the handle was left open and its name was reused for the list).
with open('ChatBot_Dataset/movie_conversations.txt', mode='rt') as convers_file:
    movie_convers = convers_file.readlines()
movie_convers[:3]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n"]

In [None]:
# Look at the utterances file: collect its raw lines into a list.
# The file is decoded as cp1251; undecodable bytes are dropped (errors='ignore').
with codecs.open('ChatBot_Dataset/movie_lines.txt', encoding='cp1251', errors='ignore') as lines_file:
    movie_lines = list(lines_file)

movie_lines[:3]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n']

In [None]:
# Data pre-processing
# Turn the raw conversation lines into a DataFrame (movie_convers_df).
# The fields of every raw line are separated by the '+++$+++' marker.

convers_rows = [raw_line.split('+++$+++') for raw_line in movie_convers]

# One list per column; surrounding whitespace (including the newline) is stripped.
character_id_1 = [row[0].strip() for row in convers_rows]
character_id_2 = [row[1].strip() for row in convers_rows]
movie_id = [row[2].strip() for row in convers_rows]
dialogues = [row[3].strip() for row in convers_rows]

convers_columns = {
    'character_id_1': character_id_1,
    'character_id_2': character_id_2,
    'movie_id': movie_id,
    'dialogues': dialogues,
}
movie_convers_df = pd.DataFrame(convers_columns, columns=list(convers_columns))

print(movie_convers_df.shape)
movie_convers_df.head()

(83097, 4)


Unnamed: 0,character_id_1,character_id_2,movie_id,dialogues
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [None]:
# Turn the raw utterance lines into a DataFrame (movie_lines_df).
# Tabs and '\xad' characters are removed from a line before it is split
# on the '+++$+++' field marker.

lines_rows = [
    raw_line.replace('\t', '').replace('\xad', '').split('+++$+++')
    for raw_line in movie_lines
]

# One list per column; surrounding whitespace (including the newline) is stripped.
line_id = [row[0].strip() for row in lines_rows]
character_id = [row[1].strip() for row in lines_rows]
movie_id = [row[2].strip() for row in lines_rows]
name = [row[3].strip() for row in lines_rows]
text = [row[4].strip() for row in lines_rows]

lines_columns = {
    'line_id': line_id,
    'character_id': character_id,
    'movie_id': movie_id,
    'name': name,
    'text': text,
}
movie_lines_df = pd.DataFrame(lines_columns, columns=list(lines_columns))

print(movie_lines_df.shape)
movie_lines_df.head()

(304713, 5)


Unnamed: 0,line_id,character_id,movie_id,name,text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [None]:
# Input sentences (input_texts) and the sentences to predict (target_texts).
# Only the first 10000 rows of the fifth column are used; every row is paired
# with the row that follows it, so both arrays have shape (n, 1).
# NOTE(review): the pairs come from adjacent rows of the lines file, not from
# movie_convers_df; the sample rows shown earlier have descending line ids
# (L1045, L1044), so confirm that each target really is the reply to its input.
movie_array = movie_lines_df.iloc[:, 4:5].to_numpy()[:10000]
input_texts, target_texts = movie_array[:-1], movie_array[1:]

In [None]:
# dictionaries
def prepare_vocab(texts):
    """Build a character vocabulary for a collection of sentences.

    `texts` may be a flat sequence of strings or an array of any shape that
    holds strings (for example the (n, 1) column slice of a DataFrame). The
    collection is flattened first, so the vocabulary is made of the characters
    of the sentences themselves. Calling `str()` on whole array rows would
    instead use their repr, which adds bracket/quote symbols and hides any
    character the repr escapes.

    Returns:
        vocab_size: number of symbols, including the two service tokens.
        char2idx: dict mapping every symbol to its integer index.
        idx2char: numpy array mapping an index back to its symbol.
    """
    flat_texts = np.asarray(list(texts), dtype=object).ravel()
    # Joining with a space also guarantees that ' ' (used later as the padding
    # symbol) is in the vocabulary whenever there are at least two sentences.
    vocab = sorted(set(' '.join(map(str, flat_texts))))
    # Service tokens go last so that they never collide with a real character.
    vocab.append('<START>')
    vocab.append('<END>')
    vocab_size = len(vocab)
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    return vocab_size, char2idx, idx2char

# Separate vocabularies for the input side and for the target side.
INPUT_VOCAB_SIZE, input_char2idx, input_idx2char = prepare_vocab(input_texts)
TARGET_VOCAB_SIZE, target_char2idx, target_idx2char = prepare_vocab(target_texts)

# Flatten the arrays of one-element rows into 1-D arrays of strings.
n_inputs, n_targets = input_texts.shape[0], target_texts.shape[0]
input_texts = np.resize(input_texts, (n_inputs,))
target_texts = np.resize(target_texts, (n_targets,))

In [None]:
# encoder_input_seqs  - inputs of the Encoder
# decoder_input_seqs  - inputs of the Decoder (target prefixed with <START>)
# decoder_target_seqs - expected outputs of the Decoder (target followed by <END>)
# All sequences are arrays of integer character indices.

input_texts_as_int = [[input_char2idx[ch] for ch in sentence] for sentence in input_texts]
target_texts_as_int = [[target_char2idx[ch] for ch in sentence] for sentence in target_texts]

encoder_input_seqs = list(map(np.array, input_texts_as_int))

start_id = target_char2idx['<START>']
end_id = target_char2idx['<END>']
# Decoder input and decoder target are the same sentence shifted by one step.
decoder_input_seqs = [np.array([start_id] + ids) for ids in target_texts_as_int]
decoder_target_seqs = [np.array(ids + [end_id]) for ids in target_texts_as_int]

In [None]:
# Padding of the sequences
# Before padding, look at the first three sequences of every kind.

for position, seqs in enumerate((encoder_input_seqs, decoder_input_seqs, decoder_target_seqs)):
    if position > 0:
        print('-----------------')
    print(seqs[:3])

[array([47, 65, 62, 82,  0, 61, 72,  0, 71, 72, 77,  1]), array([47, 65, 62, 82,  0, 61, 72,  0, 77, 72,  1]), array([36,  0, 65, 72, 73, 62,  0, 76, 72, 11])]
-----------------
[array([87, 47, 65, 62, 82,  0, 61, 72,  0, 77, 72,  1]), array([87, 36,  0, 65, 72, 73, 62,  0, 76, 72, 11]), array([87, 46, 65, 62,  0, 72, 68, 58, 82, 27])]
-----------------
[array([47, 65, 62, 82,  0, 61, 72,  0, 77, 72,  1, 88]), array([36,  0, 65, 72, 73, 62,  0, 76, 72, 11, 88]), array([46, 65, 62,  0, 72, 68, 58, 82, 27, 88])]


In [None]:
# Longest encoder / decoder sequence: everything is padded up to these lengths.
max_enc_seq_length = max(map(len, encoder_input_seqs))
max_dec_seq_length = max(map(len, decoder_input_seqs))

pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
# Sequences are padded at the end with the index of the space character,
# so padding is not distinguishable from real spaces.
input_pad_id = input_char2idx[' ']
target_pad_id = target_char2idx[' ']

encoder_input_seqs = pad_sequences(
    encoder_input_seqs, maxlen=max_enc_seq_length, padding='post', value=input_pad_id)

decoder_input_seqs = pad_sequences(
    decoder_input_seqs, maxlen=max_dec_seq_length, padding='post', value=target_pad_id)

decoder_target_seqs = pad_sequences(
    decoder_target_seqs, maxlen=max_dec_seq_length, padding='post', value=target_pad_id)

In [None]:
# Creating a model

H_SIZE = 256 # Dimension of the LSTM hidden state (per direction in the bidirectional encoder)
EMB_SIZE = 256 # embedding dimension (for both input and output sequences)
BATCH_SIZE = 64 # batch size passed to fit()
# INPUT_VOCAB_SIZE and TARGET_VOCAB_SIZE are model parameters as well;
# they were returned by prepare_vocab earlier in the notebook.

In [None]:
class Encoder(tf.keras.Model):
    """Encoder: an embedding followed by two stacked bidirectional LSTMs.

    Only the final states of both layers are returned; the output of the top
    layer is not used.
    """

    def __init__(self):
        super().__init__()
        self.embed = tf.keras.layers.Embedding(INPUT_VOCAB_SIZE, EMB_SIZE)
        # The lower layer emits the whole sequence so the upper layer can read it.
        self.lstm0 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(H_SIZE, return_sequences=True, return_state=True))
        self.lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(H_SIZE, return_sequences=False, return_state=True))

    def call(self, x):
        """Encode a batch of index sequences.

        Returns [(h0, c0), (h1, c1)]: for each layer, the final forward and
        backward states concatenated along the last axis.
        """
        embedded = self.embed(x)
        seq0, h0_f, c0_f, h0_b, c0_b = self.lstm0(embedded)
        _, h1_f, c1_f, h1_b, c1_b = self.lstm1(seq0)

        def merge(forward, backward):
            # Join the two directions into one state vector.
            return tf.concat([forward, backward], axis=-1)

        layer0_state = (merge(h0_f, h0_b), merge(c0_f, c0_b))
        layer1_state = (merge(h1_f, h1_b), merge(c1_f, c1_b))
        return [layer0_state, layer1_state]

class Decoder(tf.keras.Model):
    """Decoder: an embedding, two stacked LSTMs and a softmax over target symbols.

    The LSTMs have H_SIZE*2 units so that they can be initialised with the
    concatenated forward/backward states of the encoder.
    """

    def __init__(self):
        super().__init__()
        lstm_options = dict(return_sequences=True, return_state=True)
        self.embed = tf.keras.layers.Embedding(TARGET_VOCAB_SIZE, EMB_SIZE)
        self.lstm0 = tf.keras.layers.LSTM(H_SIZE*2, **lstm_options)
        self.lstm1 = tf.keras.layers.LSTM(H_SIZE*2, **lstm_options)
        self.fc = tf.keras.layers.Dense(TARGET_VOCAB_SIZE, activation='softmax')

    def call(self, x, init_state):
        """Run the decoder over `x`, starting from `init_state`.

        `init_state` holds one (h, c) pair per LSTM layer. Returns the
        per-step symbol probabilities and the final states in the same
        [(h0, c0), (h1, c1)] layout, so they can be fed back in.
        """
        embedded = self.embed(x)
        hidden0, h0, c0 = self.lstm0(embedded, initial_state=init_state[0])
        hidden1, h1, c1 = self.lstm1(hidden0, initial_state=init_state[1])

        probabilities = self.fc(hidden1)
        return probabilities, [(h0, c0), (h1, c1)]

In [None]:
# Instantiate both halves and wire them into one trainable Keras model.
encoder_model = Encoder()
decoder_model = Decoder()

# Variable-length sequences of character indices.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_inputs = tf.keras.layers.Input(shape=(None,))

# The decoder starts from the final states of the encoder; the final states of
# the decoder itself are not needed for training.
enc_state = encoder_model(encoder_inputs)
decoder_outputs, _ = decoder_model(decoder_inputs, enc_state)

# general model
seq2seq = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                         outputs=decoder_outputs)
seq2seq.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_1 (Encoder)             [((None, 512), (None 2648320     input_3[0][0]                    
__________________________________________________________________________________________________
decoder_1 (Decoder)             ((None, None, 89), [ 3742553     input_4[0][0]                    
Total params: 6,390,873
Trainable params: 6,390,873
Non-trainable params: 0
________________

In [None]:
# Training setup

EPOCHS = 50
# Integer class targets against the softmax probabilities produced by the decoder.
loss = tf.losses.SparseCategoricalCrossentropy()
seq2seq.compile(loss=loss,
                optimizer='adam',
                metrics=['accuracy'])

In [None]:
# The decoder is fed the target prefixed with <START> and is trained to
# predict the same target followed by <END>.
seq2seq.fit(
    x=[encoder_input_seqs, decoder_input_seqs],
    y=decoder_target_seqs,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa0479b6dd8>

In [None]:
# Function for inference  

def seq2seq_inference(input_seq, max_length=None):
    """Greedily decode a reply for one encoded input sequence.

    Args:
        input_seq: a batch holding a single sequence of input character
            indices (for example a one-row slice of encoder_input_seqs).
        max_length: decoding stops as soon as the decoded text is longer than
            this many characters. Defaults to max_dec_seq_length, which was
            the previously hard-coded limit.

    Returns:
        The decoded string. When decoding stopped on the end marker, the
        literal '<END>' is the last part of the string.
    """
    if max_length is None:
        max_length = max_dec_seq_length

    state = encoder_model(input_seq)

    # Decoding starts from the <START> token and the encoder states.
    target_seq = np.array([[target_char2idx['<START>']]])

    pieces = []
    decoded_length = 0
    while True:
        output_tokens, state = decoder_model(target_seq, state)

        # Greedy choice: the most probable symbol at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_idx2char[sampled_token_index]
        pieces.append(sampled_char)
        decoded_length += len(sampled_char)

        if sampled_char == '<END>' or decoded_length > max_length:
            break

        # The sampled symbol becomes the next decoder input; the states carry on.
        target_seq = np.array([[sampled_token_index]])

    return ''.join(pieces)

In [None]:
# Example of inference: decode the training samples 70..79 and show the
# prediction next to the reference sentence.

for seq_index in range(70, 80):
    # A one-row slice keeps the batch dimension expected by the model.
    input_seq = encoder_input_seqs[seq_index: seq_index + 1]
    decoded_sentence = seq2seq_inference(input_seq)
    print('-')
    print(f'Input sentence: {input_texts[seq_index]}')
    print(f'Result sentence: {decoded_sentence}')
    print(f'Target sentence: {target_texts[seq_index]}')

-
Input sentence: You think you ' re the only sophomore at the prom?
Result sentence: I don't know.  I'm not at all.<END>
Target sentence: I don't have to be home 'til two.
-
Input sentence: I don't have to be home 'til two.
Result sentence: What are you doing to say?<END>
Target sentence: I have to be home in twenty minutes.
-
Input sentence: I have to be home in twenty minutes.
Result sentence: What are you doing?<END>
Target sentence: All I know is -- I'd give up my private line to go out with a guy like Joey.
-
Input sentence: All I know is -- I'd give up my private line to go out with a guy like Joey.
Result sentence: I don't know what you did.<END>
Target sentence: Sometimes I wonder if the guys we're supposed to want to go out with are the ones we actually want to go out with, you know?
-
Input sentence: Sometimes I wonder if the guys we're supposed to want to go out with are the ones we actually want to go out with, you know?
Result sentence: What are you doing?<END>
Target sen

#### Encoder-Decoder with attention

In [None]:
class Encoder(tf.keras.Model):
    """Encoder for the attention model: embedding plus two bidirectional LSTMs.

    Both LSTM layers return their full output sequence, so the outputs of the
    top layer are available to the attention block in addition to the final
    states.
    """

    def __init__(self):
        super().__init__()
        self.embed = tf.keras.layers.Embedding(INPUT_VOCAB_SIZE, EMB_SIZE)
        self.lstm0 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(H_SIZE, return_sequences=True, return_state=True))
        self.lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(H_SIZE, return_sequences=True, return_state=True))

    def call(self, x):
        """Encode a batch of index sequences.

        Returns the output sequence of the top layer and
        [(h0, c0), (h1, c1)]: for each layer, the final forward and backward
        states concatenated along the last axis.
        """
        embedded = self.embed(x)
        seq0, h0_f, c0_f, h0_b, c0_b = self.lstm0(embedded)
        seq1, h1_f, c1_f, h1_b, c1_b = self.lstm1(seq0)

        def merge(forward, backward):
            # Join the two directions into one state vector.
            return tf.concat([forward, backward], axis=-1)

        layer0_state = (merge(h0_f, h0_b), merge(c0_f, c0_b))
        layer1_state = (merge(h1_f, h1_b), merge(c1_f, c1_b))
        return seq1, [layer0_state, layer1_state]

class Attention(tf.keras.Model):
    """Attention pooling over the encoder time steps.

    Every encoder step is scored by a single dense unit. The scores are
    normalised with a softmax over the time axis and used to average the
    encoder outputs into one context vector per sequence.

    NOTE: the weights depend on the encoder outputs only (no decoder query),
    and padded positions are not masked out.
    """

    def __init__(self):
        super().__init__()
        # One unnormalised score per encoder step. The softmax is applied in
        # call() over the time axis: a softmax activation on this one-unit
        # layer would normalise over an axis of length 1 and always give 1.0.
        self.dense1 = tf.keras.layers.Dense(1)

    def call(self, encoder_out):
        """Pool `encoder_out` (batch, enc_steps, features) into (batch, features)."""
        scores = self.dense1(encoder_out)            # (batch, enc_steps, 1)
        weights = tf.nn.softmax(scores, axis=1)      # sums to 1 over enc_steps
        return tf.reduce_sum(weights * encoder_out, axis=1)


class Decoder(tf.keras.Model):
    """Decoder that sees the attention context at every output step.

    It consists of an embedding, two stacked LSTMs initialised with the
    encoder states, and a softmax layer applied to the LSTM output
    concatenated with the context vector.
    """

    def __init__(self):
        super().__init__()
        self.embed = tf.keras.layers.Embedding(TARGET_VOCAB_SIZE, EMB_SIZE)
        self.lstm0 = tf.keras.layers.LSTM(H_SIZE*2,
                                          return_sequences=True,
                                          return_state=True)
        self.lstm1 = tf.keras.layers.LSTM(H_SIZE*2,
                                          return_sequences=True,
                                          return_state=True)
        self.fc = tf.keras.layers.Dense(TARGET_VOCAB_SIZE, activation='softmax')

    def call(self, x, init_state, att_out):
        """Decode `x` starting from `init_state`, using the context `att_out`.

        `att_out` is the (batch, features) context vector returned by
        Attention. Returns the per-step symbol probabilities and the final
        LSTM states as [(h0, c0), (h1, c1)].
        """
        out = self.embed(x)
        out, h0, c0 = self.lstm0(out, initial_state=init_state[0])
        out, h1, c1 = self.lstm1(out, initial_state=init_state[1])

        # Repeat the context for every decoder step by broadcasting, so the
        # concatenation does not depend on the encoder and decoder sequences
        # having the same length.
        context = tf.expand_dims(att_out, 1) + tf.zeros_like(out[:, :, :1])
        out = tf.concat([out, context], 2)

        out = self.fc(out)
        state = [(h0, c0), (h1, c1)]
        return out, state

In [None]:
# Instantiate the three parts of the attention model.
encoder_model = Encoder()
attention_model = Attention()
decoder_model = Decoder()

# Variable-length sequences of character indices.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_inputs = tf.keras.layers.Input(shape=(None,))

# Encoder outputs feed the attention block; encoder states initialise the decoder.
enc_output, enc_state = encoder_model(encoder_inputs)
att_out = attention_model(enc_output)
decoder_outputs, _ = decoder_model(decoder_inputs, enc_state, att_out)

# general model
seq2seq = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                         outputs=decoder_outputs)
seq2seq.summary()

In [None]:
# From here on, proceed in the same way as for the Encoder-Decoder without attention.

# P.S.: the Encoder-Decoder with attention takes longer to train.
