## ENV setup

Setup libraries and dependent data files 

#### Code

In [1]:
# Importing the required notebooks
#import import_ipynb
import data_utils

In [2]:
# Importing the required libraires
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split

## Variable setup

Setting up the variable values for the entire program

In [3]:
# Setting up the variable for preparing the model
only_start = False
max_vocab_length = 8000
max_length = 20
test_split = 0.2
random_seed = 42
data_subset = 20000
embedding_output_dimension = 128
lstm_units = 400
stacked_lstm_units = 256
dropout_rate = 0.2
epoch = 50
sparse_loss_fun = 'sparse_categorical_crossentropy'
one_hot_loss_fuc = 'categorical_crossentropy'

## Data preprocessing

preparing the datasets for model creation

In [4]:
# Load the data 
# Load the movie details meta data
movie_titles = data_utils.load_data(file_name='movie_titles_metadata.txt')

# Load the conversation meta data
movie_conversations = data_utils.load_data(file_name='movie_conversations.txt')

# Load the conversation lines
movie_lines = data_utils.load_data(file_name='movie_lines.txt')

Data read from ./data/movie_titles_metadata.txt and converted into 618 lines
Data read from ./data/movie_conversations.txt and converted into 83098 lines
Data read from ./data/movie_lines.txt and converted into 304714 lines


#### Variable check

In [5]:
len(movie_titles), len(movie_conversations), len(movie_lines)

(618, 83098, 304714)

#### Code

In [6]:
# Prepare dictionary for all data
movie_title_list, movie_conversation_list, movie_lines_list = data_utils.prepare_data(movie_titles=movie_titles, movie_conversations=movie_conversations, movie_lines=movie_lines)

#### Variable check

In [7]:
len(movie_title_list), len(movie_conversation_list), len(movie_lines_list)

(617, 83097, 304713)

In [8]:
movie_title_list

[{'movie_id': 'm0',
  'name': '10 things i hate about you',
  'year': '1999',
  'rating': '6.90',
  'genre': ['comedy', 'romance']},
 {'movie_id': 'm1',
  'name': '1492: conquest of paradise',
  'year': '1992',
  'rating': '6.20',
  'genre': ['adventure', 'biography', 'drama', 'history']},
 {'movie_id': 'm2',
  'name': '15 minutes',
  'year': '2001',
  'rating': '6.10',
  'genre': ['action', 'crime', 'drama', 'thriller']},
 {'movie_id': 'm3',
  'name': '2001: a space odyssey',
  'year': '1968',
  'rating': '8.40',
  'genre': ['adventure', 'mystery', 'sci-fi']},
 {'movie_id': 'm4',
  'name': '48 hrs.',
  'year': '1982',
  'rating': '6.90',
  'genre': ['action', 'comedy', 'crime', 'drama', 'thriller']},
 {'movie_id': 'm5',
  'name': 'the fifth element',
  'year': '1997',
  'rating': '7.50',
  'genre': ['action', 'adventure', 'romance', 'sci-fi', 'thriller']},
 {'movie_id': 'm6',
  'name': '8mm',
  'year': '1999',
  'rating': '6.30',
  'genre': ['crime', 'mystery', 'thriller']},
 {'movie_

In [9]:
movie_conversation_list

[{'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L194', 'L195', 'L196', 'L197']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L198', 'L199']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L200', 'L201', 'L202', 'L203']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L204', 'L205', 'L206']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L207', 'L208']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L271', 'L272', 'L273', 'L274', 'L275']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L276', 'L277']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L280', 'L281']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L363', 'L364']},
 {'speaker1': 'u0',
  'speaker2': 'u2',
  'movie_id': 'm0',
  'line_ids': ['L365', 'L366']},
 {'spe

In [10]:
movie_lines_list

[{'line_id': 'L1045',
  'speaker': 'u0',
  'movie_id': 'm0',
  'character': 'BIANCA',
  'dialogue': 'They do not!'},
 {'line_id': 'L1044',
  'speaker': 'u2',
  'movie_id': 'm0',
  'character': 'CAMERON',
  'dialogue': 'They do to!'},
 {'line_id': 'L985',
  'speaker': 'u0',
  'movie_id': 'm0',
  'character': 'BIANCA',
  'dialogue': 'I hope so.'},
 {'line_id': 'L984',
  'speaker': 'u2',
  'movie_id': 'm0',
  'character': 'CAMERON',
  'dialogue': 'She okay?'},
 {'line_id': 'L925',
  'speaker': 'u0',
  'movie_id': 'm0',
  'character': 'BIANCA',
  'dialogue': "Let's go."},
 {'line_id': 'L924',
  'speaker': 'u2',
  'movie_id': 'm0',
  'character': 'CAMERON',
  'dialogue': 'Wow'},
 {'line_id': 'L872',
  'speaker': 'u0',
  'movie_id': 'm0',
  'character': 'BIANCA',
  'dialogue': "Okay -- you're gonna need to learn how to lie."},
 {'line_id': 'L871',
  'speaker': 'u2',
  'movie_id': 'm0',
  'character': 'CAMERON',
  'dialogue': 'No'},
 {'line_id': 'L870',
  'speaker': 'u0',
  'movie_id': 'm0',


#### Code

In [11]:
# Prepare dataframe from  the dictionary
movie_title_df = data_utils.dataframe_from_dict(data_dict_list=movie_title_list)
movie_conversation_df = data_utils.dataframe_from_dict(data_dict_list=movie_conversation_list)
movie_lines_df = data_utils.dataframe_from_dict(data_dict_list=movie_lines_list)

#### Variable check

In [12]:
movie_title_df.head()

Unnamed: 0,movie_id,name,year,rating,genre
0,m0,10 things i hate about you,1999,6.9,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,"[action, comedy, crime, drama, thriller]"


In [13]:
movie_conversation_df.head()

Unnamed: 0,speaker1,speaker2,movie_id,line_ids
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [14]:
movie_lines_df.head()

Unnamed: 0,line_id,speaker,movie_id,character,dialogue
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


#### Code

In [15]:
# Prepare genre dictionary
genre_dict = data_utils.get_genre_dict(movie_title_df=movie_title_df)

Genre dictionary prepared


#### Variable check

In [16]:
genre_dict

{'sci-fi': ['m3',
  'm5',
  'm9',
  'm12',
  'm14',
  'm15',
  'm34',
  'm41',
  'm47',
  'm48',
  'm50',
  'm55',
  'm57',
  'm57',
  'm58',
  'm63',
  'm67',
  'm68',
  'm74',
  'm74',
  'm75',
  'm95',
  'm96',
  'm97',
  'm100',
  'm107',
  'm113',
  'm122',
  'm125',
  'm126',
  'm126',
  'm134',
  'm176',
  'm179',
  'm184',
  'm187',
  'm189',
  'm189',
  'm191',
  'm192',
  'm193',
  'm194',
  'm195',
  'm196',
  'm197',
  'm198',
  'm199',
  'm200',
  'm201',
  'm211',
  'm213',
  'm214',
  'm215',
  'm221',
  'm226',
  'm232',
  'm236',
  'm237',
  'm253',
  'm255',
  'm260',
  'm283',
  'm301',
  'm304',
  'm314',
  'm315',
  'm317',
  'm319',
  'm321',
  'm328',
  'm338',
  'm339',
  'm343',
  'm345',
  'm346',
  'm358',
  'm365',
  'm388',
  'm409',
  'm410',
  'm410',
  'm411',
  'm411',
  'm413',
  'm421',
  'm433',
  'm434',
  'm440',
  'm447',
  'm457',
  'm472',
  'm473',
  'm473',
  'm478',
  'm489',
  'm521',
  'm526',
  'm528',
  'm529',
  'm530',
  'm531',
  'm540

#### Code

In [17]:
# Make dialogue dict for final dataset
dialogue_dict, conversation_data_df = data_utils.prepare_conversations(movie_lines_df=movie_lines_df, movie_conversation_df=movie_conversation_df, only_start=only_start)

Conversations prepared


#### Variable check

In [18]:
dialogue_dict

{'L1045': 'They do not!',
 'L1044': 'They do to!',
 'L985': 'I hope so.',
 'L984': 'She okay?',
 'L925': "Let's go.",
 'L924': 'Wow',
 'L872': "Okay -- you're gonna need to learn how to lie.",
 'L871': 'No',
 'L870': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': 'Like my fear of wearing pastels?',
 'L868': 'The "real you".',
 'L867': 'What good stuff?',
 'L866': "I figured you'd get to the good stuff eventually.",
 'L865': 'Thank God!  If I had to hear one more story about your coiffure...',
 'L864': "Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': 'What crap?',
 'L862': 'do you listen to this crap?',
 'L861': 'No...',
 'L860': 'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': 'You always been this selfish?',
 'L698': 'But',
 'L697': "Then that's all you had to say.",
 'L696': 'Well, no...',
 'L695': "You never wanted to go out with 'me, did y

In [19]:
conversation_data_df.head()

Unnamed: 0,movie_id,input,target
0,m0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,m0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,m0,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,m0,You're asking me out. That's so cute. What's ...,Forget it.
4,m0,"No, no, it's my fault -- we didn't have a prop...",Cameron.


#### Code

In [20]:
# Do cleaning of the text data
conversation_data_df['input'] = conversation_data_df['input'].apply(data_utils.clean_text)
conversation_data_df['target'] = conversation_data_df['target'].apply(data_utils.clean_text, add_tags=True)

#### Variable check

In [21]:
conversation_data_df.head()

Unnamed: 0,movie_id,input,target
0,m0,can we make this quick roxanne korrine and and...,START_ well i thought we would start with pron...
1,m0,well i thought we would start with pronunciati...,START_ not the hacking and gagging and spittin...
2,m0,not the hacking and gagging and spitting part ...,START_ okay then how bout we try out some fren...
3,m0,you are asking me out that is so cute what is ...,START_ forget it _END
4,m0,no no its my fault we did not have a proper in...,START_ cameron _END


In [22]:
conversation_data_df.describe()

Unnamed: 0,movie_id,input,target
count,221616,221616,221616
unique,617,187664,187033
top,m289,what,START_ what _END
freq,1192,1732,1601


#### Code

In [23]:
# Filtering data which are not in appropriate length
filtered_conversation_df = data_utils.filter_short_long(conversation_data_df=conversation_data_df, 
                                                        min_q_length=2, max_q_length=20, 
                                                        min_a_length=2, max_a_length=20)

33% filtered from original data


#### Variable check

In [24]:
filtered_conversation_df.head()

Unnamed: 0,movie_id,input,target
0,m0,well i thought we would start with pronunciati...,START_ not the hacking and gagging and spittin...
1,m0,not the hacking and gagging and spitting part ...,START_ okay then how bout we try out some fren...
2,m0,you are asking me out that is so cute what is ...,START_ forget it _END
3,m0,no no its my fault we did not have a proper in...,START_ cameron _END
4,m0,gosh if only we could find kat a boyfriend,START_ let me see what i can do _END


In [25]:
filtered_conversation_df.describe()

Unnamed: 0,movie_id,input,target
count,148207,148207,148207
unique,617,132259,121712
top,m299,i do not know,START_ what _END
freq,789,250,1311


## Prepare Vectorizer

#### Code

In [26]:
# Prepare text vectorizer object
Vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                output_mode="int",
                                output_sequence_length=max_length,
                                standardize=None)

In [27]:
# Adapt the text vectorizer for the dataset
Vectorizer.adapt(filtered_conversation_df['target'].to_numpy())

In [28]:
vocab_list = Vectorizer.get_vocabulary()

#### Variable check

In [29]:
Vectorizer.get_vocabulary()

['',
 '[UNK]',
 '_END',
 'START_',
 'you',
 'i',
 'not',
 'the',
 'to',
 'a',
 'is',
 'do',
 'it',
 'that',
 'what',
 'are',
 'me',
 'have',
 'of',
 'am',
 'and',
 'no',
 'we',
 'in',
 'he',
 'know',
 'will',
 'its',
 'for',
 'this',
 'my',
 'your',
 'was',
 'be',
 'on',
 'just',
 'did',
 'they',
 'but',
 'would',
 'with',
 'about',
 'like',
 'yes',
 'all',
 'get',
 'here',
 'yeah',
 'she',
 'well',
 'so',
 'him',
 'right',
 'oh',
 'how',
 'out',
 'think',
 'want',
 'why',
 'got',
 'can',
 'up',
 'go',
 'if',
 'there',
 'at',
 'one',
 'now',
 'her',
 'good',
 'going',
 'see',
 'where',
 'come',
 'who',
 'ca',
 'tell',
 'say',
 'then',
 'sure',
 'could',
 'were',
 'okay',
 'time',
 'an',
 'mean',
 'as',
 'look',
 'from',
 'back',
 'take',
 'been',
 'too',
 'some',
 'them',
 'his',
 'never',
 'when',
 'something',
 'really',
 'does',
 'us',
 'man',
 'or',
 'had',
 'way',
 'said',
 'maybe',
 'need',
 'should',
 'sir',
 'more',
 'very',
 'make',
 'down',
 'little',
 'any',
 'sorry',
 'noth

## Prepare traing and test datasets from subset of data

#### Code

In [30]:
# Filter only the comedy movies data
comedy_movies_list = genre_dict['comedy']

comedy_movie_line_df = filtered_conversation_df[filtered_conversation_df['movie_id'].isin(comedy_movies_list)][:data_subset]

#### Variable check

In [31]:
comedy_movie_line_df.head()

Unnamed: 0,movie_id,input,target
0,m0,well i thought we would start with pronunciati...,START_ not the hacking and gagging and spittin...
1,m0,not the hacking and gagging and spitting part ...,START_ okay then how bout we try out some fren...
2,m0,you are asking me out that is so cute what is ...,START_ forget it _END
3,m0,no no its my fault we did not have a proper in...,START_ cameron _END
4,m0,gosh if only we could find kat a boyfriend,START_ let me see what i can do _END


In [32]:
comedy_movie_line_df.describe()

Unnamed: 0,movie_id,input,target
count,20000,20000,20000
unique,68,18781,17476
top,m100,i do not know,START_ what _END
freq,644,38,190


#### Code

In [33]:
# Prepare data for datasets also remove the sentences with most unknown tokens
training_data, testing_data = data_utils.split_vectorize_filter_unk(conversation_data_df=comedy_movie_line_df, Vectorizer=Vectorizer, test_split=test_split, seed=random_seed)

Training data points: 16000
Test data points: 4000
10% filtered from training data points
After unknown token filters training data points: 14314


#### Variable check

In [34]:
training_data.keys(), testing_data.keys()

(dict_keys(['input', 'target', 'input_vectors', 'target_vectors']),
 dict_keys(['input', 'target', 'input_vectors', 'target_vectors']))

In [35]:
for x in training_data.keys():
    print('training', x, np.array(training_data[x]).shape, type(training_data[x]))
    print('testing', x, np.array(testing_data[x]).shape, type(testing_data[x]))

training input (14314,) <class 'list'>
testing input (4000,) <class 'numpy.ndarray'>
training target (14314,) <class 'list'>
testing target (4000,) <class 'numpy.ndarray'>
training input_vectors (14314, 20) <class 'list'>
testing input_vectors (4000, 20) <class 'tensorflow.python.framework.ops.EagerTensor'>
training target_vectors (14314, 20) <class 'list'>
testing target_vectors (4000, 20) <class 'tensorflow.python.framework.ops.EagerTensor'>


In [36]:
sample_index = 0

In [37]:
Vectorizer.get_vocabulary()[188]

'those'

In [38]:
training_data['input'][sample_index], testing_data['target'][sample_index]

('listen could you stop crying please',
 'START_ it sure is now eat it up you are gonna need your energy _END')

In [39]:
training_data['input_vectors'][sample_index], testing_data['target_vectors'][sample_index]

(<tf.Tensor: shape=(20,), dtype=int64, numpy=
 array([ 234,   80,    4,  213, 1666,  133,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int64)>,
 <tf.Tensor: shape=(20,), dtype=int64, numpy=
 array([  12,   79,   10,   67,  386,   12,   61,    4,   15,  120,  108,
          31, 1489,    2,    0,    0,    0,    0,    0,    0], dtype=int64)>)

In [40]:
comedy_movie_line_df['target'].to_numpy()[0]

'START_ not the hacking and gagging and spitting part please _END'

In [41]:
len(training_data['target_vectors'])
    

14314

In [42]:
training_data.keys()

dict_keys(['input', 'target', 'input_vectors', 'target_vectors'])

In [43]:
tf.keras.utils.to_categorical(tf.random.uniform(shape=(2,3)), 6).shape

(2, 3, 6)

In [44]:
tf.expand_dims(tf.constant(np.array(training_data['target_vectors'])), axis = -1)

<tf.Tensor: shape=(14314, 20, 1), dtype=int64, numpy=
array([[[ 21],
        [  5],
        [ 80],
        ...,
        [  0],
        [  0],
        [  0]],

       [[  5],
        [ 11],
        [  6],
        ...,
        [  0],
        [  0],
        [  0]],

       [[  1],
        [110],
        [ 24],
        ...,
        [  0],
        [  0],
        [  0]],

       ...,

       [[  5],
        [ 26],
        [ 63],
        ...,
        [  0],
        [  0],
        [  0]],

       [[ 43],
        [  2],
        [  0],
        ...,
        [  0],
        [  0],
        [  0]],

       [[ 14],
        [  2],
        [  0],
        ...,
        [  0],
        [  0],
        [  0]]], dtype=int64)>

#### Code

In [45]:
# Preparing datasets
train_inputs = np.array(training_data['input'])
train_targets = np.array(training_data['target'])
#train_vector_targets = tf.keras.utils.to_categorical(np.array(training_data['target_vectors']), max_vocab_length)
train_vector_targets = tf.expand_dims(tf.constant(np.array(training_data['target_vectors'])), axis=-1)

test_inputs = np.array(testing_data['input'])
test_targets = np.array(testing_data['target'])
#test_vector_targets = tf.keras.utils.to_categorical(np.array(testing_data['target_vectors']), max_vocab_length)
test_vector_targets = tf.expand_dims(tf.constant(np.array(testing_data['target_vectors'])), axis=-1)

#### Variable check

In [46]:
print(tf.__version__)

2.7.0


In [47]:
train_inputs.shape, train_targets.shape, train_vector_targets.shape

((14314,), (14314,), TensorShape([14314, 20, 1]))

In [48]:
test_inputs.shape, test_targets.shape, test_vector_targets.shape

((4000,), (4000,), TensorShape([4000, 20, 1]))

#### Code

In [49]:
# Preparing dataset for training and validation
train_data_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_targets))
train_lables_dataset = tf.data.Dataset.from_tensor_slices(train_vector_targets)
train_dataset = tf.data.Dataset.zip((train_data_dataset, train_lables_dataset))
train_dataset = train_dataset.batch(128).prefetch(tf.data.AUTOTUNE)

test_data_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_targets))
test_lables_dataset = tf.data.Dataset.from_tensor_slices(test_vector_targets)
test_dataset = tf.data.Dataset.zip((test_data_dataset, test_lables_dataset))
test_dataset = test_dataset.batch(128).prefetch(tf.data.AUTOTUNE)

## Model creation

### Model -1 Single layer LSTM

* Input for this model is string (question, answer first word(START_))
* Output will be the probability of the next word

In [50]:
# Creating embedding object for encoder and decoder models
EncoderEmbeddingLayer = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension, # 128
                                input_length=max_length,
                                mask_zero=True,
                                name='encoder_embedding_layer')

DecoderEmbeddingLayer = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension, # 128
                                input_length=max_length,
                                mask_zero=True,
                                name='decoder_embedding_layer')

In [51]:
# create encoder & decoder initial layers
EncoderInput = layers.Input(shape=(1,), dtype=tf.string)
encoder_vector = Vectorizer(EncoderInput)

DecoderInput = layers.Input(shape=(1,), dtype=tf.string)
decoder_vector = Vectorizer(DecoderInput)

In [52]:
# Create encoder
encoder_embeddings = EncoderEmbeddingLayer(encoder_vector)
EncoderLstmLayer = layers.LSTM(lstm_units, return_state=True, name='Encoder_LSTM')
encoder_lstm_outputs, state_h, state_c = EncoderLstmLayer(encoder_embeddings)
encoder_states = [state_h, state_c]

In [53]:
# Create Decoder
decoder_embeddings = DecoderEmbeddingLayer(decoder_vector)
DecoderLstmLayer = layers.LSTM(lstm_units, return_sequences=True, return_state=True, name='Decoder_LSTM')
decoder_lstm_outputs, _, _ = DecoderLstmLayer(decoder_embeddings, initial_state=encoder_states)
DecoderDenseLayer = layers.Dense(max_vocab_length, activation='softmax', name='Decoder_dense')
decoder_dense_outputs = DecoderDenseLayer(decoder_lstm_outputs)

EncDecModel = Model([EncoderInput, DecoderInput], decoder_dense_outputs)

In [54]:
# Compile model
EncDecModel.compile(loss=sparse_loss_fun,
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=['accuracy'])

In [55]:
# decoder at test time
EncModel = tf.keras.Model(EncoderInput, encoder_states)

DecoderStateInputH = layers.Input(shape=(lstm_units,))
DecoderStateInputC = layers.Input(shape=(lstm_units,))
decoder_states_inputs = [DecoderStateInputH, DecoderStateInputC]

decoder_vector_test = Vectorizer(DecoderInput)
dec_embedding_test = DecoderEmbeddingLayer(decoder_vector_test)

decoder_lstm_outputs_test, state_h_test, state_c_test = DecoderLstmLayer(dec_embedding_test, initial_state=decoder_states_inputs)
decoder_states_test = [state_h_test, state_c_test]
decoder_dense_outputs_test = DecoderDenseLayer(decoder_lstm_outputs_test)

DecModel = Model(
    inputs = [DecoderInput, decoder_states_inputs],
    outputs = [decoder_dense_outputs_test] + decoder_states_test)

#### Variable check

In [56]:
EncDecModel.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 20)          0           ['input_1[0][0]',                
 ization)                                                         'input_2[0][0]']                
                                                                                                  
 encoder_embedding_layer (Embed  (None, 20, 128)     1024000     ['text_vectorization[0][0]'] 

#### Code

In [57]:
model_history = EncDecModel.fit(train_dataset,
                epochs=epoch-40,
                validation_data=test_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Variable check

In [58]:
train_inputs[:10]

array(['listen could you stop crying please',
       'well what difference does it make where you are sitting',
       'he work for us', 'hi what are you doing', 'why would not it',
       'and someone from the epa is here to see you',
       'to customers approaching the bank from the rear',
       'neighbor doctor i live across the hall', 'you told me',
       'hey karen how ya doin'], dtype='<U132')

In [59]:
train_targets[:10]

array(['START_ no i could not _END',
       'START_ i do not want to have to sit next to snorri all that time _END',
       'START_ ahno sir he was here for a job interview _END',
       'START_ nothing _END', 'START_ i just do not think it would _END',
       'START_ the epa what is he want _END', 'START_ right _END',
       'START_ and you come into mrs ryans apartment as often as you please looking into various health matters _END',
       'START_ when _END', 'START_ what are you doing here _END'],
      dtype='<U130')

#### Code

In [60]:
def decoder_model_test(input_word: str, stat: list):
    decoder_vector_test = Vectorizer([input_word])
    dec_embedding_test = DecoderEmbeddingLayer(decoder_vector_test)
    decoder_lstm_outputs_test, state_h_test, state_c_test = DecoderLstmLayer(dec_embedding_test, initial_state=stat)
    decoder_dense_output_test = DecoderDenseLayer(decoder_lstm_outputs_test)
    word_idx = tf.argmax(decoder_dense_output_test[0, 0, :]).numpy()
    next_word = vocab_list[word_idx]
    stat = [state_h_test, state_c_test]
    return next_word, stat


In [61]:
human = 'hi'
stat = EncModel.predict([human])
next_word = 'START_'
stop_condition = True
bot_response = ""
while stop_condition:
    stat = [tf.constant(stat[0]), tf.constant(stat[1])]
    next_word, stat = decoder_model_test(next_word, stat)
    bot_response += next_word + ' '
    if next_word == '_END' or len(bot_response.split()) > max_length:
        stop_condition = False
print(bot_response)

you are a [UNK] _END 


In [62]:
end_convo = False
while not end_convo:
    # Getting the input from user
    human = input("Human: ")
    if human == 'END CONVO':
        end_convo = True
    # Encoding the input
    stat = EncModel.predict([human])
    next_word = 'START_'
    stop_condition = True
    bot_response = ""
    while stop_condition:
        stat = [tf.constant(stat[0]), tf.constant(stat[1])]
        next_word, stat = decoder_model_test(next_word, stat)
        bot_response += next_word + ' '
        if next_word == '_END' or len(bot_response.split()) > max_length:
            stop_condition = False
    print("KATTA:", bot_response)

KATTA: you are a [UNK] _END 
KATTA: you are a [UNK] _END 
KATTA: i am not you _END 


In [None]:
EncDecModel.save_weights('./model_weights/1LayerLstmComedy50epochs/EncDecModel1Weights')