# Chatbot KATTA V1
This chat bot is a general purpose chat bot which is trained on movies conversations.
* This is a try 1 with a base model

## Setting up the environments

In [1]:
# Importing the required modules.
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split

## Load the data
* Get the movie details
* Get the conversation details
* Get the conversation lines

In [2]:
# Reading the required data files
# Reading the movie details meta data
with open('./data/movie_titles_metadata.txt', 'r', encoding='utf-8', errors='ignore') as mtm:
    movie_titles = mtm.read().split('\n')

# Reading the conversation meta data
with open('./data/movie_conversations.txt', 'r', encoding='utf-8', errors='ignore') as mc:
    movie_conversations = mc.read().split('\n')

# Reading the conversation lines
with open('./data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore') as ml:
    movie_lines = ml.read().split('\n')

In [3]:
# Prepare dictionary for all data 
# Prepare dictionary for movie meta data
movie_title_list = []
for line in movie_titles:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_title_info = {}
    movie_info = line.split(' +++$+++ ')
    movie_title_info['movie_id'] = movie_info[0].strip()
    movie_title_info['name'] = movie_info[1].strip()
    movie_title_info['year'] = movie_info[2].strip()
    movie_title_info['rating'] = movie_info[3].strip()
    movie_title_info['genre'] = movie_info[-1][2:-2].strip().split("', '") # this is for splitting the genres from ['comedy', 'romance'] to a list
    movie_title_list.append(movie_title_info)

# Prepare dictionary for movie convo meta data
movie_conversation_list = []
for line in movie_conversations:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_conversation_info = {}
    conversation_info = line.split(' +++$+++ ')
    movie_conversation_info['speaker1'] = conversation_info[0].strip()
    movie_conversation_info['speaker2'] = conversation_info[1].strip()
    movie_conversation_info['movie_id'] = conversation_info[2].strip()
    movie_conversation_info['line_ids'] = conversation_info[-1][2:-2].strip().split("', '")# this is for splitting the conversation info from ['L198', 'L199'] to a list
    movie_conversation_list.append(movie_conversation_info)

# Prepare dictionary for movie dialogues
movie_lines_list = []
for line in movie_lines:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_line_info = {}
    line_info = line.split(' +++$+++ ')
    movie_line_info['line_id'] = line_info[0].strip()
    movie_line_info['speaker'] = line_info[1].strip()
    movie_line_info['movie_id'] = line_info[2].strip()
    movie_line_info['character'] = line_info[3].strip()
    movie_line_info['dialogue'] = line_info[-1].strip()
    movie_lines_list.append(movie_line_info)

In [4]:
# create dataframe for all the above dicts for better processing
movie_title_df = pd.DataFrame.from_dict(movie_title_list)
movie_conversation_df = pd.DataFrame.from_dict(movie_conversation_list)
movie_lines_df = pd.DataFrame.from_dict(movie_lines_list)

In [5]:
# Get the list of available genres from the whole dataset 
genres = movie_title_df['genre'].to_numpy()
genre_set = set()
for genre_list in genres:
    for genre in genre_list:
        if genre:
            genre_set.add(genre)


In [6]:
# Checking the count of movies in each genres and storing the movies with respect to their genres in the dictionary
genre_dict = {}
for genre_name in genre_set:
    genre_dict[genre_name] = []
for movie, genre_list in movie_title_df[['movie_id', 'genre']].to_numpy():
    for genre in genre_list:
        if genre:
            genre_dict[genre].append(movie)

### Variable & Data checks

In [53]:
for genre_name, genre_lists in genre_dict.items():
    print(f'Movies available in this {genre_name.upper()} genre', len(genre_lists))


Movies available in this FAMILY genre 17
Movies available in this MYSTERY genre 102
Movies available in this ACTION genre 168
Movies available in this CRIME genre 147
Movies available in this HISTORY genre 21
Movies available in this DOCUMENTARY genre 3
Movies available in this BIOGRAPHY genre 25
Movies available in this THRILLER genre 269
Movies available in this COMEDY genre 162
Movies available in this FANTASY genre 78
Movies available in this HORROR genre 99
Movies available in this ADVENTURE genre 116
Movies available in this DRAMA genre 320
Movies available in this ADULT genre 1
Movies available in this MUSIC genre 13
Movies available in this SCI-FI genre 120
Movies available in this WAR genre 23
Movies available in this ANIMATION genre 18
Movies available in this ROMANCE genre 132
Movies available in this SPORT genre 8
Movies available in this SHORT genre 5
Movies available in this MUSICAL genre 8
Movies available in this FILM-NOIR genre 4
Movies available in this WESTERN genre 

In [47]:
movie_title_df[['movie_id', 'genre']].head().to_numpy()

array([['m0', list(['comedy', 'romance'])],
       ['m1', list(['adventure', 'biography', 'drama', 'history'])],
       ['m2', list(['action', 'crime', 'drama', 'thriller'])],
       ['m3', list(['adventure', 'mystery', 'sci-fi'])],
       ['m4', list(['action', 'comedy', 'crime', 'drama', 'thriller'])]],
      dtype=object)

In [40]:
genre_set

{'action',
 'adult',
 'adventure',
 'animation',
 'biography',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'film-noir',
 'history',
 'horror',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci-fi',
 'short',
 'sport',
 'thriller',
 'war',
 'western'}

In [14]:
movie_title_df.head(3)

Unnamed: 0,movie_id,name,year,rating,genre
0,m0,10 things i hate about you,1999,6.9,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,"[action, crime, drama, thriller]"


In [18]:
movie_title_df.dtypes

movie_id    object
name        object
year        object
rating      object
genre       object
dtype: object

In [19]:
for i, l in enumerate(movie_title_df['genre'][:5]):
    print("list", l, "is", type(l))

list ['comedy', 'romance'] is <class 'list'>
list ['adventure', 'biography', 'drama', 'history'] is <class 'list'>
list ['action', 'crime', 'drama', 'thriller'] is <class 'list'>
list ['adventure', 'mystery', 'sci-fi'] is <class 'list'>
list ['action', 'comedy', 'crime', 'drama', 'thriller'] is <class 'list'>


In [11]:
movie_conversation_df.head(3)

Unnamed: 0,speaker1,speaker2,movie_id,line_ids
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"


In [20]:
movie_conversation_df.dtypes

speaker1    object
speaker2    object
movie_id    object
line_ids    object
dtype: object

In [21]:
for i, l in enumerate(movie_conversation_df['line_ids'][:5]):
    print("list", l, "is", type(l))

list ['L194', 'L195', 'L196', 'L197'] is <class 'list'>
list ['L198', 'L199'] is <class 'list'>
list ['L200', 'L201', 'L202', 'L203'] is <class 'list'>
list ['L204', 'L205', 'L206'] is <class 'list'>
list ['L207', 'L208'] is <class 'list'>


In [12]:
movie_lines_df.head(3)

Unnamed: 0,line_id,speaker,movie_id,character,dialogue
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.


In [26]:
## Test check
movie_conversation_df[['movie_id', 'line_ids']].head(3).to_numpy()

array([['m0', list(['L194', 'L195', 'L196', 'L197'])],
       ['m0', list(['L198', 'L199'])],
       ['m0', list(['L200', 'L201', 'L202', 'L203'])]], dtype=object)

In [30]:
movie_lines_df['dialogue'][movie_lines_df['line_id']=='L194'].to_numpy()[0]

'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'

## Prepare final datasets
Prepare data for model training

In [7]:
# Make conversation line dictionary for preparing the final dataset
dialogue_ids = movie_lines_df['line_id'].to_numpy()
dialogue_lines = movie_lines_df['dialogue'].to_numpy()
dialogue_dict = {}
for dialogue_id, dialogue_line in zip(dialogue_ids, dialogue_lines):
    dialogue_dict[dialogue_id] = dialogue_line

#len(dialogue_dict)

In [8]:
# prepare final/actual dictionary for creating the chat bot
# This dictionary will have the conversation wise data.
conversation_data_dict = {}
conversation_data_dict['movie_id'] = []
conversation_data_dict['input'] = []
conversation_data_dict['target'] = []
for movie_id, convo_list in movie_conversation_df[['movie_id', 'line_ids']].to_numpy():
    for convos in range(len(convo_list)-1):
        conversation_data_dict['movie_id'].append(movie_id)
        conversation_data_dict['input'].append(dialogue_dict[convo_list[convos]])
        conversation_data_dict['target'].append(dialogue_dict[convo_list[convos+1]])

# Prepare dataframe from the dictionary for better access
conversation_data_df = pd.DataFrame.from_dict(conversation_data_dict)

#### Variable & Data checks

In [43]:
conversation_data_df.head()

Unnamed: 0,movie_id,input,target
0,m0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,m0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,m0,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,m0,You're asking me out. That's so cute. What's ...,Forget it.
4,m0,"No, no, it's my fault -- we didn't have a prop...",Cameron.


In [27]:
p = np.array([[1,2,3,4], [3,4,5,6]])
np.unique(p)

array([1, 2, 3, 4, 5, 6])

In [46]:
# Save the created dataframes for future accesses.
conversation_data_df.to_csv('./data/processed_data/conversation_data.csv')
movie_conversation_df.to_csv('./data/processed_data/movie_conversation.csv')
movie_lines_df.to_csv('./data/processed_data/movie_lines.csv')
movie_title_df.to_csv('./data/processed_data/movie_title.csv')

## Data processing

In [9]:
# create a function for data cleaning
def clean_text(input_text: str, add_tags: bool = False, start_tag: str = 'START_ ', end_tag: str = ' _END', 
                remove_punc: bool = True, remove_symbols: str = '[^0-9a-z #+_]', ignore_words: list = [], 
                remove_numbers: bool = True, replace_word_from: list = [], replace_word_to: list = []):
    """
    Input: input_text (string), add_tags (optional - bool), start_tag (optional - string), end_tag (optional - string), 
            remove_punc (optional - bool), remove_symbols (optional - string), ignore_words (optional - list), remove_numbers (optional - bool),
            replace_word_from (optional - bool), replace_word_to (optional - bool)
    Output: cleaned text (string)
    description:
        This function will clean the input text given by removong the bad symbols, numbers, punctuations, extra spaces... and return back the cleaned text
        if the add_tags value is True (it's False by default) it will add the start tag and end tags at the start and end of the text
        we can also define the start_tag and end_tag values
    """
    def remove_punctuation(text: str):
        punctuation_list = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in punctuation_list)

    def remove_bad_symbols(text: str, symbols: str):
        bad_symbols = re.compile(symbols)
        return bad_symbols.sub(' ', text)

    def remove_extra_space(text: str):
        extra_space = re.compile(' +')
        return extra_space.sub(' ', text)

    def remove_ignore_words(text: str, ignore_words_list: list):
        for word in ignore_words_list:
            text = text.replace(word, " ")
        return text
    
    def remove_digits(text:str):
        remove_digit = str.maketrans('', '', string.digits)
        return text.translate(remove_digit)

    def replace_words(text: str, replace_word_list_from: list, replace_word_list_to: list):
        for from_word, to_word in zip(replace_word_list_from, replace_word_list_to):
            text = text.replace(str(from_word).lower(), str(to_word).lower())
        return text

    def add_start_end_tags(text: str):
        return 'START_ ' + text + ' _END'

    input_text = input_text.lower()
    input_text = replace_words(input_text, replace_word_from, replace_word_to) if replace_word_from and (len(replace_word_from) == len(replace_word_to)) else input_text
    input_text = remove_ignore_words(input_text, ignore_words) if ignore_words else input_text
    input_text = remove_digits(input_text) if remove_numbers else input_text
    input_text = remove_punctuation(input_text) if remove_punc else input_text
    input_text = remove_bad_symbols(input_text, remove_symbols) if remove_symbols else input_text
    input_text = add_start_end_tags(input_text) if add_tags else input_text
    input_text = remove_extra_space(input_text)
    return input_text.strip()



In [10]:
conversation_data_df['input'] = conversation_data_df['input'].apply(clean_text)
conversation_data_df['target'] = conversation_data_df['target'].apply(clean_text, add_tags=True)

#### Variable & Data check

In [11]:
test_sentence = "Hi There buddy! can't you speak? ... okay here's my number 786797655765. ring me"

In [12]:
clean_text(test_sentence, replace_word_from=["can't"], replace_word_to=['Cannot'])

'hi there buddy cannot you speak okay heres my number ring me'

In [13]:
conversation_data_df.head()

Unnamed: 0,movie_id,input,target
0,m0,can we make this quick roxanne korrine and and...,START_ well i thought wed start with pronuncia...
1,m0,well i thought wed start with pronunciation if...,START_ not the hacking and gagging and spittin...
2,m0,not the hacking and gagging and spitting part ...,START_ okay then how bout we try out some fren...
3,m0,youre asking me out thats so cute whats your n...,START_ forget it _END
4,m0,no no its my fault we didnt have a proper intr...,START_ cameron _END


## Model building
* first we will build model only with the comedy movies

### Model 1 
This model will only include comedy movies.
This model architecture is inspired from https://medium.com/analytics-vidhya/machine-translation-encoder-decoder-model-7e4867377161

This is a Encoder - decoder model
* Encoder will be trained with input data
* Decoder will be trained using target data (differently - during traing)
* During predictions the layers from decoder is used

In [15]:
# Get only the comedy movies
comedy_movies_list = genre_dict['comedy']

# filter only the comedy movies from total dataframe
comedy_movie_line_df = conversation_data_df[conversation_data_df['movie_id'].isin(comedy_movies_list)][:30000]

In [16]:
comedy_movie_line_df.shape, conversation_data_df.shape

((30000, 3), (221616, 3))

In [133]:
# Defining parameters text vectorizer & creating text vectorizer 
max_vocab_length = 10000
max_length = 20
text_vectorizer = layers.experimental.preprocessing.TextVectorization(
                    max_tokens=max_vocab_length,
                    output_mode="int",
                    output_sequence_length=max_length,
                    standardize=None)

In [134]:
# Adapting the training data for preparing the final dictionary
text_vectorizer.adapt(comedy_movie_line_df['target'].to_numpy())

In [327]:
# Creating emmbedding object
embedding_output_dimension = 128
enc_embedding = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension,
                                #input_length=max_length,
                                mask_zero=True)

In [328]:
# Create encoder
lstm_units = 64
encoder_inputs = layers.Input(shape=(1,), dtype=tf.string)
encoder_vector = text_vectorizer(encoder_inputs)
enc_emd = enc_embedding(encoder_vector)
encoder_lstm = layers.LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emd)
encoder_states = [state_h, state_c]

In [329]:
# create embedding layer for decoder
dec_embedding = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension, # 128
                                #input_length=max_length,
                                mask_zero=True)

In [330]:
# Create decoder
decoder_inputs = layers.Input(shape=(None,))
#decoder_vector = text_vectorizer(decoder_inputs)
dec_emb = dec_embedding(decoder_inputs)
decoder_lstm = layers.LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = layers.Dense(max_vocab_length, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model_train = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [364]:
model_train.compile(loss='categorical_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=['mse'])

In [332]:
model_train.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_14 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_5 (TextVect  (None, 20)          0           ['input_14[0][0]']               
 orization)                                                                                       
                                                                                                  
 input_15 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_12 (Embedding)       (None, 20, 128)      1280000     ['text_vectorization_5[3][0

In [363]:
# splitting data for training and validation
train_inputs, test_inputs, train_targets, test_targets = train_test_split(comedy_movie_line_df['input'].to_numpy(),
                                                                            comedy_movie_line_df['target'].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)
len(train_inputs), len(test_inputs), len(train_targets), len(test_targets)

(27000, 3000, 27000, 3000)

In [334]:
# Converting the output text to vectors for training the model
train_vector_targets = text_vectorizer(train_targets)
test_vector_targets = text_vectorizer(test_targets)

#vector_targets

In [335]:
train_targets.shape, train_vector_targets.shape

((27000,), TensorShape([27000, 20]))

#### Variable & data check

In [160]:
train_inputs

array(['yeah well dont let it get out', 'pretty advanced isnt it',
       'what do you think of me', ..., 'five bills',
       'sir before you boot me i just want to explain i mean okay you got a goatguy with a hook for a head',
       'that was your people magazine with the letters cut out wasnt it'],
      dtype=object)

In [144]:
len(text_vectorizer.get_vocabulary())

10000

In [145]:
test_text = 'START_ hi there _END'

In [146]:
text_vectorizer([test_text])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  3, 341,  69,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]], dtype=int64)>

In [56]:
for t, word in enumerate(test_text.split()):
    if t<len(test_text.split()) -1:
        print(word)
    if t>0:
        print('Hello', word)


START_
hi
Hello hi
there
Hello there
Hello _END


In [52]:
train_vector_targets[:128].shape

TensorShape([128, 20])

In [49]:
max_length

20

In [140]:
train_vector_targets[0]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([    2,    18,   216,    43,     6,   183,    11,   119,     4,
         653,    47, 10117,  2374,    49, 13934,  1529,    11,   245,
           3,     0], dtype=int64)>

In [146]:
text_vectorizer('hi there')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([363,  67,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int64)>

In [136]:
len(train_vector_targets)

61668

In [135]:
text_vectorizer.get_vocabulary()[67]

'there'

In [148]:
a=[1,2,3,4,5,6,7,8,9]
a[0] = 10
a

[10, 2, 3, 4, 5, 6, 7, 8, 9]

In [182]:
train_vector_targets

<tf.Tensor: shape=(61668, 20), dtype=int64, numpy=
array([[   2,   18,  216, ...,  245,    3,    0],
       [   2,  152,   52, ...,    0,    0,    0],
       [   2, 3149,  143, ...,    0,    0,    0],
       ...,
       [   2,    9,   34, ...,    0,    0,    0],
       [   2,  285,   31, ...,    0,    0,    0],
       [   2, 6499,    3, ...,    0,    0,    0]], dtype=int64)>

In [188]:
temp = []
for x in train_vector_targets:
    temp = tf.one_hot(x, depth=max_vocab_length)
    break


np.array(temp).shape

(20, 15000)

In [82]:
train_vector_targets.shape

TensorShape([27000, 20])

In [105]:
np.where(train_vector_targets[0][:6].numpy() == 0)[0].size

0

In [101]:
np.where(train_vector_targets[0].numpy() == 0)[0].size

14

In [106]:
np.where(train_vector_targets[0].numpy() == 3)[0][0]

5

In [150]:
train_vector_targets[0].numpy().size -1

19

In [148]:
text_vectorizer.get_vocabulary()[:5]

['', '[UNK]', '_END', 'START_', 'you']

In [151]:
text_vectorizer.get_vocabulary().index('START_')

3

In [152]:
train_vector_targets[0]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  3,  43, 106,  27, 479,   2,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int64)>

In [156]:
train_vector_targets[0].numpy()[1:], train_vector_targets[0].numpy()[1:].shape

(array([ 43, 106,  27, 479,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0], dtype=int64),
 (19,))

In [158]:
np.append(train_vector_targets[0].numpy()[1:], [0]), np.append(train_vector_targets[0].numpy()[1:], [0]).shape

(array([ 43, 106,  27, 479,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0], dtype=int64),
 (20,))

In [178]:
start_index = text_vectorizer.get_vocabulary().index('START_')
end_index = text_vectorizer.get_vocabulary().index('_END')
start_index, end_index

(3, 2)

In [179]:
text_vectorizer.get_vocabulary()[:5]

['', '[UNK]', '_END', 'START_', 'you']

In [183]:
train_vector_targets[0].numpy()

array([  3,  43, 106,  27, 479,   2,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int64)

In [239]:
np.where(train_vector_targets[0].numpy() == end_index)[0][0]

5

In [236]:
len(train_vector_targets[0])

20

In [196]:
train_vector_targets[0].shape[0]

20

In [191]:
train_vector_targets[0].shape
np.zeros(2, train_vector_targets[0].shape)

TypeError: Cannot interpret 'TensorShape([20])' as a data type

In [197]:
len(text_vectorizer.get_vocabulary())

10000

#### code

In [336]:
# Preparing generator function for fetching dataset
def batch_data_generator(x_vec, y_vec, vocab_list: list, batch_size: int = 128, ):
    while True:
        for i in range(0, len(x_vec), batch_size):
            encoder_input_data = x_vec[i:i+batch_size]
            decoder_input_data = np.zeros((batch_size, y_vec[0].shape[0]), dtype=int) #y_vec[i:i+batch_size]
            decoder_target_data = np.zeros((batch_size, y_vec[0].shape[0], len(vocab_list)), dtype=int) #y_vec[i:i+batch_size] #tf.zeros((batch_size, max_length, max_vocab_length), dtype=tf.float32)
            start_index = vocab_list.index('START_')
            end_index = vocab_list.index('_END')
            all_zero = np.zeros(len(vocab_list))
            end_vector = np.zeros(len(vocab_list))
            end_vector[end_index] = 1
            for j, target_vector in enumerate(y_vec[i:i+batch_size]):
                #print(target_vector, j)
                #print(decoder_target_data[j].shape)
                #print(np.append(decoder_target_data[j][1:], [0]))
                #print(decoder_target_data[j])
                closing_index = np.where(target_vector.numpy() == end_index)[0].size
                max_index = len(target_vector.numpy()) - 1
                if closing_index:
                    max_index = np.where(target_vector.numpy() == end_index)[0][0]
                vector_length = len(target_vector.numpy()) -1
                #print(vector_length)
                #print('max_index', max_index)
                for t, idx in enumerate(target_vector.numpy()):
                    #print(idx)
                    if idx == end_index:
                        #print('end ', t, idx)
                        decoder_input_data[j][t] = 0
                    else:
                        decoder_input_data[j][t] = idx
                    if t == max_index:
                        #print(t)
                        decoder_target_data[j][t-1][idx] = 1
                    elif t > 0:
                        decoder_target_data[j][t-1][idx] = 1
                    if t == vector_length:
                        decoder_target_data[j][t][idx] = 1
                    #decoder_target_data[j] = np.append(decoder_target_data[j][1:], all_zero)
                #print(decoder_input_data[j])
            yield ([encoder_input_data, decoder_input_data], decoder_target_data)
            #return ([encoder_input_data, decoder_input_data], decoder_target_data)

#### variable & Data check

In [311]:
max_index = 5
(0,3), (1,43), (2,106), (3,27), (4,479), (5, 2), (6,0)

((0, 3), (1, 43), (2, 106), (3, 27), (4, 479), (5, 2), (6, 0))

In [312]:
([test_in,text_in1], test_out) = batch_data_generator(train_inputs[:2], train_vector_targets[:2],vocab_list=text_vectorizer.get_vocabulary(), batch_size=1)

19
5


In [313]:
test_in.shape, text_in1.shape, test_out.shape

((1,), (1, 20), (1, 20, 10000))

In [314]:
test_in

array(['yeah well dont let it get out'], dtype=object)

In [315]:
train_vector_targets[:1]

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  3,  43, 106,  27, 479,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]], dtype=int64)>

In [316]:
text_in1

array([[  3,  43, 106,  27, 479,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])

In [317]:
test_out.shape

(1, 20, 10000)

In [318]:
test_out

array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]])

In [321]:
np.where(test_out[0][4] == 1)

(array([2], dtype=int64),)

In [262]:
test_out[1]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
# Preparing dataset for training and validation
#train_data_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_targets))
#train_lables_dataset = tf.data.Dataset.from_tensor_slices(train_vector_targets)
#train_dataset = tf.data.Dataset.zip((train_data_dataset, train_lables_dataset))
#train_dataset = train_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

#test_data_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_targets))
#test_lables_dataset = tf.data.Dataset.from_tensor_slices(test_vector_targets)
#test_dataset = tf.data.Dataset.zip((test_data_dataset, test_lables_dataset))
#test_dataset = test_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

In [47]:
#train_dataset, test_dataset

(<PrefetchDataset shapes: (((None,), (None,)), (None, 20)), types: ((tf.string, tf.string), tf.int64)>,
 <PrefetchDataset shapes: (((None,), (None,)), (None, 20)), types: ((tf.string, tf.string), tf.int64)>)

In [200]:
len(train_dataset)

964

#### code

In [367]:
training_batch = batch_data_generator(train_inputs, train_vector_targets, vocab_list=text_vectorizer.get_vocabulary(), batch_size=32)
test_batch = batch_data_generator(test_inputs, test_vector_targets, vocab_list=text_vectorizer.get_vocabulary(), batch_size=32)

In [368]:
model_train_history = model_train.fit(training_batch,
                                        steps_per_epoch=64,
                                        epochs=5,
                                        validation_data=test_batch,
                                        validation_steps=16)

Epoch 1/5
Epoch 2/5

InvalidArgumentError:    Incompatible shapes: [128,256] vs. [56,256]
	 [[{{node add}}]]
	 [[model_7/lstm_14/PartitionedCall]] [Op:__inference_test_function_424058]

Function call stack:
test_function -> test_function -> test_function


In [340]:
# decoder at test time
encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

decoder_state_input_h = layers.Input(shape=(lstm_units,))
decoder_state_input_c = layers.Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_embedding(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [341]:
vocab_list = text_vectorizer.get_vocabulary()

In [342]:
vocab_list[:10]

['', '[UNK]', '_END', 'START_', 'you', 'i', 'the', 'to', 'a', 'it']

In [353]:
def chat(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0][0] = vocab_list.index('START_')
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq]+states_value)
        print(output_tokens.shape)
        print(output_tokens)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = vocab_list[sampled_token_index]
        print(sampled_char)
        decoded_sentence += ' '+sampled_char
        if (sampled_char=='_END') or len(decoded_sentence)>19:
            stop_condition = True
        target_seq = np.zeros((1,1))
        target_seq[0][0] = sampled_token_index
        states_value = [h,c]
    return decoded_sentence

In [354]:
train_inputs[:10]

array(['yeah well dont let it get out', 'pretty advanced isnt it',
       'what do you think of me', 'the poor thing six years',
       'are you waiting for a bus',
       'im going to change for dinner ill see you shortly', 'for what',
       'why is it', 'having fun',
       'yes so do i but i think he is a little oldfashioned like a puff harold'],
      dtype=object)

In [355]:
train_inputs[0]

'yeah well dont let it get out'

In [356]:
chat([train_inputs[2]])

_END


' _END'