In [0]:
import tensorflow as tf
import numpy as np
import re

In [0]:
cd drive/My\ Drive

/content/drive/My Drive


In [0]:
#reading data from the movie_conversations.txt file
movie_conversations = open('movie_conversations.txt','r')
conversations = []    #contains all the conversations as list elements ( list within a list )
for i in movie_conversations.readlines():
  a = i.split('+++$+++')
  b = a[3].split(', ') 
  conv= []
  for j in b:
    conv.append(re.sub(r'[^\w]', ' ',j).replace(" ",""))
  conversations.append(conv)
conversations[:5]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208']]

In [0]:
#reading data from the movie_lines.txt
movie_lines = open('movie_lines.txt','rb')
code_vs_dialogue_list = dict()
for line in movie_lines:
  i = line.decode(encoding='utf-8',errors='ignore')
  a = i.split('+++$+++')
  conv_code = a[0].replace(" ","")
  dialogue = a[4].replace("\n","").strip()
  code_vs_dialogue_list.update({conv_code:dialogue})

In [0]:
def find_dialogue(x):
    return code_vs_dialogue_list[x]

In [0]:
questions = []
answers = []
for i in conversations:
  for k in range(len(i)-1):
    questions.append(find_dialogue(i[k]))
    answers.append(find_dialogue(i[k+1])) 

In [0]:
answers_with_tags = []
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( '<START> ' + answers[i] + ' <END>' )
    else:
        questions.pop( i )

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers_with_tags)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

55227

In [0]:
upper_limit = 20
tokenized_questions = tokenizer.texts_to_sequences(questions)
tokenized_answers = tokenizer.texts_to_sequences( answers_with_tags )
final_tokens_questions = []
final_tokens_answers = []
for i in range(len(tokenized_questions)):
  if len(tokenized_questions[i])<=upper_limit:
    final_tokens_questions.append(tokenized_questions[i])
    final_tokens_answers.append(tokenized_answers[i])

In [0]:
#encoder_input_data_preprocessing
#maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
#padded_questions = tf.keras.preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
padded_questions = tf.keras.preprocessing.sequence.pad_sequences( final_tokens_questions , maxlen=upper_limit , padding='post' )
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape )

(195670, 20)


In [0]:
# decoder_input_data
#tokenized_answers = tokenizer.texts_to_sequences( answers_with_tags )
#maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = tf.keras.preprocessing.sequence.pad_sequences( final_tokens_answers , maxlen=upper_limit , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape  )

(195670, 20)


In [0]:
for i in range(len(final_tokens_answers)) :
    final_tokens_answers[i] = final_tokens_answers[i][1:]

In [0]:
# decoder_output_data
#tokenized_answers = tokenizer.texts_to_sequences( answers_with_tags )
#maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_decout_answers = tf.keras.preprocessing.sequence.pad_sequences( final_tokens_answers , maxlen= upper_limit, padding='post' )
padded_decout_answers.shape

(195670, 20)

In [0]:
#Hyperparameters
embedding_dim = 256
lstm_units = 1024

In [0]:
#training cell with training model created.
inference_encoder_inputs = tf.keras.layers.Input(shape=(None,)) #'''batch_size=batch_size''' )
embedded_encoder_inputs = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(inference_encoder_inputs)
encoder_output,state_h,state_c = tf.keras.layers.LSTM(lstm_units, return_state=True,return_sequences=True)(embedded_encoder_inputs) # '''input_shape=(batch_size,input_length,embedding_dim)'''
encoder_states = [state_h,state_c]

inference_decoder_inputs = tf.keras.layers.Input(shape=(None,)) #''',batch_size=batch_size''')
embedded_decoder_inputs = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)(inference_decoder_inputs)
decoder_lstm_layer = tf.keras.layers.LSTM(lstm_units, return_state = True, return_sequences=True) #''',input_shape=(batch_size,input_length,embedding_dim)'''
decoder_output,_,_ = decoder_lstm_layer(embedded_decoder_inputs, initial_state = encoder_states)
decoder_output = tf.reshape(decoder_output,(-1,decoder_output.shape[2]))
dense_layer = tf.keras.layers.Dense(vocab_size, activation=tf.keras.activations.softmax)
decoder_output = dense_layer(decoder_output)

training_model = tf.keras.models.Model([inference_encoder_inputs,inference_decoder_inputs],decoder_output)
training_model.compile(optimizer = tf.keras.optimizers.RMSprop(), loss = 'sparse_categorical_crossentropy')

training_model.summary()

# Some important points - 
# raw inputs and outputs are used to define the model ( without embedding ). 
# NEVER overwrite tensors, because the LSTM layer requires brand new embedded layer


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    14138112    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    14138112    input_2[0][0]                    
______________________________________________________________________________________________

In [0]:
training_model.load_weights("training_2/cp.ckpt")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc66c251dd8>

In [0]:
import os
checkpoint_path = "training_2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


In [0]:
training_model.fit([encoder_input_data,decoder_input_data],padded_decout_answers,batch_size=170, epochs = 2, callbacks=[cp_callback])
training_model.save('trainingmodel_2.h5')

Epoch 1/2
Epoch 00001: saving model to training_2/cp.ckpt
Epoch 2/2
Epoch 00002: saving model to training_2/cp.ckpt


In [0]:
def make_inference_models():
  inference_encoder_model = tf.keras.models.Model(inference_encoder_inputs,encoder_states)

  decoder_state_input_h = tf.keras.layers.Input(shape=(lstm_units, ))
  decoder_state_input_c = tf.keras.layers.Input(shape=(lstm_units, ))
  decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]

  decoder_output,state_h,state_c = decoder_lstm_layer(embedded_decoder_inputs, initial_state = decoder_states_inputs)
  decoder_states = [state_h,state_c]
  decoder_output = dense_layer(decoder_output)

  inference_decoder_model = tf.keras.models.Model([inference_decoder_inputs]+decoder_states_inputs, [decoder_output]+decoder_states)

  return inference_encoder_model,inference_decoder_model


In [0]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return tf.keras.preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=upper_limit , padding='post')

In [0]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > upper_limit:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter question : hi
 hi end
Enter question : how are you
 fine end
Enter question : good to know
 i don't know what you're talking about end
Enter question : you are smart
 yes end
Enter question : okay then bye
 bye end
Enter question : see you 
 yeah end
Enter question : good night
 good night end
Enter question : good morning
 good morning end
Enter question : good evening 
 good night end
Enter question : you are dumb 
 no end
