In [1]:
# Importing the required modules.
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split

In [2]:
# Reading the required data files
# Reading the movie details meta data
with open('./data/movie_titles_metadata.txt', 'r', encoding='utf-8', errors='ignore') as mtm:
    movie_titles = mtm.read().split('\n')

# Reading the conversation meta data
with open('./data/movie_conversations.txt', 'r', encoding='utf-8', errors='ignore') as mc:
    movie_conversations = mc.read().split('\n')

# Reading the conversation lines
with open('./data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore') as ml:
    movie_lines = ml.read().split('\n')

In [3]:
# Prepare dictionary for all data 
# Prepare dictionary for movie meta data
movie_title_list = []
for line in movie_titles:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_title_info = {}
    movie_info = line.split(' +++$+++ ')
    movie_title_info['movie_id'] = movie_info[0].strip()
    movie_title_info['name'] = movie_info[1].strip()
    movie_title_info['year'] = movie_info[2].strip()
    movie_title_info['rating'] = movie_info[3].strip()
    movie_title_info['genre'] = movie_info[-1][2:-2].strip().split("', '") # this is for splitting the genres from ['comedy', 'romance'] to a list
    movie_title_list.append(movie_title_info)

# Prepare dictionary for movie convo meta data
movie_conversation_list = []
for line in movie_conversations:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_conversation_info = {}
    conversation_info = line.split(' +++$+++ ')
    movie_conversation_info['speaker1'] = conversation_info[0].strip()
    movie_conversation_info['speaker2'] = conversation_info[1].strip()
    movie_conversation_info['movie_id'] = conversation_info[2].strip()
    movie_conversation_info['line_ids'] = conversation_info[-1][2:-2].strip().split("', '")# this is for splitting the conversation info from ['L198', 'L199'] to a list
    movie_conversation_list.append(movie_conversation_info)

# Prepare dictionary for movie dialogues
movie_lines_list = []
for line in movie_lines:
    if not line:
        continue # for identifying and ignoring empty lines
    movie_line_info = {}
    line_info = line.split(' +++$+++ ')
    movie_line_info['line_id'] = line_info[0].strip()
    movie_line_info['speaker'] = line_info[1].strip()
    movie_line_info['movie_id'] = line_info[2].strip()
    movie_line_info['character'] = line_info[3].strip()
    movie_line_info['dialogue'] = line_info[-1].strip()
    movie_lines_list.append(movie_line_info)

In [4]:
# create dataframe for all the above dicts for better processing
movie_title_df = pd.DataFrame.from_dict(movie_title_list)
movie_conversation_df = pd.DataFrame.from_dict(movie_conversation_list)
movie_lines_df = pd.DataFrame.from_dict(movie_lines_list)

In [5]:
# Get the list of available genres from the whole dataset 
genres = movie_title_df['genre'].to_numpy()
genre_set = set()
for genre_list in genres:
    for genre in genre_list:
        if genre:
            genre_set.add(genre)


In [6]:
# Checking the count of movies in each genres and storing the movies with respect to their genres in the dictionary
genre_dict = {}
for genre_name in genre_set:
    genre_dict[genre_name] = []
for movie, genre_list in movie_title_df[['movie_id', 'genre']].to_numpy():
    for genre in genre_list:
        if genre:
            genre_dict[genre].append(movie)

In [7]:
# Make conversation line dictionary for preparing the final dataset
dialogue_ids = movie_lines_df['line_id'].to_numpy()
dialogue_lines = movie_lines_df['dialogue'].to_numpy()
dialogue_dict = {}
for dialogue_id, dialogue_line in zip(dialogue_ids, dialogue_lines):
    dialogue_dict[dialogue_id] = dialogue_line

#len(dialogue_dict)

In [8]:
# prepare final/actual dictionary for creating the chat bot
# This dictionary will have the conversation wise data.
conversation_data_dict = {}
conversation_data_dict['movie_id'] = []
conversation_data_dict['input'] = []
conversation_data_dict['target'] = []
for movie_id, convo_list in movie_conversation_df[['movie_id', 'line_ids']].to_numpy():
    for convos in range(len(convo_list)-1):
        conversation_data_dict['movie_id'].append(movie_id)
        conversation_data_dict['input'].append(dialogue_dict[convo_list[convos]])
        conversation_data_dict['target'].append(dialogue_dict[convo_list[convos+1]])

# Prepare dataframe from the dictionary for better access
conversation_data_df = pd.DataFrame.from_dict(conversation_data_dict)

In [9]:
# create a function for data cleaning
def clean_text(input_text: str, add_tags: bool = False, start_tag: str = 'START_ ', end_tag: str = ' _END', 
                remove_punc: bool = True, remove_symbols: str = '[^0-9a-z #+_]', ignore_words: list = [], 
                remove_numbers: bool = True, replace_word_from: list = [], replace_word_to: list = []):
    """
    Input: input_text (string), add_tags (optional - bool), start_tag (optional - string), end_tag (optional - string), 
            remove_punc (optional - bool), remove_symbols (optional - string), ignore_words (optional - list), remove_numbers (optional - bool),
            replace_word_from (optional - bool), replace_word_to (optional - bool)
    Output: cleaned text (string)
    description:
        This function will clean the input text given by removong the bad symbols, numbers, punctuations, extra spaces... and return back the cleaned text
        if the add_tags value is True (it's False by default) it will add the start tag and end tags at the start and end of the text
        we can also define the start_tag and end_tag values
    """
    def remove_punctuation(text: str):
        punctuation_list = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in punctuation_list)

    def remove_bad_symbols(text: str, symbols: str):
        bad_symbols = re.compile(symbols)
        return bad_symbols.sub(' ', text)

    def remove_extra_space(text: str):
        extra_space = re.compile(' +')
        return extra_space.sub(' ', text)

    def remove_ignore_words(text: str, ignore_words_list: list):
        for word in ignore_words_list:
            text = text.replace(word, " ")
        return text
    
    def remove_digits(text:str):
        remove_digit = str.maketrans('', '', string.digits)
        return text.translate(remove_digit)

    def replace_words(text: str, replace_word_list_from: list, replace_word_list_to: list):
        for from_word, to_word in zip(replace_word_list_from, replace_word_list_to):
            text = text.replace(str(from_word).lower(), str(to_word).lower())
        return text

    def add_start_end_tags(text: str):
        return 'START_ ' + text + ' _END'

    input_text = input_text.lower()
    input_text = replace_words(input_text, replace_word_from, replace_word_to) if replace_word_from and (len(replace_word_from) == len(replace_word_to)) else input_text
    input_text = remove_ignore_words(input_text, ignore_words) if ignore_words else input_text
    input_text = remove_digits(input_text) if remove_numbers else input_text
    input_text = remove_punctuation(input_text) if remove_punc else input_text
    input_text = remove_bad_symbols(input_text, remove_symbols) if remove_symbols else input_text
    input_text = add_start_end_tags(input_text) if add_tags else input_text
    input_text = remove_extra_space(input_text)
    return input_text.strip()



In [10]:
conversation_data_df['input'] = conversation_data_df['input'].apply(clean_text)
conversation_data_df['target'] = conversation_data_df['target'].apply(clean_text, add_tags=True)

In [11]:
# Get only the comedy movies
comedy_movies_list = genre_dict['comedy']

# filter only the comedy movies from total dataframe
comedy_movie_line_df = conversation_data_df[conversation_data_df['movie_id'].isin(comedy_movies_list)]

In [12]:
# splitting data for training and validation
train_inputs, test_inputs, train_targets, test_targets = train_test_split(comedy_movie_line_df['input'].to_numpy(),
                                                                            comedy_movie_line_df['target'].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)
len(train_inputs), len(test_inputs), len(train_targets), len(test_targets)

(61668, 6852, 61668, 6852)

In [13]:
# Defining parameters text vectorizer & creating text vectorizer 
max_vocab_length = 5000
max_length = 20
text_vectorizer = layers.experimental.preprocessing.TextVectorization(
                    max_tokens=max_vocab_length,
                    output_mode="int",
                    output_sequence_length=max_length,
                    standardize=None)

In [14]:
# Adapting the training data for preparing the final dictionary
text_vectorizer.adapt(comedy_movie_line_df['target'].to_numpy())

In [15]:
# Converting the output text to vectors for training the model
train_vector_targets = text_vectorizer(train_targets)
test_vector_targets = text_vectorizer(test_targets)

In [16]:
test_vector_targets

<tf.Tensor: shape=(6852, 20), dtype=int64, numpy=
array([[  3,   4,   1, ...,   0,   0,   0],
       [  3,  91,   4, ...,   0,   0,   0],
       [  3,   4,  90, ...,   0,   0,   0],
       ...,
       [  3, 130,  14, ...,   0,   0,   0],
       [  3,  97, 395, ...,   0,   0,   0],
       [  3,  41,   4, ...,   0,   0,   0]], dtype=int64)>

In [17]:
from tensorflow.keras.utils import to_categorical

In [18]:
to_categorical(train_vector_targets[:15000], 8000).shape

(15000, 20, 8000)

In [19]:
# Preparing generator function for fetching dataset
def batch_data_generator(x_vec, y_vec, vocab_list: list, batch_size: int = 128, ):
    while True:
        for i in range(0, len(x_vec), batch_size):
            encoder_input_data = x_vec[i:i+batch_size]
            decoder_input_data = np.zeros((batch_size, y_vec[0].shape[0]), dtype=int) #y_vec[i:i+batch_size]
            decoder_target_data = np.zeros((batch_size, y_vec[0].shape[0], len(vocab_list)), dtype=int) #y_vec[i:i+batch_size] #tf.zeros((batch_size, max_length, max_vocab_length), dtype=tf.float32)
            start_index = vocab_list.index('START_')
            unk_index = vocab_list.index('[UNK]')
            end_index = vocab_list.index('_END')
            all_zero = np.zeros(len(vocab_list))
            end_vector = np.zeros(len(vocab_list))
            end_vector[end_index] = 1
            for j, target_vector in enumerate(y_vec[i:i+batch_size]):
                closing_index = np.where(target_vector.numpy() == end_index)[0].size
                max_index = len(target_vector.numpy()) - 1
                if closing_index:
                    max_index = np.where(target_vector.numpy() == end_index)[0][0]
                vector_length = len(target_vector.numpy()) -1
                for t, idx in enumerate(target_vector.numpy()):
                    if idx == end_index:
                        decoder_input_data[j][t] = 0
                    else:
                        decoder_input_data[j][t] = idx
                    if t == max_index:
                        if idx == end_index:
                            decoder_target_data[j][t-1][0] = 1
                        else:
                            decoder_target_data[j][t-1][idx] = 1
                    elif idx == unk_index:
                        decoder_target_data[j][t-1][0] = 1
                    elif t > 0:
                        decoder_target_data[j][t-1][idx] = 1
                    if t == vector_length:
                        decoder_target_data[j][t][idx] = 1
            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

In [20]:
# Creating emmbedding object
embedding_output_dimension = 128
enc_embedding = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension,
                                #input_length=max_length,
                                mask_zero=True)

In [21]:
# Create encoder
lstm_units = 128
encoder_inputs = layers.Input(shape=(1,), dtype=tf.string)
encoder_vector = text_vectorizer(encoder_inputs)
enc_emd = enc_embedding(encoder_vector)
encoder_lstm = layers.LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emd)
encoder_states = [state_h, state_c]

In [22]:
# create embedding layer for decoder
dec_embedding = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=embedding_output_dimension, # 128
                                #input_length=max_length,
                                mask_zero=True)

In [23]:
# Create decoder
decoder_inputs = layers.Input(shape=(None,))
#decoder_vector = text_vectorizer(decoder_inputs)
dec_emb = dec_embedding(decoder_inputs)
decoder_lstm = layers.LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = layers.Dense(max_vocab_length, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model_train = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [24]:
model_train.compile(loss='categorical_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=['accuracy'])

In [25]:
#model_train.summary()

In [26]:
training_batch = batch_data_generator(train_inputs, train_vector_targets, vocab_list=text_vectorizer.get_vocabulary(), batch_size=32)
test_batch = batch_data_generator(test_inputs, test_vector_targets, vocab_list=text_vectorizer.get_vocabulary(), batch_size=32)

In [27]:
model_train_history = model_train.fit(training_batch,
                                        steps_per_epoch=64,
                                        epochs=10,
                                        validation_data=test_batch,
                                        validation_steps=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# decoder at test time
encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

decoder_state_input_h = layers.Input(shape=(lstm_units,))
decoder_state_input_c = layers.Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_embedding(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
vocab_list = text_vectorizer.get_vocabulary()
vocab_list[:5]

['', '[UNK]', '_END', 'START_', 'you']

In [None]:
def chat(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0][0] = vocab_list.index('START_')
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq]+states_value)
        print(output_tokens[0, -1, :].shape)
        print(output_tokens)
        sampled_token_index = np.argmax(output_tokens[0, -1, :], axis=0)
        print(sampled_token_index)
        sampled_char = vocab_list[sampled_token_index]
        print(sampled_char)
        decoded_sentence += ' '+sampled_char
        if len(decoded_sentence.split())>19:
            stop_condition = True
        target_seq = np.zeros((1,1))
        target_seq[0][0] = sampled_token_index
        states_value = [h,c]
    return decoded_sentence

In [None]:
test_text = batch_data_generator(train_inputs, train_vector_targets, vocab_list=text_vectorizer.get_vocabulary(), batch_size=1)
(input_seq, actual_out), encoded_out = next(test_text)

In [None]:
input_seq, actual_out, encoded_out[0][4]

(array(['what are you talking about'], dtype=object),
 array([[   3,   18,  216,   43,    6,  183,   11,  119,    4,  655,   47,
            1, 2376,   49,    1, 1531,   11,  245,    0,    0]]),
 array([0, 0, 0, ..., 0, 0, 0]))

In [None]:
train_inputs[0], train_targets[0]

('what are you talking about',
 'START_ im talking about the kind of people you hang out withabout growing up assuming responsibility of yourself _END')

In [34]:
chat(input_seq)

(5000,)
[[[3.3183996e-02 2.3041398e-04 3.9212622e-07 ... 7.8432358e-06
   1.2928490e-06 5.2851087e-06]]]
5
i
(5000,)
[[[6.3329756e-02 3.7266460e-04 8.1085312e-07 ... 1.1807371e-05
   9.8221126e-06 1.2456473e-05]]]
17
dont
(5000,)
[[[7.4662186e-02 5.9135945e-04 9.7996383e-07 ... 1.3502102e-05
   1.1820738e-05 1.5076335e-05]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.00020026 0.00019385]]]
0

(5000,)
[[[0.00022741 0.00021205 0.0001944  ... 0.00020147 0.

KeyboardInterrupt: 