<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Model2_Sliding_Window.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook trains a model that generates the words that follow a seed phrase:
1. Convert the data into a list of tokens
2. Split the tokenised dialogue into sliding windows
3. Integer-encode the sentences
4. Train a simple LSTM model

- Builds the training sequences from each tokenised line of dialogue
- Slides a window of 5 tokens over each line; the token that follows the window is the prediction target

Reference:

https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

https://www.kaggle.com/guidant/mimicking-star-wars-characters-using-a-i-rnn#2.-Data-Preparation

https://towardsdatascience.com/simple-text-generation-d1c93f43f340

In [None]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [None]:
# Folder holding one CSV of dialogue per episode.
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and re-copying the growing frame on every iteration is quadratic.
# Sorting makes the row order independent of the order os.listdir returns.
episode_frames = []
for file in sorted(os.listdir(path_to_file)):
    print(file)
    episode_frames.append(pd.read_csv(os.path.join(path_to_file, file)))

if episode_frames:
    data = pd.concat(episode_frames, ignore_index=True)
else:
    # Keep the expected columns so the statements below still run on an empty folder.
    data = pd.DataFrame(columns = ['character', 'dialogue'])

data['character'] = data["character"].str.lower()

# Rename aliases so the lines end up under a single name per character.
# regex=True makes these substring replacements inside the character name.
data['character'] = data.character.replace("anakin", "vader", regex=True)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

# Map each character name to the list of that character's dialogue lines.
data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [None]:
def preprocess_text(sen):
    """Normalise one line of dialogue and split it into lower-case word tokens.

    Steps, in order: drop digit runs that directly follow a space, collapse
    whitespace, replace runs of dots with a space, strip ASCII punctuation,
    lower-case, and split on whitespace.

    Args:
        sen: Raw dialogue text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as empty
            dialogue instead of raising inside ``re.sub``.

    Returns:
        A list of word tokens; empty when nothing is left after cleaning.
    """
    if not isinstance(sen, str):
        return []

    # Remove numbers (only digit runs preceded by a single space).
    # Raw strings avoid the invalid "\d" / "\." escape-sequence warnings.
    sentence = re.sub(r" \d+", " ", sen)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # Remove ....
    sentence = re.sub(r'\.+', ' ', sentence)

    # Remove punctuations
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)

    # Lower case
    sentence = sentence.lower()

    # Return a list of tokens (words)
    return sentence.split()

In [None]:
# Tokenise every line of dialogue stored under the 'ben' key of data_dict.
obi_wan_tokens = [preprocess_text(row) for row in data_dict['ben']]
print(obi_wan_tokens[:5])

# Each dialogue line re-joined into one cleaned string.
obi_wan_data = [' '.join(row) for row in obi_wan_tokens]
# Preview only: printing the full list floods the notebook output.
print(obi_wan_data[:5])

# All tokens of all lines flattened into a single list, in dialogue order.
obi_wan_token_list = [item for sublist in obi_wan_tokens for item in sublist]
print(obi_wan_token_list[:20])

print("Total # of tokens(words)")
print(len(obi_wan_token_list))

print("Total # of unique tokens(words)")
print(len(set(obi_wan_token_list)))

[['i', 'have', 'a', 'bad', 'feeling', 'about', 'this'], ['its', 'not', 'about', 'the', 'mission', 'master', 'its'], ['master', 'yoda', 'says', 'i', 'should', 'be', 'mindful', 'of', 'the', 'future'], ['yes', 'master', 'how', 'do', 'you', 'think', 'the', 'trade', 'viceroy', 'will', 'deal', 'with'], ['offhand', 'id', 'say', 'this', 'mission', 'is', 'past', 'the', 'negotiaion', 'stage']]
Total # of tokens(words)
6196
Total # of unique tokens(words)
1450


In [None]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(obi_wan_token_list)
unique_words = set(obi_wan_token_list)
sequences_tokenised = tokenizer.texts_to_sequences(obi_wan_tokens)

n_vocab = len(unique_words)
n_sentences = len(obi_wan_tokens)

In [None]:
# Preview only: the full vocabulary and every encoded line are too long to print.
print(sorted(unique_words)[:20])
print(sequences_tokenised[:5])

print(n_vocab)
print(n_sentences)

[[4, 12, 5, 275, 276, 91, 15], [29, 13, 91, 1, 277, 23, 29], [23, 79, 550, 4, 124, 11, 384, 7, 1, 551], [98, 23, 125, 27, 2, 68, 1, 552, 553, 25, 554, 30], [555, 385, 143, 15, 277, 10, 386, 1, 556, 557], [42, 12, 558, 559], [29, 111, 560, 278], [2, 44, 50, 91, 87, 160, 23, 1, 561, 44], [144, 23, 1, 562, 563, 37, 228], [98, 23], [279, 15], [59, 42, 99, 74, 42, 25, 564, 74, 565, 74, 145, 161], [28, 191, 15, 43, 13, 60, 112, 280, 26, 566, 567], [281, 162, 80, 88, 192, 7, 1, 568, 42, 25, 146], [2, 8, 1, 282, 283, 5, 569, 570, 31, 571, 3, 572, 7], [23, 279, 5, 573], [24, 22, 284, 7, 51, 23], [23, 75, 27, 2, 163, 574, 164, 387, 165, 575, 285], [75, 44, 2, 388, 166, 166], [42, 388, 2, 229, 69, 389], [193, 21, 1, 286], [44, 576, 167], [390, 100], [2, 577, 9], [31, 10, 9], [47, 81, 39], [18, 19, 287], [230, 88, 288, 7, 18, 168, 578, 391, 1, 579, 7], [45, 169, 39, 8, 163, 63, 7, 289], [39, 23, 231, 29, 290, 63, 7, 1, 101, 580, 1], [29, 392, 32, 170, 34, 581], [582, 9, 231], [144, 583, 50, 194, 2

In [None]:
#Create sliding windows
seq_length = 5
dataX = []
dataY = []
for dialogue in sequences_tokenised:
  window = []
  for i in range(len(dialogue)-5):
    dataX.append(dialogue[i:i+5])
    dataY.append(dialogue[i+5])

In [None]:
# Length of the first training window.
print(len(dataX[0]))

5


In [None]:
# Number of training targets (one per sliding window).
print(len(dataY))

3596


In [None]:
# Shape of the target array: one integer token id per training window.
np.asarray(dataY).shape

(3596,)

In [None]:
# One-hot encode the targets for inspection. pandas creates one column per
# distinct value in dataY, so token ids that never occur as a target get no
# column and the column position is not the token id.
pd.get_dummies(np.asarray(dataY))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,1404,1405,1406,1407,1409,1411,1412,1413,1414,1417,1418,1419,1420,1421,1422,1423,1426,1427,1428,1429,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446,1447,1448,1450
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3592,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3593,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# define model
# Training arrays. get_dummies creates one column per distinct value in dataY,
# in ascending order, so output unit k stands for the k-th smallest target id
# (not for token id k+1); decoding a prediction has to map the argmax through
# np.unique(dataY). Converting to float32 avoids handing Keras the bool frame
# that newer pandas versions return.
train_X = np.asarray(dataX)
train_y = pd.get_dummies(np.asarray(dataY)).to_numpy(dtype='float32')
n_classes = train_y.shape[1]

# Model: embedding -> two stacked LSTMs -> dense head with a softmax over the
# target classes. Sizes are derived from the data instead of being hard-coded,
# so the cell keeps working when the corpus or window size changes.
model_2 = Sequential([
    Embedding(n_vocab+1, 50, input_length=seq_length),  # +1: id 0 is used for padding
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.1),
    Dense(n_classes, activation='softmax')
])

# Train model with checkpoints
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
filepath = "./model_2_weights.hdf5"
# Save the model whenever the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model_2.fit(train_X,
         train_y,
         epochs = 300,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)

Epoch 1/300

Epoch 00001: loss improved from inf to 6.89914, saving model to ./model_2_weights.hdf5
Epoch 2/300

Epoch 00002: loss improved from 6.89914 to 6.14770, saving model to ./model_2_weights.hdf5
Epoch 3/300

Epoch 00003: loss improved from 6.14770 to 5.99249, saving model to ./model_2_weights.hdf5
Epoch 4/300

Epoch 00004: loss improved from 5.99249 to 5.94814, saving model to ./model_2_weights.hdf5
Epoch 5/300

Epoch 00005: loss improved from 5.94814 to 5.90963, saving model to ./model_2_weights.hdf5
Epoch 6/300

Epoch 00006: loss improved from 5.90963 to 5.88460, saving model to ./model_2_weights.hdf5
Epoch 7/300

Epoch 00007: loss improved from 5.88460 to 5.85053, saving model to ./model_2_weights.hdf5
Epoch 8/300

Epoch 00008: loss improved from 5.85053 to 5.80133, saving model to ./model_2_weights.hdf5
Epoch 9/300

Epoch 00009: loss improved from 5.80133 to 5.78271, saving model to ./model_2_weights.hdf5
Epoch 10/300

Epoch 00010: loss improved from 5.78271 to 5.77164, sa

In [None]:
# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_dict = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [None]:
def gen(model, seq, max_len=6, window_size=5, class_ids=None):
    '''Extend the seed text ``seq`` by ``max_len`` predicted words.

    At each step the last ``window_size`` token ids are fed to ``model``
    (left-padded with zeros when fewer are available) and the most probable
    class is appended to the sequence.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            target class for an input of shape (1, window_size).
        seq: Seed text. It is encoded with the notebook-level ``tokenizer``;
            words the tokenizer does not know are dropped.
        max_len: Number of words to generate after the seed.
        window_size: Number of tokens the model was trained on per sample.
        class_ids: Token id represented by each output unit, in output order.
            Defaults to the sorted distinct values of the notebook-level
            ``dataY``, which is the column order ``pd.get_dummies`` produces
            for the training targets.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If the model's output width differs from ``len(class_ids)``.
    '''
    if class_ids is None:
        # Output unit k is the k-th smallest target id, NOT token id k+1:
        # ids that never occur as a target have no output unit.
        class_ids = np.unique(np.asarray(dataY))
    class_ids = np.asarray(class_ids)

    # Tokenize the input string
    token_ids = tokenizer.texts_to_sequences([seq])[0]

    for _ in range(max_len):
        # pad_sequences keeps the last window_size ids and left-pads shorter
        # sequences with zeros, so the input matches the training window.
        padded_window = pad_sequences([token_ids], maxlen=window_size)
        probabilities = model.predict(padded_window, verbose=0)
        if probabilities.shape[-1] != len(class_ids):
            raise ValueError(
                "model predicts %d classes but %d class ids were supplied"
                % (probabilities.shape[-1], len(class_ids)))
        token_ids.append(int(class_ids[probabilities.argmax()]))

    return " ".join(reverse_word_dict[token_id] for token_id in token_ids)

In [None]:
# Generate a continuation of the seed phrase with the trained model.
gen(model_2, 'i got')

'i got in and how distracted as seems'