<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Model5_GLOVE_Sliding_window_bidirectional_with_return_seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model to generate a sequence of following words:
1. Convert the data into token list
2. Convert data to token sentences with sliding windows
3. Encode the sentence
4. Simple LSTM model
5. Create a bidirectional model
6. Add Glove word embeddings

- Makes sentences sequence from the whole token list
- Make a sliding window of size 5 each

Reference:

https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

#import spacy, and spacy french model
# spacy is used to work on text

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

In [2]:
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

data = pd.DataFrame(columns = ['character', 'dialogue'])

for file in os.listdir(path_to_file):
    print(file)
    df = pd.read_csv(path_to_file+file)
    data = data.append(df, ignore_index=True)

data['character'] = data["character"].str.lower()

data['character'] = data.character.replace("anakin", "vader", regex=True)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [3]:
def preprocess_text(sen):

    # Remove numbers
    sentence = re.sub(" \d+", " ", sen)

    # # Single character removal
    # sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # Remove ....
    sentence = re.sub('\.+', ' ', sentence)

    # Remove punctuations
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)

    # Lower case
    sentence = sentence.lower()

    # Return a list of tokens (words)
    sentence = sentence.split()

    return sentence

In [4]:
obi_wan_tokens = [preprocess_text(row) for row in data_dict['ben']]
print(obi_wan_tokens[:5])

obi_wan_data = [' '.join(row) for row in obi_wan_tokens]
print(obi_wan_data)

obi_wan_token_list = [item for sublist in obi_wan_tokens for item in sublist]
print(obi_wan_token_list)

print("Total # of tokens(words)")
print(len(obi_wan_token_list))

print("Total # of unique tokens(words)")
print(len(set(obi_wan_token_list)))

[['i', 'have', 'a', 'bad', 'feeling', 'about', 'this'], ['its', 'not', 'about', 'the', 'mission', 'master', 'its'], ['master', 'yoda', 'says', 'i', 'should', 'be', 'mindful', 'of', 'the', 'future'], ['yes', 'master', 'how', 'do', 'you', 'think', 'the', 'trade', 'viceroy', 'will', 'deal', 'with'], ['offhand', 'id', 'say', 'this', 'mission', 'is', 'past', 'the', 'negotiaion', 'stage']]
Total # of tokens(words)
6196
Total # of unique tokens(words)
1450


In [5]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(obi_wan_token_list)
unique_words = set(obi_wan_token_list)
sequences_tokenised = tokenizer.texts_to_sequences(obi_wan_tokens)

vocab_size = len(unique_words)+1
n_sentences = len(obi_wan_tokens)

In [6]:
print(unique_words)
print(sequences_tokenised)

print(vocab_size)
print(n_sentences)

[[4, 12, 5, 275, 276, 91, 15], [29, 13, 91, 1, 277, 23, 29], [23, 79, 550, 4, 124, 11, 384, 7, 1, 551], [98, 23, 125, 27, 2, 68, 1, 552, 553, 25, 554, 30], [555, 385, 143, 15, 277, 10, 386, 1, 556, 557], [42, 12, 558, 559], [29, 111, 560, 278], [2, 44, 50, 91, 87, 160, 23, 1, 561, 44], [144, 23, 1, 562, 563, 37, 228], [98, 23], [279, 15], [59, 42, 99, 74, 42, 25, 564, 74, 565, 74, 145, 161], [28, 191, 15, 43, 13, 60, 112, 280, 26, 566, 567], [281, 162, 80, 88, 192, 7, 1, 568, 42, 25, 146], [2, 8, 1, 282, 283, 5, 569, 570, 31, 571, 3, 572, 7], [23, 279, 5, 573], [24, 22, 284, 7, 51, 23], [23, 75, 27, 2, 163, 574, 164, 387, 165, 575, 285], [75, 44, 2, 388, 166, 166], [42, 388, 2, 229, 69, 389], [193, 21, 1, 286], [44, 576, 167], [390, 100], [2, 577, 9], [31, 10, 9], [47, 81, 39], [18, 19, 287], [230, 88, 288, 7, 18, 168, 578, 391, 1, 579, 7], [45, 169, 39, 8, 163, 63, 7, 289], [39, 23, 231, 29, 290, 63, 7, 1, 101, 580, 1], [29, 392, 32, 170, 34, 581], [582, 9, 231], [144, 583, 50, 194, 2

In [7]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

--2021-04-14 20:04:01--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-14 20:04:01--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-14 20:04:02--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [8]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/content/glove.6B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [9]:
print(embedding_matrix.shape)

(1451, 300)


In [10]:
#Create sliding windows
seq_length = 5
sequences_step = 1
dataX = []
dataY = []
for dialogue in sequences_tokenised:
  window = []
  for i in range(len(dialogue)-5):
    dataX.append(dialogue[i:i+5])
    dataY.append(dialogue[i+5])


In [11]:
print(len(dataX))
print(len(dataY))
print(dataY[1450])

3596
3596
72


In [12]:
print(dataX)
print(pd.get_dummies(dataY))

[[4, 12, 5, 275, 276], [12, 5, 275, 276, 91], [29, 13, 91, 1, 277], [13, 91, 1, 277, 23], [23, 79, 550, 4, 124], [79, 550, 4, 124, 11], [550, 4, 124, 11, 384], [4, 124, 11, 384, 7], [124, 11, 384, 7, 1], [98, 23, 125, 27, 2], [23, 125, 27, 2, 68], [125, 27, 2, 68, 1], [27, 2, 68, 1, 552], [2, 68, 1, 552, 553], [68, 1, 552, 553, 25], [1, 552, 553, 25, 554], [555, 385, 143, 15, 277], [385, 143, 15, 277, 10], [143, 15, 277, 10, 386], [15, 277, 10, 386, 1], [277, 10, 386, 1, 556], [2, 44, 50, 91, 87], [44, 50, 91, 87, 160], [50, 91, 87, 160, 23], [91, 87, 160, 23, 1], [87, 160, 23, 1, 561], [144, 23, 1, 562, 563], [23, 1, 562, 563, 37], [59, 42, 99, 74, 42], [42, 99, 74, 42, 25], [99, 74, 42, 25, 564], [74, 42, 25, 564, 74], [42, 25, 564, 74, 565], [25, 564, 74, 565, 74], [564, 74, 565, 74, 145], [28, 191, 15, 43, 13], [191, 15, 43, 13, 60], [15, 43, 13, 60, 112], [43, 13, 60, 112, 280], [13, 60, 112, 280, 26], [60, 112, 280, 26, 566], [281, 162, 80, 88, 192], [162, 80, 88, 192, 7], [80, 8

In [13]:
# X = np.zeros((len(dataX), seq_length, vocab_size), dtype=np.bool)
# y = np.zeros((len(dataX), vocab_size), dtype=np.bool)
# for i, sentence in enumerate(dataX):
#   for t, word in enumerate(sentence):
#     X[i, t, word] = 1
#   y[i, dataY[i]] = 1

In [14]:
# print(X.shape)
# print(y.shape)

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y = np.zeros((len(dataX), vocab_size), dtype=np.bool)
for i, sentence in enumerate(dataX):
  y[i, dataY[i]] = 1

print(y.shape)

X = pad_sequences(dataX, maxlen=5)
print(X.shape)

(3596, 1451)
(3596, 5)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, GRU, Flatten
def bidirectional_lstm_model(seq_length, vocab_size):
    # print('Build LSTM model.')
    # model = Sequential()
    # model.add(Embedding(vocab_size,300,weights=[embedding_matrix],input_length=5,trainable=False))
    # model.add(Bidirectional(LSTM(rnn_size,dropout=0.1,recurrent_dropout=0.1, activation="relu"),input_shape=(seq_length, vocab_size)))
    # model.add(Dropout(0.6))
    # model.add(Dense(vocab_size))
    # model.add(Activation('softmax'))
    
    # optimizer = Adam(lr=learning_rate)
    # callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    # model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    # print("model built!")

    embedding_layer = Embedding(vocab_size,
                                300,
                                weights=[embedding_matrix],
                                input_length=5,
                                trainable=False)
    inp = Input(shape=(5,))
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(200,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(vocab_size,activation='relu')(x)
    x = Dense(vocab_size,activation='sigmoid')(x)
    model = Model(inputs=inp,outputs=x)
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    #call the functions in the metrics 
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])

    return model

In [17]:
rnn_size = 256 # size of RNN
seq_length = 5 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 300)            435300    
_________________________________________________________________
bidirectional (Bidirectional (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 1451)              581851    
_________________________________________________________________
dense_1 (Dense)              (None, 1451)              2106852   
Total params: 3,925,603
Trainable params: 3,490,303
Non-trainable params: 435,300
_____________________________________________

In [38]:
batch_size = 32 # minibatch size
num_epochs = 100 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath="./" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [39]:
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [40]:
model = load_model("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [41]:
def sample(preds, temperature=2.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [42]:
reverse_word_dict = {v: k for k, v in tokenizer.word_index.items()}

In [50]:

#initiate sentences
generated = ''
# sentence = ['anakin has turned to the']
sentence = dataX[randint(0,len(dataX))]
print(sentence)
sentence = [' '.join([reverse_word_dict[word] for word in sentence])]

generated += sentence[0]

#the, we generate the text
for i in range(5):
  
    seq = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(seq, maxlen=5)
    #calculate next word
    preds = model.predict(padded, verbose=0)[0]
    # print(preds)

    next_index = sample(preds, 0.33)
    next_word = reverse_word_dict[next_index]

    #add the next word to the text
    generated += " " + next_word
    print("generated sentence: ", generated)
    sentence = [' '.join(sentence[0].split()[1:]) + " " + next_word]
    
#print the whole text
print(generated)

[3, 157, 3, 1, 1164]
generated sentence:  to move to the higher levels
generated sentence:  to move to the higher levels the
generated sentence:  to move to the higher levels the force
generated sentence:  to move to the higher levels the force be
generated sentence:  to move to the higher levels the force be with
to move to the higher levels the force be with


wants that is why your -> wants that is why your not to be able to

anakin has turned to the -> anakin has turned to the dark side of the force, anakin has turned to the council on ship its to

must be delivered safely or -> must be delivered safely or other star systems will suffer


In [None]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

In [None]:
gen = ['wants that is why your not to be able to', 'anakin has turned to the dark side of the force']
ref = obi_wan_tokens

In [None]:
print(ref)
print(gen[1].split())

['anakin', 'has', 'turned', 'to', 'the', 'dark', 'side', 'of', 'the', 'force']


In [None]:
print('BLEU score -> {}'.format(sentence_bleu(ref, gen[0].split())))

BLEU score -> 0.631196907822589
