<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Model1_Basic_Word_level_NLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Word-level language model that, given a seed sequence of words, generates the words that follow:
1. Convert the data into token list
2. Convert data to token sentences
3. Encode the sentence
4. Simple LSTM model

- Builds the training sequences with a sliding window over the whole token list.
- Sentences are not padded, so a sequence can run across sentence boundaries and does not necessarily make sense on its own.

Reference:
https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [None]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load

from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



In [None]:
# Folder containing the dialogue CSV files that are loaded below.
# NOTE(review): this looks like a Colab path on a mounted Google Drive - adjust when running elsewhere.
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

In [None]:
# Start from an empty frame so `data` exists with the two columns used downstream
# ('character' and 'dialogue') before the CSV files are loaded.
data = pd.DataFrame(columns = ['character', 'dialogue'])

In [None]:
# Read every CSV file in the data folder and combine them into a single frame.
# - Files are sorted so the row order is reproducible (os.listdir order is arbitrary).
# - Non-CSV entries (e.g. checkpoint folders) are skipped instead of crashing read_csv.
# - The frames are concatenated once: DataFrame.append was removed in pandas 2.0 and
#   appending inside a loop copies the whole frame on every iteration.
# - `data` is rebuilt from scratch, so re-running this cell does not duplicate rows.
csv_files = sorted(
    name for name in os.listdir(path_to_file) if name.lower().endswith('.csv')
)
frames = []
for file in csv_files:
    print(file)
    frames.append(pd.read_csv(os.path.join(path_to_file, file)))

# With no CSV files present, `data` is left as it was (pd.concat rejects an empty list).
if frames:
    data = pd.concat(frames, ignore_index=True)

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [None]:
# Show the combined frame as the cell output (pandas elides the middle rows of a long frame).
data

Unnamed: 0,character,dialogue
0,OBI-WAN,I have a bad feeling about this.
1,OBI-WAN,"It's not about the mission, Master, it's"
2,OBI-WAN,Master Yoda says I should be mindful of the fu...
3,OBI-WAN,"Yes, Master...how do you think the trade vicer..."
4,OBI-WAN,"Offhand, I'd say this mission is past the nego..."
...,...,...
2899,VADER,Nothing can stop that now. Just for once... le...
2900,ANAKIN,"Now...go, my son. Leave me."
2901,ANAKIN,"You already have, Luke. You were right about m..."
2902,HAN,Lando...


In [None]:
# Normalise speaker names: lower-case them, then fold the aliases onto a single
# name per character (anakin -> vader, obi-wan -> ben, c-3po -> threepio).
data['character'] = (
    data['character']
    .str.lower()
    .replace("anakin", "vader", regex=True)
    .replace("obi-wan", "ben", regex=True)
    .replace("c-3po", "threepio", regex=True)
)

unique_characters = data['character'].unique()

# Map each character name to the list of that character's dialogue lines.
data_dict = {
    name: lines.values.tolist()
    for name, lines in data.groupby('character')['dialogue']
}

* Analysis for OBI-WAN data

In [None]:
def preprocess_text(sen):
    """Clean one line of dialogue and split it into lower-case word tokens.

    Steps, in order:
      1. drop digit runs that directly follow a space ("unit 42" -> "unit");
         digits inside a word such as "r2" are kept,
      2. collapse runs of whitespace,
      3. turn runs of dots ("...") into a space so the words around them stay separate,
      4. delete all remaining ASCII punctuation,
      5. lower-case and split on whitespace.

    Args:
        sen: the raw dialogue string. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as empty text.

    Returns:
        A list of word tokens; empty when there is nothing to tokenise.
    """
    # Missing dialogue: re.sub would raise TypeError on None / NaN.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return []

    # Raw strings throughout: "\d" and "\." are invalid escapes in a normal literal.
    # Remove numbers
    sentence = re.sub(r" \d+", " ", sen)

    # Removing multiple spaces
    sentence = re.sub(r"\s+", " ", sentence)

    # Remove ....
    sentence = re.sub(r"\.+", " ", sentence)

    # Remove punctuations
    sentence = re.sub("[%s]" % re.escape(string.punctuation), "", sentence)

    # Lower case and return a list of tokens (words)
    return sentence.lower().split()

In [None]:
# Tokenise every dialogue line of the character 'ben' (one token list per line).
obi_wan_tokens = list(map(preprocess_text, data_dict['ben']))
print(obi_wan_tokens[:5])

# The same lines again, as cleaned strings.
obi_wan_data = list(map(' '.join, obi_wan_tokens))
print(obi_wan_data)

# Flatten all lines into one continuous token stream.
obi_wan_token_list = []
for tokens in obi_wan_tokens:
    obi_wan_token_list.extend(tokens)
print(obi_wan_token_list)

print("Total # of tokens(words)")
print(len(obi_wan_token_list))

print("Total # of unique tokens(words)")
print(len(set(obi_wan_token_list)))

[['i', 'have', 'a', 'bad', 'feeling', 'about', 'this'], ['its', 'not', 'about', 'the', 'mission', 'master', 'its'], ['master', 'yoda', 'says', 'i', 'should', 'be', 'mindful', 'of', 'the', 'future'], ['yes', 'master', 'how', 'do', 'you', 'think', 'the', 'trade', 'viceroy', 'will', 'deal', 'with'], ['offhand', 'id', 'say', 'this', 'mission', 'is', 'past', 'the', 'negotiaion', 'stage']]
Total # of tokens(words)
6196
Total # of unique tokens(words)
1450


In [None]:
# Number of tokens in each dialogue line, followed by the number of lines.
sent_len_list = list(map(len, obi_wan_tokens))
print(sent_len_list)
print(len(sent_len_list))

# Summary statistics and the most common line length.
from scipy import stats
for summarise in (stats.describe, stats.mode):
    print(summarise(sent_len_list))

[7, 7, 10, 12, 10, 4, 4, 10, 7, 2, 2, 12, 11, 11, 13, 4, 6, 11, 6, 6, 4, 3, 2, 3, 3, 3, 3, 11, 8, 11, 6, 3, 8, 11, 5, 11, 7, 12, 10, 10, 13, 2, 3, 5, 7, 11, 10, 4, 11, 4, 3, 12, 7, 13, 6, 9, 11, 8, 4, 11, 8, 13, 6, 11, 2, 1, 2, 8, 11, 11, 12, 8, 8, 7, 5, 9, 11, 14, 1, 14, 22, 15, 3, 4, 14, 5, 9, 1, 3, 4, 4, 26, 26, 16, 4, 7, 5, 23, 10, 9, 3, 5, 9, 7, 4, 1, 11, 10, 5, 4, 1, 5, 10, 4, 7, 10, 16, 9, 4, 1, 1, 8, 7, 8, 5, 5, 12, 7, 5, 10, 4, 7, 9, 10, 3, 5, 2, 20, 17, 6, 6, 15, 6, 18, 4, 5, 2, 2, 2, 7, 7, 11, 20, 4, 3, 6, 13, 2, 1, 6, 18, 14, 3, 1, 5, 6, 7, 22, 29, 12, 14, 34, 16, 16, 13, 2, 2, 6, 4, 3, 3, 4, 6, 1, 7, 3, 2, 2, 17, 2, 2, 12, 10, 3, 10, 1, 6, 10, 3, 1, 14, 12, 3, 5, 16, 24, 16, 15, 11, 18, 10, 9, 11, 8, 7, 5, 8, 13, 5, 3, 8, 2, 4, 6, 1, 11, 12, 5, 6, 11, 5, 6, 6, 2, 12, 16, 10, 16, 10, 6, 4, 5, 5, 17, 14, 7, 6, 5, 8, 5, 11, 10, 10, 17, 3, 8, 4, 4, 6, 8, 16, 5, 11, 15, 20, 9, 7, 2, 8, 8, 4, 17, 4, 3, 7, 12, 1, 18, 10, 5, 8, 12, 10, 7, 11, 6, 1, 22, 1, 5, 20, 9, 19, 9, 3, 3, 4,

In [None]:
# organize into sequences of tokens
# Each sequence is a sliding window of `length` consecutive tokens joined into one line:
# the first 10 words become the model input and the last word becomes the target.
length = 10 + 1
# `end` is the exclusive end index of the window, so it has to reach
# len(obi_wan_token_list) itself; stopping one short would drop the window that
# ends on the final token.
sequences = [
    ' '.join(obi_wan_token_list[end - length:end])
    for end in range(length, len(obi_wan_token_list) + 1)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 6185


In [None]:
# integer encode sequences of words
# fit the tokenizer on the sequence strings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
# convert every sequence string into a list of integers
sequences_tokenised = tokenizer.texts_to_sequences(sequences)

In [None]:
# vocabulary size: one more than the number of words known to the tokenizer
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

# split every encoded sequence: all but the last word form the input,
# the last word is the target, one-hot encoded over the vocabulary
sequences_tokenised = array(sequences_tokenised)
X = sequences_tokenised[:, :-1]
y = to_categorical(sequences_tokenised[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

1451


In [None]:
# define model
# Embedding (50-dim) -> two stacked LSTMs (100 units each) -> hidden Dense layer ->
# softmax over the vocabulary, i.e. one probability per candidate next word.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    # return_sequences=True passes the full sequence on to the second LSTM
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 50)            72550     
_________________________________________________________________
lstm (LSTM)                  (None, 10, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 1451)              146551    
Total params: 370,001
Trainable params: 370,001
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# compile model: categorical cross-entropy matches the one-hot targets built with to_categorical
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model on all of X / y (no validation data is passed)
model.fit(X, y, batch_size=128, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f31a11b6810>

In [None]:
# save the model to file
model.save('model.h5')
# save the tokenizer; the with-block closes the file, which guarantees the pickle
# is flushed to disk before anything tries to read it back
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# Number of input words fed to the model per prediction.
# NOTE(review): presumably hard-coded so the generation cells can run without the training
# cells; it must equal the input length the model was trained with (X.shape[1]).
seq_length = 10

In [None]:
# load the model
model = load_model('model.h5')
# load the tokenizer; the with-block closes the file handle after reading
# NOTE: unpickling executes arbitrary code - only load a tokenizer.pkl you created yourself
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [None]:
# select a seed text
# random.randint includes BOTH end points, so the upper bound must be the last valid
# index; len(sequences) itself would occasionally raise IndexError.
seed_text = sequences[randint(0, len(sequences) - 1)]
print(seed_text + '\n')

# encode the seed and keep only its last `seq_length` words
encoded = tokenizer.texts_to_sequences([seed_text])[0]
encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

the chancellor does not this assignment is not to be on



In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate `n_words` words that continue `seed_text`.

    Each step encodes the running text, keeps its last `seq_length` words,
    asks the model for the most probable next word and appends that word to
    the running text before predicting again.

    Args:
        model: trained model whose ``predict`` returns one probability per
            vocabulary index for every input row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of input words the model expects.
        seed_text: text the generation starts from.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Build the index -> word lookup once instead of scanning word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word and take the most likely index
        # (predict_classes is no longer available in current Keras; argmax over predict replaces it)
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word; an index with no word yields ''
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 10)
print(generated)



record its became i was once a am not a
