### Basic Word Tokenization

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Three toy sentences used to demonstrate the Keras tokenizer
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions and ham.",
]

In [3]:
# Fit a tokenizer on the toy corpus and encode each sentence as a list of word ids
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Show the first three sentences, each followed by its integer encoding
for sentence, sequence in zip(sentences[:3], sequences[:3]):
    print(sentence)
    print(sequence)

I like eggs and ham.
[1, 4, 5, 2, 3]
I love chocolate and bunnies.
[1, 6, 7, 2, 8]
I hate onions and ham.
[1, 9, 10, 2, 3]


In [4]:
# How to get the word to index mapping?
# `word_index` is the word -> integer dictionary built by fit_on_texts();
# as the output shows, the keys are lower-cased and the ids start at 1, not 0.
tokenizer.word_index

{'i': 1,
 'and': 2,
 'ham': 3,
 'like': 4,
 'eggs': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'onions': 10}

In [5]:
# use the defaults
# (no maxlen / padding arguments; every sequence here already has 5 ids,
# so the printed array contains no padding zeros)
data = pad_sequences(sequences)
print(data)

[[ 1  4  5  2  3]
 [ 1  6  7  2  8]
 [ 1  9 10  2  3]]


In [6]:
# Explicit maximum length: 5 equals the length of every sequence here,
# so the printed result is the same as with the default call
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

[[ 1  4  5  2  3]
 [ 1  6  7  2  8]
 [ 1  9 10  2  3]]


In [7]:
# maxlen=10 is longer than the sequences; padding='post' places the
# zeros after the word ids (see the printed array)
data = pad_sequences(sequences, maxlen=10, padding='post')
print(data)

[[ 1  4  5  2  3  0  0  0  0  0]
 [ 1  6  7  2  8  0  0  0  0  0]
 [ 1  9 10  2  3  0  0  0  0  0]]


In [8]:
# too much padding
# (no padding argument is given here; the printed array shows the zeros
# placed in front of the word ids)
data = pad_sequences(sequences, maxlen=20)
print(data)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  4  5  2  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  6  7  2  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  9 10  2  3]]


### Examples of stemming and lemmatization

In [9]:
# Prompt for five words (in order) to run through the stemmers further down
word1, word2, word3, word4, word5 = [input(f'Word {n}: ') for n in range(1, 6)]

Word 1: test
Word 2: test
Word 3: test
Word 4: test
Word 5: test


In [10]:
# Collect the five entered words so the stemmer cells can loop over them
words = [word1, word2, word3, word4, word5]

#### Stemming

In [11]:
# Import NLTK and the Porter stemmer
import nltk

# Import the class explicitly rather than with `import *`, so it is clear
# where PorterStemmer comes from and no other names from the module leak
# into the notebook namespace.
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

# Print every input word next to its Porter stem
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

test --> test
test --> test
test --> test
test --> test
test --> test


In [12]:
from nltk.stem.snowball import SnowballStemmer

# SnowballStemmer has to be told which language's rules to apply
s_stemmer = SnowballStemmer(language='english')

# Same report as for the Porter stemmer: word --> stem
for word in words:
    stem = s_stemmer.stem(word)
    print(word+' --> '+stem)

test --> test
test --> test
test --> test
test --> test
test --> test


#### Lemmatization

In [13]:
# Perform standard imports:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'
# (presumably the small English model; it has to be available in the environment)
nlp = spacy.load('en_core_web_sm')
def show_lemmas(text):
    """Print one aligned row per token of `text`.

    `text` is any iterable of token-like objects exposing `text`, `pos_`,
    `lemma` and `lemma_` (e.g. a spaCy Doc). Columns, separated by a single
    space: token text (width 12), `pos_` tag (width 6), the integer `lemma`
    value (left-aligned, width 22) and the `lemma_` string. Returns None.
    """
    for tok in text:
        columns = (
            f'{tok.text:12}',
            f'{tok.pos_:6}',
            f'{tok.lemma:<22}',
            f'{tok.lemma_}',
        )
        print(' '.join(columns))

In [14]:
# Run the loaded spaCy pipeline on an example sentence, then print each
# token with its part-of-speech tag and lemma
doc = nlp(u"I saw many bikes on the road the other day")

show_lemmas(doc)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
many         ADJ    9720044723474553187    many
bikes        NOUN   16029548483725639901   bike
on           ADP    5640369432778651323    on
the          DET    7425985699627899538    the
road         NOUN   13540101588783668053   road
the          DET    7425985699627899538    the
other        ADJ    1176656782636220709    other
day          NOUN   1608482186128794349    day


### Example on a Larger Data Set - Bag-of-Words Method

In [16]:
import re # to preprocess the text
from google.colab import files
files.upload()  # Colab-only: asks the user to upload the two corpus files

# Read both corpus files into lists of raw lines.
# `with` closes each file handle as soon as it has been read (the handles
# were previously left open). errors='ignore' skips bytes that are not
# valid UTF-8 instead of raising.
with open('movie_lines.txt', encoding='utf-8',errors='ignore') as lines_file:
  lines = lines_file.read().split('\n')

with open('movie_conversations.txt', encoding='utf-8',errors='ignore') as conversations_file:
  conversations = conversations_file.read().split('\n')

Saving movie_conversations.txt to movie_conversations.txt
Saving movie_lines.txt to movie_lines (1).txt


In [17]:
# we can check the structure of the lists - lines, conversations
lines

# We can see the lines list has index column
# the conversations list has a vector with references to those index values

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?',
 'L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".',
 'L867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?',
 "L866 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I figured yo

In [18]:
# Build a lookup table from line id (e.g. 'L1045') to the spoken text.
# It is used afterwards to turn conversations (lists of line ids) into
# input/output sentence pairs for a supervised dataset; the text itself
# still needs cleaning later on.

id_to_line = {}

for line in lines:
  _line = line.split(' +++$+++ ')
  if len(_line) != 5:
    continue  # not a complete 5-field record (e.g. an empty line): skip it
  _line_id, _line_text = _line[0], _line[4]
  id_to_line[_line_id] = _line_text

id_to_line

{'L1045': 'They do not!',
 'L1044': 'They do to!',
 'L985': 'I hope so.',
 'L984': 'She okay?',
 'L925': "Let's go.",
 'L924': 'Wow',
 'L872': "Okay -- you're gonna need to learn how to lie.",
 'L871': 'No',
 'L870': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': 'Like my fear of wearing pastels?',
 'L868': 'The "real you".',
 'L867': 'What good stuff?',
 'L866': "I figured you'd get to the good stuff eventually.",
 'L865': 'Thank God!  If I had to hear one more story about your coiffure...',
 'L864': "Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': 'What crap?',
 'L862': 'do you listen to this crap?',
 'L861': 'No...',
 'L860': 'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': 'You always been this selfish?',
 'L698': 'But',
 'L697': "Then that's all you had to say.",
 'L696': 'Well, no...',
 'L695': "You never wanted to go out with 'me, did y

In [19]:
# Turn every conversation record into a plain list of line ids

conversations_ids = []

for conversation in conversations[:-1]:  # the final entry is left out

  # Keep only the last ' +++$+++ ' field; it is expected to hold a bracketed,
  # quoted, comma-separated list of line ids.
  _conversation = conversation.split(' +++$+++ ')[-1]
  _conversation = _conversation[1:-1]  # drop the first and last character (the square brackets)
  _conversation = _conversation.replace("'","").replace(" ","")  # drop quotes and spaces
  conversations_ids.append(_conversation.split(","))

In [20]:
# Build the supervised dataset: inputs (questions) and outputs (answers).

# Within a conversation every line is treated as the "question" for the
# line that follows it, so each pair of consecutive line ids yields one
# (question, answer) example. The ids are translated to text via id_to_line.


questions = []
answers = []

for conversation in conversations_ids:
  for question_id, answer_id in zip(conversation, conversation[1:]):
    questions.append(id_to_line[question_id])
    answers.append(id_to_line[answer_id]) # the line that follows the question

Please note this next cell ***may return offensive language*** due to the nature of the dataset used.  This is a widely used dataset in the field of Natural Language Processing.

Simply re-run the cell to generate a different random sample.

In [21]:
# Show five consecutive raw questions starting at a random position.

# NOTE: the sample is drawn from a large corpus of real movie scripts that
# is widely used for teaching NLP, so it may contain swear words or
# otherwise offensive language.

# The point is to see what the data looks like before any cleaning.

from random import randrange

x = randrange(len(questions))  # random start index; later cells reuse x
questions[x:x+5]

['What is so incredibly great about New York? It\'s a dying city! You-you read "Death in Venice".',
 'You didn\'t read "Death in Venice" till I gave it to you!',
 "It's an important issue.",
 "You're like New York. You're an island.",
 'Excuse... excuse me, when do I go on?']

In [22]:
# Text normalisation: lower-case, expand a handful of common contractions,
# then replace everything that is not a letter or digit with a space.

def clean_text(text):
  """Return a lower-cased, contraction-expanded, punctuation-free copy of `text`.

  The substitutions are applied one after another in the order listed.
  Contractions that are not in the table (e.g. "didn't", "it's") only lose
  their apostrophe, which becomes a space ("didn t", "it s"). Because runs of
  non-alphanumeric characters turn into a single space, text ending in
  punctuation comes back with a trailing space.
  """
  substitutions = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"[^a-zA-Z0-9]+", " "),  # must stay last: it removes the apostrophes
  )
  cleaned = text.lower()
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [23]:
# Normalise every question with clean_text (order is preserved)
clean_questions = [clean_text(question) for question in questions]

In [24]:
# Normalise every answer with clean_text (order is preserved)
clean_answers = [clean_text(answer) for answer in answers]

In [25]:
# check our clean lists

# notice the difference before and after.
# x is the random start index drawn in the sampling cell. Two questions but
# only one answer are printed; answers[x] is the reply paired with questions[x].

print(questions[x:x+2])
print(clean_questions[x:x+2])
print("---------------------------------")
print(answers[x:x+1])
print(clean_answers[x:x+1])

['What is so incredibly great about New York? It\'s a dying city! You-you read "Death in Venice".', 'You didn\'t read "Death in Venice" till I gave it to you!']
['what is so incredibly great about new york it s a dying city you you read death in venice ', 'you didn t read death in venice till i gave it to you ']
---------------------------------
['You didn\'t read "Death in Venice" till I gave it to you!']
['you didn t read death in venice till i gave it to you ']


In [26]:
# Next step: get rid of infrequent and unnecessary words, so that training
# is not slowed down by words that hardly ever occur.

# To optimise the vocabulary we focus on the most commonly used words in
# the corpus, which requires knowing how often each word appears.

# word_hist: word -> number of occurrences over all cleaned questions and
# answers (questions are counted first, then answers)

word_hist = {}

for corpus in (clean_questions, clean_answers):
  for sentence in corpus:
    for word in sentence.split():
      # unseen words start from 0, known words continue from their count
      word_hist[word] = word_hist.get(word, 0) + 1

In [27]:
# how many words do we have in our corpus?
# (the number of distinct words counted in word_hist, before any frequency filtering)

len(word_hist)

49322

In [28]:
# Tokenization: give every sufficiently frequent word its own integer id.

# Words that occur fewer than `threshold` times are left out of the vocabulary.

threshold = 20   # minimum number of occurrences for a word to be kept

# Words that pass the filter, in word_hist order
frequent_words = [word for word, count in word_hist.items() if count >= threshold]

# word -> id, with ids running 0, 1, 2, ... in that same order
question_words_to_int = {word: index for index, word in enumerate(frequent_words)}

word_number = len(frequent_words)  # the next unused id

In [29]:
# Answer-side vocabulary: built from the same word_hist with the same
# `threshold`, so it starts out with the same word -> id pairs as the
# question-side dictionary.
answer_words_to_int = {}

word_number = 0  # next id to hand out

for word, count in word_hist.items():
  if count < threshold:
    continue  # too rare: not part of the vocabulary
  answer_words_to_int[word] = word_number
  word_number += 1

We now have two dictionaries (one for questions, one for answers) that assign a unique integer to every word that meets the frequency threshold.

In [30]:
# Add the special tokens needed by the encoder / decoder

# <SOS> / <EOS> mark the start / end of a sentence, <PAD> is for padding,
# and <OUT> stands in for the infrequent words that were filtered out.

tokens = ['<PAD>', '<EOS>', '<OUT>','<SOS>']

# Add them to both dictionaries.
# Each new token gets the id len(vocab) + 1. Word ids run from 0 to
# len(vocab) - 1, so the id equal to the original vocabulary size stays unused.
# The membership check keeps the cell safe to re-run: without it a second
# run would reassign all four tokens to the same id, because the dictionary
# no longer grows.

for vocab in (question_words_to_int, answer_words_to_int):
  for token in tokens:
    if token not in vocab:
      vocab[token] = len(vocab) + 1

In [31]:
# Inverse lookup for the answer vocabulary: integer id -> word.

# The Seq2Seq model needs this reverse mapping; inverting a dictionary
# like this is a common step in many ML pipelines.

inverse_ans_int_to_word = {
  word_id: word
  for word, word_id in answer_words_to_int.items()
}

In [32]:
# we need to add the EOS token to signal end of every answer for the decoder
# loop over our clean answers and add the EOS token (updates the list in place)

EOS_SUFFIX = ' <EOS>' # leading space keeps the token a separate word

for i in range(len(clean_answers)):
  # Only append when the marker is missing, so re-running this cell does
  # not stack several <EOS> tokens onto the same answer.
  if not clean_answers[i].endswith(EOS_SUFFIX):
    clean_answers[i] += EOS_SUFFIX


# we have to specify the EOS for the model to work

In [33]:
# check that this has been added...
# (answers that ended in punctuation already finish with a space after
# cleaning, which is why two spaces appear before <EOS> in the output)

clean_answers

['well i thought we would start with pronunciation if that is okay with you  <EOS>',
 'not the hacking and gagging and spitting part please  <EOS>',
 'okay then how bout we try out some french cuisine saturday night  <EOS>',
 'forget it  <EOS>',
 'cameron  <EOS>',
 'the thing is cameron i am at the mercy of a particularly hideous breed of loser my sister i can not date until she does  <EOS>',
 'seems like she could get a date easy enough  <EOS>',
 'unsolved mystery she used to be really popular when she started high school then it was just like she got sick of it or something  <EOS>',
 'that is a shame  <EOS>',
 'let me see what i can do  <EOS>',
 'right see you are ready for the quiz  <EOS>',
 'i don t want to know how to say that though i want to know useful things like where the good stores are how much does champagne cost stuff like chat i have never in my life had to point out my head to someone  <EOS>',
 'that is because it s such a nice one  <EOS>',
 'forget french  <EOS>',
 'we

We now need to translate all our questions and answers into integers.

We also need to replace all the words that we filtered out (below the threshold) with the 'OUT' token

In [34]:
# Encode every cleaned question as a list of vocabulary ids, turning the
# text into numerical vectors.

# Words that are not in the vocabulary (filtered out by the frequency
# threshold) are replaced by the id of the <OUT> token.

question_out_id = question_words_to_int['<OUT>']

questions_to_integers = [
  [question_words_to_int.get(word, question_out_id) for word in question.split()]
  for question in clean_questions
]

In [35]:
# Encode every cleaned answer (including its <EOS> marker) as a list of ids;
# words missing from the vocabulary become the <OUT> id.
answers_to_integers = []

for answer in clean_answers:
  ints = [
    answer_words_to_int[word] if word in answer_words_to_int else answer_words_to_int['<OUT>']
    for word in answer.split()
  ]
  answers_to_integers.append(ints)

In [36]:
# Order the question/answer pairs by the length of the question. Batches
# then hold sentences of similar length, so less padding is needed, which
# is intended to speed up training.

# Questions that are empty or longer than MAX_QUESTION_LENGTH ids are
# dropped together with their answers. The limit is a hyperparameter.
MAX_QUESTION_LENGTH = 25  # longest question (in words) that is kept

# Positions of the questions to keep, still in their original order
kept_indices = [
  index
  for index, question in enumerate(questions_to_integers)
  if 1 <= len(question) <= MAX_QUESTION_LENGTH
]

# list.sort is stable, so questions of equal length keep their original
# relative order. This matches the former one-pass-per-length loop while
# only touching the data once.
kept_indices.sort(key=lambda index: len(questions_to_integers[index]))

# New lists are built; questions_to_integers / answers_to_integers stay untouched
sorted_clean_questions = [questions_to_integers[index] for index in kept_indices]
sorted_clean_answers = [answers_to_integers[index] for index in kept_indices]


In [37]:
# Peek at two encoded questions. NOTE(review): x was drawn at random against
# len(questions); the sorted list drops some questions, so x can lie past
# its end, in which case this slice is empty.
sorted_clean_questions[x:x+2]

[[19, 52, 920, 68, 495, 14], [19, 52, 1156, 2834, 42, 156]]