<a href="https://colab.research.google.com/github/RubyNixx/machine_learning/blob/main/6_LanguagePreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Basic Word Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Just a simple test: a toy corpus of three short sentences that share
# some words ("I", "and", "ham"), so the encoded output is easy to compare.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions and ham."
]

In [None]:
# Fit a word-level tokenizer on the toy corpus, then encode every sentence
# as a list of integer word ids.
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Show each of the three sentences directly above its encoded form.
for position in range(3):
    print(sentences[position])
    print(sequences[position])

I like eggs and ham.
[1, 4, 5, 2, 3]
I love chocolate and bunnies.
[1, 6, 7, 2, 8]
I hate onions and ham.
[1, 9, 10, 2, 3]


In [None]:
# How to get the word to index mapping?
# The fitted tokenizer exposes it as the `word_index` attribute
# (lower-cased word -> integer id, as the output below shows).
tokenizer.word_index

{'i': 1,
 'and': 2,
 'ham': 3,
 'like': 4,
 'eggs': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'onions': 10}

In [None]:
# use the defaults: all three sequences already contain five ids,
# so the printed array shows no padding at all
data = pad_sequences(sequences)
print(data)

[[ 1  4  5  2  3]
 [ 1  6  7  2  8]
 [ 1  9 10  2  3]]


In [None]:
# Ask for an explicit length. 5 equals the length of every sequence here,
# so the printed array is the same as with the defaults.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

[[ 1  4  5  2  3]
 [ 1  6  7  2  8]
 [ 1  9 10  2  3]]


In [None]:
# padding='post' places the zeros after the word ids
# (see the trailing 0s in the printed rows).
data = pad_sequences(sequences, maxlen=10, padding='post')
print(data)

[[ 1  4  5  2  3  0  0  0  0  0]
 [ 1  6  7  2  8  0  0  0  0  0]
 [ 1  9 10  2  3  0  0  0  0  0]]


In [None]:
# too much padding: with maxlen=20 and no `padding` argument the zeros are
# added in front, so each printed row is fifteen 0s followed by the five ids
data = pad_sequences(sequences, maxlen=20)
print(data)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  4  5  2  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  6  7  2  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  9 10  2  3]]


### Examples of stemming and lemmatization

In [None]:
# Ask the user for five words, one prompt at a time and in this order.
prompts = ('Word 1: ', 'Word 2: ', 'Word 3: ', 'Word 4: ', 'Word 5: ')
word1, word2, word3, word4, word5 = map(input, prompts)

Word 1: house
Word 2: cat
Word 3: caterpillar
Word 4: hungry
Word 5: fiddle


In [None]:
# Gather the five entered words into one list so later cells can loop over them.
words = [word1, word2, word3, word4, word5]

#### Stemming

In [None]:
# Import NLTK and the Porter stemmer.
import nltk

# Import only the class that is used: a wildcard import would pull every
# public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

# Print each word next to its Porter stem.
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

house --> hous
cat --> cat
caterpillar --> caterpillar
hungry --> hungri
fiddle --> fiddl


In [None]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball stemmer is constructed with an explicit language argument.
s_stemmer = SnowballStemmer(language='english')

# Print each word next to its Snowball stem.
for word in words:
    print(word, s_stemmer.stem(word), sep=' --> ')

house --> hous
cat --> cat
caterpillar --> caterpillar
hungry --> hungri
fiddle --> fiddl


#### Lemmatization

In [None]:
# Perform standard imports:
import spacy
# Load the 'en_core_web_sm' spaCy model (assumes it is already installed - TODO confirm).
nlp = spacy.load('en_core_web_sm')
def show_lemmas(text):
    """Print one row per token of `text`.

    `text` is iterated directly; every item must provide the attributes
    `text`, `pos_`, `lemma` and `lemma_` (in this notebook the argument is
    the object returned by `nlp(...)`).

    Columns, in order: token text (padded to 12), `pos_` (padded to 6),
    the integer `lemma` value (left-aligned, padded to 22) and `lemma_`.
    Returns None; the rows are written with `print`.
    """
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [None]:
# Run the example sentence through the loaded pipeline and list its lemmas.
example_sentence = "I saw many bikes on the road the other day"
doc = nlp(example_sentence)
show_lemmas(doc)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
many         ADJ    9720044723474553187    many
bikes        NOUN   16029548483725639901   bike
on           ADP    5640369432778651323    on
the          DET    7425985699627899538    the
road         NOUN   13540101588783668053   road
the          DET    7425985699627899538    the
other        ADJ    1176656782636220709    other
day          NOUN   1608482186128794349    day


### Example on a Larger Data Set - Bag-of-Words Method

In [None]:
import re # to preprocess the text
from google.colab import files

# Opens the Colab upload dialog. Upload movie_lines.txt and
# movie_conversations.txt here; if they are missing, the reads below
# raise FileNotFoundError.
files.upload()


def _read_lines(path):
  """Return the content of the text file at `path`, split on newlines.

  The file is opened inside a `with` block so the handle is always closed,
  even if reading fails.
  """
  # errors='ignore' skips bytes that cannot be decoded as UTF-8 instead of raising
  with open(path, encoding='utf-8', errors='ignore') as handle:
    return handle.read().split('\n')


lines = _read_lines('movie_lines.txt')
conversations = _read_lines('movie_conversations.txt')

FileNotFoundError: [Errno 2] No such file or directory: 'movie_lines.txt'

In [None]:
# We can check the structure of the lists - lines, conversations.
# Only the first few entries are displayed rather than the whole list,
# to keep the cell output readable.
lines[:5]

# We can see that each entry of lines carries an index (id) field, and
# each conversations entry holds a list of references to those id values.

In [None]:
# Build a Python dictionary that maps each line id to the text of that line.
# This is the first step towards a supervised ML dataset (inputs mapped to
# outputs); several data cleaning steps follow in later cells.

# Every record is split on the ' +++$+++ ' separator. Only records with
# exactly 5 fields are kept: field 0 is the id, field 4 is the text.
split_records = (record.split(' +++$+++ ') for record in lines)
id_to_line = {
    fields[0]: fields[4]
    for fields in split_records
    if len(fields) == 5
}

# Preview a handful of entries instead of dumping the whole dictionary
# into the cell output.
list(id_to_line.items())[:5]

In [None]:
# Turn every conversation record into a list of the line ids it refers to.
# The final element of `conversations` is left out.

conversations_ids = []

for record in conversations[:-1]:
  # the ids live in the last ' +++$+++ '-separated field
  id_field = record.split(' +++$+++ ')[-1]
  # drop the first and last character (the square brackets),
  # then remove the quotes and the white space
  bare_ids = id_field[1:-1].replace("'","").replace(" ","")
  conversations_ids.append(bare_ids.split(","))

In [None]:
# Build the inputs and outputs of a supervised ML dataset.

# Within a conversation every line acts as the "question" for the line
# that follows it, and that following line is its "answer".
# The ids are translated to text through the id_to_line dictionary.

questions = []
answers = []

for line_ids in conversations_ids:
  # walk over neighbouring pairs: (1st, 2nd), (2nd, 3rd), ...
  for asked_id, replied_id in zip(line_ids, line_ids[1:]):
    questions.append(id_to_line[asked_id])
    answers.append(id_to_line[replied_id])

Please note this next cell ***may return offensive language*** due to the nature of the dataset used.  This is a widely used dataset in the field of Natural Language Processing.

Simply re-run the cell to generate a different random sample.

In [None]:
# Show a few consecutive questions starting at a random position.

# The data comes from a large collection of real movie scripts that is
# widely used for teaching NLP, so the sample may contain swear words
# or offensive language.

# The point is to see what the data looks like before it is processed.

from random import randrange

x = randrange(0, len(questions))
questions[x:x + 5]

In [None]:
# Text normalisation: lower-case, expand common contractions,
# and strip everything that is not a letter or a digit.

def clean_text(text):
  """Return a normalised copy of `text`.

  The text is lower-cased, then each (pattern, replacement) pair below is
  applied with `re.sub` in the listed order. The last pair turns every run
  of characters other than letters and digits into a single space, so the
  result may begin or end with a space.
  """
  substitutions = (
      (r"i'm", "i am"),
      (r"he's", "he is"),
      (r"she's", "she is"),
      (r"that's", "that is"),
      (r"what's", "what is"),
      (r"where's", "where is"),
      (r"\'ll", " will"),
      (r"\'ve", " have"),
      (r"\'re", " are"),
      (r"\'d", " would"),
      (r"won't", "will not"),
      (r"can't", "can not"),
      (r"[^a-zA-Z0-9]+", " "),  # remove special characters
  )
  cleaned = text.lower()
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [None]:
# Normalise every question with clean_text, keeping the original order.
clean_questions = [clean_text(raw_question) for raw_question in questions]

In [None]:
# Normalise every answer with clean_text, keeping the original order.
clean_answers = list(map(clean_text, answers))

In [None]:
# check our clean lists

# notice the difference before and after.
# `x` is the random start index drawn in the sampling cell above; two
# questions and one answer are printed, the raw text first and the
# cleaned text second.

print(questions[x:x+2])
print(clean_questions[x:x+2])
print("---------------------------------")
print(answers[x:x+1])
print(clean_answers[x:x+1])

In [None]:
# Next we prepare to remove the infrequent and unnecessary words:
# there is no point slowing the training down for words that hardly occur.

# To optimise the vocabulary we focus on the most commonly used words,
# so we first count how often every word occurs in the corpus.

# word -> number of occurrences, over the questions and then the answers
word_hist = {}

for corpus in (clean_questions, clean_answers):
  for sentence in corpus:
    for word in sentence.split():
      # a word seen for the first time starts at 1, otherwise add to its count
      word_hist[word] = word_hist.get(word, 0) + 1

In [None]:
# how many words do we have in our corpus?
# (number of distinct words counted over the cleaned questions and answers)

len(word_hist)

In [None]:
# Tokenization: give every sufficiently frequent word its own integer.
# Words below the threshold are excluded from the mapping.

threshold = 20   # a word must occur at least this often to be kept

# words that pass the filter, in the order they appear in word_hist
kept_words = [word for word, count in word_hist.items() if count >= threshold]

# dictionary of words to integers: ids are 0, 1, 2, ... in that order
question_words_to_int = {word: index for index, word in enumerate(kept_words)}

word_number = len(question_words_to_int)  # one past the last id handed out

In [None]:
# Same word -> integer mapping, built for the answers side.
answer_words_to_int = {}

for word, count in word_hist.items():
  if count < threshold:
    continue  # too rare, leave it out
  # the current size is the next free id: 0, 1, 2, ...
  answer_words_to_int[word] = len(answer_words_to_int)

word_number = len(answer_words_to_int)  # one past the last id handed out

We now have a dictionary (vector) with a unique integer assigned to each word.

In [None]:
# Add the special tokens for the encoder / decoder.

# <SOS> and <EOS> mark the start and end of a sequence, and <OUT> stands in
# for the infrequent words that were filtered out of the vocabulary.

tokens = ['<PAD>', '<EOS>', '<OUT>','<SOS>']

# Add them to both dictionaries.
# Word ids run from 0 to len(vocabulary) - 1, so len(vocabulary) is always
# the next free id and the ids stay contiguous (no skipped value).
# The membership check makes the cell safe to re-run: without it, a second
# run would give all four tokens the same id, because the dictionary no
# longer grows once the keys exist.
for vocabulary in (question_words_to_int, answer_words_to_int):
  for token in tokens:
    if token not in vocabulary:
      vocabulary[token] = len(vocabulary)

In [None]:
# Build the reverse lookup (integer -> word) of answer_words_to_int.

# The Seq2Seq model needs this inverse mapping; inverting a dictionary
# is a common step in many ML models.

answer_ids = answer_words_to_int.values()
answer_words = answer_words_to_int.keys()
inverse_ans_int_to_word = dict(zip(answer_ids, answer_words))

In [None]:
# The decoder needs an <EOS> token marking the end of every answer;
# the model cannot work without it.

eos_suffix = ' <EOS>' # the leading space keeps the token a separate word

# Update the list in place. Answers that already end with the token are left
# alone, so re-running this cell does not append a second <EOS>.
for i, answer in enumerate(clean_answers):
  if not answer.endswith(eos_suffix):
    clean_answers[i] = answer + eos_suffix

In [None]:
# check that this has been added...
# (a short sample is enough - no need to print every answer)

clean_answers[:5]

We now need to translate all our questions and answers into integers.

We also need to replace all the words that we filtered out (those below the threshold) with the `<OUT>` token.

In [None]:
# Encode every cleaned question as a list of integers (a numerical vector)
# using the question_words_to_int dictionary.

# A word that is not in the dictionary was filtered out earlier and is
# encoded with the id of the <OUT> token instead.

questions_to_integers = [
    [
        question_words_to_int[word if word in question_words_to_int else '<OUT>']
        for word in question.split()
    ]
    for question in clean_questions
]

In [None]:
# Encode every cleaned answer as a list of integers; words missing from
# answer_words_to_int are encoded with the id of the <OUT> token.
answers_to_integers = []

for answer in clean_answers:
  lookup_keys = (
      word if word in answer_words_to_int else '<OUT>'
      for word in answer.split()
  )
  answers_to_integers.append([answer_words_to_int[key] for key in lookup_keys])

In [None]:
# Order the question/answer pairs by the length of the question.
# Similar lengths end up next to each other, which reduces the padding
# needed and so speeds up the training.

# Questions longer than this many words are left out (as are empty ones).
# This is a hyperparameter.
MAX_QUESTION_LENGTH = 25

# positions of the questions whose length is between 1 and the maximum
kept_positions = [
    position
    for position, encoded in enumerate(questions_to_integers)
    if 1 <= len(encoded) <= MAX_QUESTION_LENGTH
]

# list.sort is stable: questions of equal length keep their original order
kept_positions.sort(key=lambda position: len(questions_to_integers[position]))

# the answers are re-ordered with the same positions, so pairs stay aligned
sorted_clean_questions = [questions_to_integers[position] for position in kept_positions]
sorted_clean_answers = [answers_to_integers[position] for position in kept_positions]


In [None]:
# Preview the first two encoded questions of the sorted list.
# `x` is not used here: it was drawn as an index into `questions`, whereas
# this list is filtered and re-ordered, so `x` may lie past its end and the
# slice would then come back empty.
sorted_clean_questions[:2]