# Natural Language Processing

In [79]:
import tensorflow as tf
from tensorflow import keras

# Shakespeare
# Fetch the corpus and read the whole file into a single string (stext).
surl="https://homl.info/shakespeare"
filepath=keras.utils.get_file("shakespeare.txt",surl)
# Decode explicitly: without an encoding argument, open() falls back to the
# platform/locale default, which makes the notebook behave differently
# across machines.
with open(filepath, encoding="utf-8") as fp:
    stext = fp.read()

In [84]:
# Sanity check on the loaded corpus: total character count, then a short
# preview of the beginning of the text.
print("Text len =", len(stext))
print(stext[:50])

Text len = 1115394
First Citizen:
Before we proceed any further, hear


In [90]:
print("Given array of string, tokenizer chops to words")
# Wrap the corpus in a one-element list so it is handed over as one text.
text_as_array = [stext]
tokenizer = keras.preprocessing.text.Tokenizer(lower=True, char_level=False)
tokenizer.fit_on_texts(text_as_array)   # this is fast and word-level
# word_index maps each token to its integer id; keep the (token, id) pairs.
encoding = tokenizer.word_index.items()
print("Num encoded words =", len(encoding))
# Only show the first 50 characters of the repr to keep the output short.
print("Encoding =", str(encoding)[:50])

Given array of string, tokenizer chops to words
Num encoded words = 12632
Encoding = dict_items([('the', 1), ('and', 2), ('to', 3), ('i


In [91]:
print("Given a string, tokenizer chops to chars regardless of parameter...")
# Gotcha demo: the bare string is passed on purpose instead of a list.
# The resulting vocabulary consists of single characters even though
# char_level=False -- presumably because fit_on_texts iterates over its
# argument, and iterating a str yields one character at a time.
tokenizer = keras.preprocessing.text.Tokenizer(lower=True, char_level=False)
tokenizer.fit_on_texts(stext)
encoding = tokenizer.word_index.items()
print("Num encoded words =", len(encoding))
print("Encoding =", str(encoding)[:50])
# This is slow.
# At word level, tokenizer filters non-word characters.

Given a string, tokenizer chops to chars regardless of parameter...
Num encoded words = 28
Encoding = dict_items([('e', 1), ('t', 2), ('o', 3), ('a', 4)


In [94]:
print("Given array of string and char_level param tokenizer chops to chars")
# The intended way to get a character vocabulary: a list holding the one
# corpus string, with char_level switched on.
text_as_array = [stext]
tokenizer = keras.preprocessing.text.Tokenizer(lower=True, char_level=True)
tokenizer.fit_on_texts(text_as_array)
encoding = tokenizer.word_index.items()
print("Num encoded words =", len(encoding))
print("Encoding =", str(encoding)[:50])
# This is fast.
# At char level, tokenizer leaves non-word characters.

Given array of string and char_level param tokenizer chops to chars
Num encoded words = 39
Encoding = dict_items([(' ', 1), ('e', 2), ('t', 3), ('o', 4)


In [104]:
# Here are different ways of retrieving the encoded sequence.
# Encode once: texts_to_sequences returns one id sequence per input text,
# and running it a second time over the full corpus is wasted work.
array_of_seq = tokenizer.texts_to_sequences(text_as_array)
print("%d total sequences"%len(array_of_seq))
# Way 1: index into the returned list.
print("%d array[0] len"%len(array_of_seq[0]))
# Way 2: unpack the single-element list; this raises ValueError if the
# result does not contain exactly one sequence.
[sequence] = array_of_seq
print("%d seq len"%len(sequence))
# Show the first characters next to their ids for comparison.
print(stext[:10])
print(sequence[:10])

1 total sequences
1115394 array[0] len
1115394 seq len
First Citi
[20, 6, 9, 8, 3, 1, 19, 6, 3, 6]


In [107]:
# The ids printed for the sequence start at 1; shift them down by one so
# that they start at 0.
# A plain Python list does not support `sequence-1`, but a numpy array
# applies the subtraction to every element.
import numpy as np
encoded = np.array(sequence) - 1
print(encoded[:10])

[19  5  8  7  2  0 18  5  2  5]
