In [None]:
# course information
# Coursera, Natural Language Processing in TensorFlow, deeplearning.ai,

# https://www.coursera.org/learn/natural-language-processing-tensorflow#about

# Week 1: Tokenization. Encoding words by their ASCII character codes is problematic: words such as LISTEN and SILENT are built from the same characters, so the codes alone do not distinguish them.
# Week 2: Word embeddings.
# Week 3: Sequence models.
# Week 4: Sequence models and literature.

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization strips punctuation and is case-insensitive, so 'dog!' and 'dog'
# (or 'My' and 'my') end up with the same token.
sentences = ['I love my dog', 'I love my cat', 'You love my dog!']

# num_words is optional; oov_token makes unseen words encode as '<OOV>'
# instead of being silently dropped.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentences)
# word_index maps each word (key) to its integer token (value).
word_index = tokenizer.word_index
print(word_index)
print(10 * "*")

# Encode the sentences the tokenizer was fitted on.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)
print(10 * "*")

# Sentences containing words the tokenizer has never seen.
test_data = [
    'I really love my dog',
    'My dog loves my manatee',
    'Do you think my dog is amazing?',
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)
print(10 * "*")

# Default padding: zeros are added in front, up to the longest sequence.
padded = pad_sequences(test_seq)
print(padded)
print(10 * "*")

# padding='post' appends the zeros instead. Truncation still uses its default
# ('pre'), so a sequence longer than maxlen loses its leading tokens.
padded = pad_sequences(test_seq, maxlen=5, padding='post')
print(padded)
print(10 * "*")

{'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7}
**********
[[4, 2, 3, 5], [4, 2, 3, 6], [7, 2, 3, 5]]
**********
[[4, 1, 2, 3, 5], [3, 5, 1, 3, 1], [1, 7, 1, 3, 5, 1, 1]]
**********
[[0 0 4 1 2 3 5]
 [0 0 3 5 1 3 1]
 [1 7 1 3 5 1 1]]
**********
[[4 1 2 3 5]
 [3 5 1 3 1]
 [1 3 5 1 1]]
**********


In [None]:
# !python3 -m pip install -q tensorflow-datasets
#IMDB sentiment classification
import tensorflow_datasets as tfds
import numpy as np

# Load the IMDB reviews dataset; as_supervised=True yields (text, label) pairs.
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# Tensors expose .numpy() (there is no .tonumpy()). For a string tensor it
# returns bytes, so decode to get a clean Python str; str(bytes) would keep
# the b'...' wrapper and escape sequences inside the text.
for s, l in train_data:
    training_sentences.append(s.numpy().decode('utf-8'))
    training_labels.append(l.numpy())

for s, l in test_data:
    testing_sentences.append(s.numpy().decode('utf-8'))
    testing_labels.append(l.numpy())

# Keras expects the labels as arrays rather than Python lists.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# Hyper-parameters shared by the tokenizer, the padding step, and the model.
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training text only; words not in it are encoded
# with the out-of-vocabulary token.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length,truncating=trunc_type)

# Apply the same truncation side to the test set so both splits are
# preprocessed identically (the default would truncate from the front).
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,truncating=trunc_type)

# model structure
# Flatten and GlobalAveragePooling1D are alternatives for collapsing the
# (sequence, embedding) output of the Embedding layer; stacking both fails
# because pooling needs 3-D input and Flatten already produces 2-D output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # binary sentiment output
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

num_epochs = 10
model.fit(padded,
          training_labels_final,
          epochs=num_epochs,
          validation_data=(testing_padded, testing_labels_final))

# The Embedding layer is the first layer of the model; its first weight array
# is the embedding matrix.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) #shape: (vocab_size, embedding_dim)

# Invert word -> index so token ids can be mapped back to words.
reverse_word_index = {index: word for word, index in word_index.items()}

#tensorboard word embedding visualization files
import io
# Token ids start at 1 (0 is used for padding). Stop at the last id that
# actually has a word in case the fitted vocabulary is smaller than vocab_size.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)
# The with-block closes both files even if writing fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + '\n')  # one word per line
        out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')  # matching vector

# When running inside Google Colab, offer both TSV files as downloads;
# anywhere else the import fails and this step is skipped.
try:
    from google.colab import files
except ImportError:
    pass
else:
    for tsv_name in ('vecs.tsv', 'meta.tsv'):
        files.download(tsv_name)


In [None]:
#subword tokenization
import tensorflow as tf
print(tf.__version__)

import tensorflow_datasets as tfds
# Load the IMDB variant that ships pre-encoded with a subword vocabulary.
# (The original had '.' instead of ',' after the dataset name: a SyntaxError.)
imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)

train_data, test_data = imdb['train'], imdb['test']
# The subword encoder is stored in the dataset's feature metadata.
tokenizer = info.features['text'].encoder
print(tokenizer.subwords)

sample_string = 'TensorFlow, from basics to mastery'

# Encode the sample, then decode it again.
tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer.decode(tokenized_string)
print('The original string:{}'.format(original_string))

# Show which piece of text each individual token stands for.
for ts in tokenized_string:
    print('{}---->{}'.format(ts, tokenizer.decode([ts])))

embedding_dim = 64
# Average-pooling classifier over subword embeddings.
# (The original was missing the comma after the Dense(6) layer: a SyntaxError.)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

num_epochs = 10
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): train_data/test_data are passed exactly as loaded; no
# shuffle/padded-batch step is visible in this cell. Confirm whether the
# datasets need batching before fit on the TensorFlow version in use.
history = model.fit(train_data, epochs=num_epochs, validation_data=test_data)

import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    history: object whose ``history`` dict holds the per-epoch values
        (here, the return value of ``model.fit``).
    string: metric key to plot; ``'val_' + string`` is plotted alongside it,
        so both keys must be present in ``history.history``.
    """
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string,'val_'+string])
    plt.show()

# The function is named plot_graphs; the original calls used the undefined
# name plot_graph and raised NameError.
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')



In [None]:
# 2 layers LSTM
# The first LSTM sets return_sequences=True so that it emits one output per
# time step, which the second LSTM consumes as its input sequence.
# (The original was missing the comma after the second Bidirectional layer.)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),  # 64 = dimensionality of the LSTM output space
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# 1D convolution layers for NLP
conv_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # 128 filters, each sliding over windows of 5 consecutive words
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(conv_layers)

In [None]:
# lyric prediction
tokenizer = Tokenizer()

data = "In the town of Athy one Jeremy Lanigan \n Battered away ... ..."
# One lower-cased corpus entry per lyric line.
corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)
# Word indices start at 1; the extra slot accounts for index 0, which
# pad_sequences uses as the padding value.
total_words = 1 + len(tokenizer.word_index)

# For every line, collect each prefix of at least two tokens: the first
# tokens are the model input and the final token is the word to predict.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

max_sequence_len = max(len(seq) for seq in input_sequences)

# Left-pad so that the label is always in the last column.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels over the whole vocabulary.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(tokenizer.word_index)

# Next-word predictor: embedding -> bidirectional LSTM -> softmax over the
# vocabulary. The classes are referenced through tf.keras because the bare
# names (Sequential, Embedding, ...) are never imported in this notebook.
model = tf.keras.Sequential()
# Inputs are the padded n-grams without their final (label) token.
model.add(tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(xs, ys, epochs=500, verbose=1)

import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot one training metric per epoch.

    history: object whose ``history`` dict holds the per-epoch values
        (here, the return value of ``model.fit``).
    string: metric key to plot. No validation curve is drawn because this
        model is fitted without validation data.
    """
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

# The model is compiled with metrics=['accuracy'], so use that key here, as
# the other plotting calls in this notebook do.
plot_graphs(history, 'accuracy')

seed_text = "Laurence went to dublin"
next_words = 100

# Invert word -> index once so each generated token is a dictionary lookup
# instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _ in range(next_words):
    # Encode the text generated so far and pad it to the model's input length.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes has been removed from tf.keras; taking the
    # argmax of the softmax output selects the same class.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # Index 0 (padding) has no word; append an empty string in that case.
    output_word = index_to_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)


