<a href="https://colab.research.google.com/github/ch00226855/CMP414765Spring2022/blob/main/Week13_AnalyzingTexts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 13
# Analyzing Texts

This notebook classifies movie reviews as positive or negative using the text of the review.

We'll use the [IMDB dataset](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb) that contains the text of 50,000 movie reviews from the Internet Movie Database. These reviews are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

**Please turn on GPU computing from the menu (in Colab: Runtime → Change runtime type → Hardware accelerator → GPU).**

In [None]:
import os

import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report library versions and GPU visibility up front so that results can be
# reproduced and slow (CPU-only) training can be diagnosed early.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
# tf.config.list_physical_devices is the stable API; the tf.config.experimental
# spelling is only kept as a backward-compatibility alias.
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

## Download the dataset

In [None]:
# Carve the official 25,000-review training split into 60% / 40%: 15,000 reviews
# for training and 10,000 for validation. The 25,000-review test split is left
# untouched for the final evaluation.
# as_supervised=True yields each example as a (review_text, label) pair.
imdb_splits = tfds.load(
    name="imdb_reviews",
    split=("train[:60%]", "train[60%:]", "test"),
    as_supervised=True,
)
train_data, validation_data, test_data = imdb_splits

## Explore the Data

In [None]:
# Inspect the structure of the training dataset: element_spec describes the
# dtype and shape of each component of a dataset element.
# (Replaces the interactive `?train_data` help lookup, which is not valid
# Python and does not survive exporting or re-running the notebook as a script.)
train_data.element_spec

In [None]:
# Peek at the data: group the reviews into batches of 10 and pull out the first
# batch. iter() builds an iterator over the batches; next() advances it once.
first_batch = next(iter(train_data.batch(10)))
train_examples_batch, train_labels_batch = first_batch
train_examples_batch

In [None]:
# Display the labels of the first 10 reviews
# (extracted together with train_examples_batch from the same batch).
train_labels_batch

## Building the Model
- Represent words as vectors using pre-trained encoder
- Decide the number of hidden layers
- Decide the number of hidden units for each layer

For this example we will use a pre-trained text embedding model from TensorFlow Hub called `gnews-swivel-20dim`. It splits each review into tokens, embeds each token, and combines the token embeddings, so every review is represented by a single vector of length 20.

# Word Embedding

## Why transform words into vectors?

## Challenges for word embedding
- curse of dimensionality
- performance metrics
- training algorithm

## Popular embedding models
- Word2Vec
- BERT
- Train your own embedding

In [None]:
# URL of the pre-trained text-embedding module on TensorFlow Hub.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

# Wrap the module as a Keras layer that takes raw strings as input.
# input_shape=[] declares each example as a single scalar string, and
# trainable=True lets the embedding weights be fine-tuned with the classifier.
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Sanity check: embed the first three reviews of the example batch.
first_three_reviews = train_examples_batch[:3]
hub_layer(first_three_reviews)

In [None]:
# Classifier: text embedding -> one hidden layer of 16 ReLU units -> one output
# unit. The output layer has no activation, so the model emits a raw logit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),
])

model.summary()

In [None]:
# from_logits=True tells the loss that the model outputs raw logits rather
# than probabilities, so the sigmoid is applied inside the loss.
binary_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(
    optimizer='adam',
    loss=binary_loss,
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs on shuffled mini-batches of 512 reviews and monitor the
# loss and accuracy on the validation set after every epoch.
# fit() returns a record of these intermediate results, which is useful for
# evaluating how training progressed.
training_batches = train_data.shuffle(10000).batch(512)
validation_batches = validation_data.batch(512)

history = model.fit(
    training_batches,
    epochs=20,
    validation_data=validation_batches,
    verbose=1,
)

## Evaluate the model

In [None]:
# Score the trained model on the held-out test split, in batches of 512.
test_batches = test_data.batch(512)
results = model.evaluate(test_batches, verbose=2)

# Pair every metric name with its value and print it with three decimals.
for metric_name, metric_value in zip(model.metrics_names, results):
    print("{}: {:.3f}".format(metric_name, metric_value))

In [None]:
# How about my own reviews?
# The model outputs one logit per review: a positive value means the review is
# predicted to be positive, a negative value means predicted negative.
my_review_texts = [
    "This movie is the worst action movie I have ever watched in my entire life.",
    "I really enjoyed the plot, but the lead actor didn't portray his character well.",
    "It is the most visually stunning movie in the series. The acting is outstanding too.",
    "I really like that everyone in this movie makes it crystal clear that they don't care the quality at all.",
    "There is nothing about the movie that I don't like. I wish everyone else just stop making movies since no moive can be better than this one.",
]
my_review = np.array(my_review_texts)
model(my_review).numpy()

In [None]:
# Take the first 20 reviews (and their labels) from the test set and compute
# the model's raw outputs for them.
first_test_batch = next(iter(test_data.batch(20)))
reviews, labels = first_test_batch
predictions = model(reviews).numpy()

In [None]:
# True labels of the 20 test reviews, as a NumPy array.
labels.numpy()

In [None]:
# Turn the raw model outputs into 0/1 class predictions by thresholding at 0,
# then flatten to a 1-D array so it lines up with the labels.
(predictions > 0).astype(int).reshape(-1)

In [None]:
# Show the text of the first review in the batch for manual inspection.
reviews[0]

# Text Generation with Recurrent Neural Networks

## Idea
- In some applications, data arrive in a sequence.
- Output is context-dependent, so each node should remember its previous status.

<img src="https://www.tensorflow.org/tutorials/text/images/text_generation_training.png" width="700">


- We will work with a dataset of Shakespeare's writing
- Build a model with `tf.keras` to analyze the sequence of characters
- Apply the model to write new text in Shakespeare's style

This project is adapted from the [TensorFlow text generation tutorial](https://www.tensorflow.org/tutorials/sequences/text_generation).

In [None]:
# Download the Shakespeare corpus and keep the path of the local copy.
SHAKESPEARE_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file('shakespeare.txt', SHAKESPEARE_URL)

## Read the data

In [None]:
# Read the whole file as bytes and decode it into a single Python string.
# The `with` block guarantees the file handle is closed once reading is done
# (a bare open(...).read() leaves the handle open until garbage collection).
with open(path_to_file, 'rb') as shakespeare_file:
    text = shakespeare_file.read().decode(encoding='utf-8')

print ('Length of text: {} characters'.format(len(text)))

In [None]:
# Take a look at the first 250 characters in text.
# print() is used (rather than a bare expression) so that newlines are rendered
# as line breaks instead of '\n' escapes.
print(text[:250])

In [None]:
# Collect the distinct characters of the corpus (a set keeps one copy of each)
# and sort them so that every character gets a stable position.
unique_chars = set(text)
vocab = sorted(unique_chars)

# Formatted string: each {} is replaced by the matching format() argument.
print ('{} unique characters'.format(len(vocab)))

## Vectorize the text

In [None]:
# Forward mapping: character -> integer index (its position in vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse mapping: indexing this array with an integer returns the character.
idx2char = np.array(vocab)

# Encode the entire corpus as an array of integer indices.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:
# Pretty-print the first 20 entries of the character -> index mapping.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

In [None]:
# Show how the first 13 characters from the text are mapped to integers.
# repr() gives the quoted string representation, so whitespace and newlines
# stay visible; this matches how the other cells print text snippets.
# (The previous str() call was a no-op on a value that is already a string.)
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

In [None]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Each training example consumes seq_length + 1 characters (the input plus the
# target shifted by one character), so that is the divisor for the number of
# examples per epoch.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
# Dataset.from_tensor_slices(): convert a numpy array to a tf Dataset with one
# element per character index.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Dataset.take(): create a sub Dataset; here, the first 5 characters.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

In [None]:
# batch(): cut the character stream into chunks of seq_length + 1 characters;
# drop_remainder=True discards the final, shorter chunk.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first 5 chunks back to text. join() concatenates the characters
# into one string and repr() keeps newlines visible as '\n'.
for chunk in sequences.take(5):
    chunk_text = ''.join(idx2char[chunk.numpy()])
    print(repr(chunk_text))

## Match input character with output character

In [None]:
# Each chunk is turned into an (input, target) pair by duplicating it and
# shifting one copy by a single position.
def split_input_target(chunk):
    """Split a chunk into an input sequence and its one-step-shifted target.

    Args:
        chunk: any sliceable sequence (list, string, array, tensor).

    Returns:
        A tuple (input_text, target_text): the chunk without its last element,
        and the chunk without its first element.
    """
    return chunk[:-1], chunk[1:]

# map(): similar to pandas.DataFrame.apply() -- applies split_input_target to
# every chunk in `sequences`, producing a dataset of (input, target) pairs.
dataset = sequences.map(split_input_target)

In [None]:
# Decode the first (input, target) pair back to text to check the one-character
# shift between the two sequences.
for input_example, target_example in dataset.take(1):
    input_text = ''.join(idx2char[input_example.numpy()])
    target_text = ''.join(idx2char[target_example.numpy()])
    print ('Input data: ', repr(input_text))
    print ('Target data:', repr(target_text))

In [None]:
# Walk through the first 5 time steps of the example pair from the previous
# cell: at each step the model receives one input character and is expected to
# output the character that follows it in the text.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    # Show the index as stored in the dataset and, in parentheses, the character it stands for.
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

## Shuffle and create training batches

In [None]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Size of the shuffle buffer.
# tf.data is designed to work with possibly infinite sequences, so it does not
# shuffle the whole dataset in memory; it shuffles elements within a buffer of
# this size instead.
BUFFER_SIZE = 10000

# NOTE: this cell rebinds `dataset`, so it is not idempotent -- running it a
# second time would batch the already-batched dataset. Re-run the cell that
# creates `dataset` first if this cell has to be executed again.
shuffled_dataset = dataset.shuffle(BUFFER_SIZE)
dataset = shuffled_dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset

## Build the training model

- Embedding layer
- GRU layer

In [None]:
# Model hyperparameters, passed to build_model() in the cells below.

# Length of the vocabulary in chars (number of distinct characters in the text)
vocab_size = len(vocab)

# The embedding dimension: size of the vector learned for each character
embedding_dim = 256

# Number of RNN units in the GRU layer
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding input
            size and as the number of output units.
        embedding_dim: length of the vector learned for each character.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch size baked into the model's input shape; the
            sequence length is left unspecified (None).

    Returns:
        An uncompiled tf.keras.Sequential model that outputs, for every time
        step, one value per character in the vocabulary.
    """
    model = tf.keras.Sequential()

    # Character index -> dense vector. The batch dimension is fixed here
    # because the GRU below is created with stateful=True.
    model.add(tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None]))

    # return_sequences=True yields an output at every time step rather than
    # only at the last one.
    model.add(tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform'))

    # One output unit per vocabulary entry, with no activation applied.
    model.add(tf.keras.layers.Dense(vocab_size))

    return model

In [None]:
# Build the training model; its batch size must match the batches produced by
# the training dataset (BATCH_SIZE sequences each).
training_model_config = dict(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
model = build_model(**training_model_config)

In [None]:
# Run the (still untrained) model on one batch to check the output shape.
# The loop body runs once, because take(1) yields a single batch; the loop
# variables stay available to the following cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

In [None]:
# Sample one character index per time step for the first sequence in the batch.
# The indices are drawn from the distribution defined by the model's outputs
# rather than picked with argmax.
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.random.categorical(first_sequence_logits, num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
# Decode the first input sequence and the sampled predictions back to text.
# The tensor is converted with .numpy() before being used to index idx2char,
# as in the other decoding cells; sampled_indices is already a NumPy array.
input_chars = idx2char[input_example_batch[0].numpy()]
predicted_chars = idx2char[sampled_indices]

print("Input: \n", repr("".join(input_chars)))
print()
print("Next Char Predictions: \n", repr("".join(predicted_chars)))

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class indices (here: the target character indices).
        logits: unnormalized model outputs; from_logits=True makes the loss
            apply the softmax itself.

    Returns:
        The loss values computed by
        tf.keras.losses.sparse_categorical_crossentropy.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )


# Loss of the untrained model on the example batch, as a baseline.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
mean_example_loss = example_batch_loss.numpy().mean()
print("scalar_loss:      ", mean_example_loss)

In [None]:
# Configure training: Adam optimizer with the custom loss() defined above.
model.compile(optimizer='adam', loss=loss)

In [None]:
# (`os` is imported in the import cell at the top of the notebook.)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left in the path
# for the callback to fill in, so each epoch gets its own file.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of passes over the training dataset.
EPOCHS=10

# Train the model; checkpoint_callback is invoked by fit() to save weights.
# NOTE: this rebinds `history`, replacing the value stored by the
# review-classifier training cell earlier in the notebook.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
# Locate the most recent checkpoint written during training. The path is
# looked up once and reused; latest_checkpoint() returns None when the
# directory contains no checkpoint, so fail early with a clear message instead
# of passing None on to load_weights().
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))
print("Restoring weights from:", latest_checkpoint)

# Rebuild the same architecture with batch_size=1 so that text can be generated
# one sequence at a time, then load the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))