In [None]:
import os
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the runtime environment (library versions, eager mode, GPU presence).
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)


In [None]:
# Carve the IMDB training set 60/40: 15,000 examples for training and
# 10,000 for validation; the 25,000 test examples are loaded unchanged.
# `as_supervised=True` makes every element a (text, label) pair.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    as_supervised=True,
    split=("train[:60%]", "train[60%:]", "test"),
)

In [None]:
# Display the training dataset object itself (its repr), not its contents.
train_data

In [None]:
# Preview only a few examples: looping over the whole training split would
# print every review and flood the notebook output.
for example in train_data.take(3):
    print(f"Features :{example[0].numpy()}")
    print(f"Labels : {example[1].numpy()}")

In [None]:

def get_labels_from_tfdataset(tfdataset, batched=False):
    """Collect the label component of every element of ``tfdataset``.

    Each element is expected to be an indexable ``(features, label)`` pair;
    the label is read from position 1.

    Args:
        tfdataset: Iterable of ``(features, label)`` pairs.
        batched: If True, return the labels as a Python list with one entry
            per dataset element. If False (default), merge them into a
            single tensor along axis 0.

    Returns:
        A list of labels when ``batched`` is True, otherwise one tensor.
    """
    labels = [element[1] for element in tfdataset]

    if batched:
        return labels

    # Per-example labels of an unbatched dataset are rank-0. tf.concat joins
    # along an existing axis, so scalars must be stacked into a new axis.
    if labels and len(getattr(labels[0], "shape", ())) == 0:
        return tf.stack(labels, axis=0)

    # Labels that already have a leading (batch) axis are concatenated on it.
    return tf.concat(labels, axis=0)

In [None]:

def get_features_from_tfdataset(tfdataset, batched=False):
    """Collect the feature component of every element of ``tfdataset``.

    Each element is expected to be an indexable ``(features, label)`` pair;
    the features are read from position 0.

    Args:
        tfdataset: Iterable of ``(features, label)`` pairs.
        batched: If True, return the features as a Python list with one
            entry per dataset element. If False (default), merge them into
            a single tensor along axis 0.

    Returns:
        A list of features when ``batched`` is True, otherwise one tensor.
    """
    features = [element[0] for element in tfdataset]

    if batched:
        return features

    # Per-example features of an unbatched dataset can be rank-0 (e.g. one
    # string per review). tf.concat joins along an existing axis, so scalars
    # must be stacked into a new axis instead.
    if features and len(getattr(features[0], "shape", ())) == 0:
        return tf.stack(features, axis=0)

    # Features that already have a leading (batch) axis are concatenated on it.
    return tf.concat(features, axis=0)

In [None]:
# Gather every training label into a single tensor (default batched=False).
labels = get_labels_from_tfdataset(train_data)

In [None]:
# Display the collected label tensor.
labels

In [None]:
# Gather every training review into a single tensor (default batched=False).
features = get_features_from_tfdataset(train_data)

In [None]:
# Inspect the shape and rank of the collected features tensor.
features.shape , features.ndim

In [None]:
# Count whitespace-separated tokens in each review, then average the counts.
sent_lens = list(map(lambda sentence: len(sentence.split()), features.numpy()))
avg_sent_len = np.mean(sent_lens)
avg_sent_len  # mean number of tokens per review

In [None]:
import matplotlib.pyplot as plt
# Histogram of review lengths; title and axis labels let the figure stand
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(sent_lens, bins=7)
ax.set(
    title="Distribution of review lengths",
    xlabel="Tokens per review",
    ylabel="Number of reviews",
);

In [None]:


# Sequence length that covers 95% of the reviews; int() drops the
# fractional part of the interpolated percentile.
output_seq_len = int(
    np.percentile(sent_lens, q=95)
)
output_seq_len


In [None]:
# Show one review twice: as its raw numpy value and as the tensor element.
features[1].numpy() ,features[1]

In [None]:
from tensorflow.keras import layers

max_vocab = 7000  # Maximum vocab size.
# Pad/truncate every review to the 95th-percentile length measured from the
# training reviews (`output_seq_len`) rather than a hard-coded 6000. The
# fixed value ignored that measurement, and this length is also the number
# of timesteps any recurrent layer fed by the embedding has to process.
max_seq_len = output_seq_len  # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_vocab,
    output_mode="int",
    output_sequence_length=max_seq_len,
)

# Build the vocabulary from the raw training text before the layer is used.
vectorize_layer.adapt(features)

# Trainable lookup table mapping each token id to a 5-dimensional vector.
embedding_layers = layers.Embedding(
    input_dim=max_vocab,
    output_dim=5,
    embeddings_initializer="uniform",
    input_length=max_seq_len,
    name="embedding_layers",
)


In [None]:
import random
# Pick one review at random and show it next to its vectorized form.
random_sentence = random.choice(features)
print(f"Original text:\n{random_sentence}      \n\nVectorized version:")
vectorize_layer([random_sentence])

In [None]:
# Run the adapted vectorizer over every collected training review.
# NOTE(review): this holds one row of `max_seq_len` token ids per review in
# memory; a small slice (e.g. features[:5]) may be enough for a sanity check.
test = vectorize_layer(features)
test

In [None]:
# Look up an embedding vector for every token id in `test`.
# NOTE(review): the layer's output_dim is 5, so the result holds five values
# per token id; expect it to be several times larger than `test`.
test_emb = embedding_layers(test)
test_emb

In [None]:
# Baseline classifier: raw text -> token ids -> embeddings -> pooled vector
# -> small MLP -> sigmoid probability.
# NOTE: `vectorize_layer` and `embedding_layers` are the same layer objects
# used elsewhere in the notebook, so their weights are shared.
model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Input(shape=(1,), dtype="string", name="Input"))
model_2.add(vectorize_layer)
model_2.add(embedding_layers)
# The embedding output is 3-D (batch, sequence, features). Without pooling,
# the Dense layers act on each token separately and the model would emit one
# prediction per token instead of one per review.
model_2.add(tf.keras.layers.GlobalAveragePooling1D())
model_2.add(tf.keras.layers.Dense(32, activation='relu'))
model_2.add(tf.keras.layers.Dense(16, activation='relu'))
model_2.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid))

model_2.summary()

model_2.compile(optimizer=tf.keras.optimizers.Adam(),
                loss=tf.keras.losses.binary_crossentropy,
                metrics=["accuracy"])



In [None]:
# Functional-API model: raw string -> token ids -> embeddings -> LSTM -> sigmoid.
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(inputs)
x = embedding_layers(x)
x = layers.LSTM(units=64)(x)
outputs = layers.Dense(
    units=1,
    activation=tf.keras.activations.sigmoid,
)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.summary()

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported during training.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
# Train on the in-memory review/label tensors for 10 epochs.
# NOTE(review): `validation_data` is loaded earlier but not passed here, so
# no validation metrics are recorded in `history` — confirm this is intended.
history = model.fit(
    x=features,
    y=labels,
    epochs=10,
    verbose=1,
)
