In [7]:
pip install tensorflow_datasets

Collecting tensorflow_datasets
  Downloading tensorflow_datasets-4.9.2-py3-none-any.whl (5.4 MB)
[K     |████████████████████████████████| 5.4 MB 2.0 MB/s eta 0:00:01
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
Collecting array-record
  Downloading array_record-0.4.0-py38-none-any.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 6.1 MB/s eta 0:00:01
[?25hCollecting importlib-resources
  Downloading importlib_resources-6.0.0-py3-none-any.whl (31 kB)
Collecting protobuf>=3.20
  Downloading protobuf-4.23.4-cp37-abi3-macosx_10_9_universal2.whl (400 kB)
[K     |████████████████████████████████| 400 kB 4.4 MB/s eta 0:00:01
[?25hCollecting tensorflow-metadata
  Downloading tensorflow_metadata-1.13.1-py3-none-any.whl (28 kB)
Collecting etils[enp,epath]>=0.9.0
  Downloading etils-1.3.0-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 3.5 MB/s eta 0:00:01
[?25hCollecting dm-tree
  Downloading dm_tree-0.1.8-cp38-cp38-macosx_10_9_x86_64

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout

Definition of a plot function for training result visualization

In [5]:
def plot_results(history):
    """Plot the accuracy and loss curves recorded during a Keras training run.

    Draws two stacked panels sharing the epoch axis (epochs numbered from 1):
    accuracy on top and loss below, each with its validation and training curve.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists, such as the object returned by ``model.fit``.
            Curves are looked up under the names "loss", "accuracy",
            "val_loss" and "val_accuracy" (the short forms "acc" and
            "val_acc" are accepted as well). A curve that was not recorded
            is skipped instead of raising.
    """
    hist_df = pd.DataFrame(history.history)
    # Look curves up by name rather than relabelling the columns by position:
    # positional relabelling mislabels curves when the key order differs and
    # raises when the run did not record exactly four metrics.
    hist_df = hist_df.rename(columns={"acc": "accuracy", "val_acc": "val_accuracy"})
    hist_df.index = np.arange(1, len(hist_df) + 1)

    fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
    for ax, (metric, title) in zip(axs, (("accuracy", "Accuracy"), ("loss", "Loss"))):
        curves = ((f"val_{metric}", "Validation"), (metric, "Training"))
        drawn = False
        for column, split in curves:
            if column in hist_df.columns:
                ax.plot(hist_df[column], lw=3, label=f"{split} {title}")
                drawn = True
        ax.set_ylabel(title)
        ax.set_xlabel('Epoch')
        ax.grid()
        if drawn:
            # Only build a legend when at least one labelled curve exists.
            ax.legend(loc=0)

    plt.show()

## Preprocessing of the data

We load the IMDB reviews dataset directly from the tensorflow_datasets API and apply the usual preprocessing (shuffling, batching and prefetching) before feeding it to a neural network

In [8]:
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset together with its metadata.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

batch_size = 32

# Split sizes come from the dataset metadata; they are used later to derive
# the number of steps per epoch.
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

# Training pipeline: shuffle, repeat indefinitely, batch, prefetch.
train_set = (
    datasets["train"]
    .shuffle(10000)
    .repeat()
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: same as above but without shuffling.
test_set = (
    datasets["test"]
    .repeat()
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

2023-07-22 17:41:41.200536: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/skyemalfoy/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /Users/skyemalfoy/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete2X4R5M/imdb_reviews-tr…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /Users/skyemalfoy/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete2X4R5M/imdb_reviews-te…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /Users/skyemalfoy/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete2X4R5M/imdb_reviews-un…

[1mDataset imdb_reviews downloaded and prepared to /Users/skyemalfoy/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


2023-07-22 17:42:30.061874: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Use of a pretrained embedding

We use a pretrained embedding loaded directly from tensorflow_hub

In [9]:
# TF Hub handle of the pretrained nnlm-en-dim50 text-embedding module.
nnlm_handle = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1"
embed = hub.load(nnlm_handle)

We test on two (famous) lines and check the shapes of the embedding results

In [10]:
# Run the embedding on two sample verses, then show the result and its shape.
sample_lines = [
    "A thing of beauty is a joy forever",
    "If by dull rhymes our English must be chain'd",
]
embeddings = embed(sample_lines)
for shown in (embeddings, embeddings.shape):
    print(shown)

tf.Tensor(
[[ 0.03275988  0.18106811  0.13030443  0.05100623  0.12367279 -0.11072872
   0.1655957  -0.0049278  -0.3281556   0.05204761  0.17150185  0.01282718
  -0.09332222  0.1672171  -0.05711355 -0.22492586 -0.15962309 -0.00958291
  -0.11166596 -0.42931503 -0.0194127  -0.20494537  0.25295272  0.05954154
  -0.25411132  0.12579551 -0.16218384 -0.10604351  0.27133545 -0.15765025
  -0.31424785  0.21318786 -0.10896667  0.14070608 -0.24665987  0.1579746
   0.24865562  0.04819695  0.10051076 -0.24969979  0.15491936 -0.0360333
   0.07346644  0.10915987 -0.08220651  0.12550174  0.16840625 -0.01693668
   0.0715794  -0.04162662]
 [ 0.16800539  0.24028125 -0.30071175  0.07007764 -0.18024668  0.07986181
   0.05427119 -0.28110817 -0.22582981  0.26624134  0.13623291 -0.11988997
   0.16064322 -0.04873525 -0.08858649 -0.15337813  0.00109797 -0.26315662
   0.3372981  -0.14884004  0.17933601 -0.12853579 -0.15982151 -0.10252967
  -0.03884843  0.08044805 -0.20275603 -0.17167023  0.20971875 -0.12899558
  

## Neural network model definition

Build a neural network using keras sequential layers

(you may have a look at https://keras.io/api/layers/)

In [None]:
# Question 1: Build a neural network using relevant layers, dimensions and activation functions
# (the input layer is already defined to help you).
model = tf.keras.models.Sequential([
    # Input layer: wraps the pretrained embedding loaded earlier. It takes
    # scalar strings (dtype=tf.string, input_shape=[]) and is declared to
    # output vectors of size 50 (output_shape=[50]).
    hub.KerasLayer(embed,
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    # TODO: add your own layers below (see https://keras.io/api/layers/)
    # TODO: ...
    # TODO: ...
])

We check that everything is fine with the model as we defined it

In [None]:
# Print the model summary to check the architecture built for Question 1.
model.summary()

We compile the model, choosing the relevant loss function, optimizer and metrics

(You may have a look at
https://keras.io/api/losses/
and
https://keras.io/api/optimizers/)

In [None]:
# Question 2: Choose a relevant loss function and optimizer for the training
loss_function = # TODO: fill in a loss (see https://keras.io/api/losses/)
optimizer = # TODO: fill in an optimizer (see https://keras.io/api/optimizers/)

# Accuracy is requested as a metric; plot_results reads the accuracy curves.
model.compile(loss=loss_function, optimizer=optimizer,
              metrics=["accuracy"])

We train the model on the dataset

In [None]:
# Question 3: Choose a relevant value for epochs
# (Start with small values for epochs in order to save some computation time)
epochs = # TODO: fill in the number of epochs

# train_set and test_set both repeat indefinitely, so the number of batches per
# epoch and per validation pass is passed explicitly (split size // batch size).
history = model.fit(train_set, steps_per_epoch=train_size // batch_size, epochs=epochs, validation_data=test_set, validation_steps=test_size // batch_size)

## Result visualization

In [None]:
# Plot the accuracy and loss curves recorded by model.fit.
plot_results(history)

In [None]:
# Question 4: What can you tell about the results? Do they seem satisfying to you? Do you see any hint of overfitting? If so, what kind of layers could you add to the Keras model in order to prevent it?