In [1]:
import os
import warnings

warnings.filterwarnings('ignore')
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"

In [2]:
import datetime
import shutil
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
tf.get_logger().setLevel('ERROR')

In [4]:
print('Number of GPUs available:',len(tf.config.list_physical_devices('GPU')))

Number of GPUs available: 1


In [5]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
path = './dataset/'
dataset = tf.keras.utils.get_file(
    'aclImdb_v1.tar.gz', url, cache_dir=path, cache_subdir='.', untar=True 
)

dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")
train_dir = os.path.join(dataset_dir, "train")

remove_dir = os.path.join(train_dir, "unsup")
shutil.rmtree(remove_dir)

In [6]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    path+"aclImdb/train",
    validation_split=0.2,
    batch_size=batch_size,
    seed=seed,
    subset='training',
)

class_names = raw_train_ds.class_names
train_ds  = raw_train_ds.cache().prefetch(buffer_size = AUTOTUNE)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    path+ "aclImdb/train",
    validation_split=0.2,
    subset='validation',
    batch_size=batch_size,
    seed=seed,
)

val_ds = val_ds.cache().prefetch(buffer_size = AUTOTUNE)

test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    path+"aclImdb/test",
    batch_size=batch_size,
)

test_ds = test_ds.cache().prefetch(buffer_size = AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [7]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(3):
        print(f"Review:{text_batch.numpy()[i]}")
        label = label_batch.numpy()[i]
        print(f"Label:{label} ({class_names[label]})")

Review:b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label:0 (neg)
Review:b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get

In [10]:
encoder_path=''
preprocessor_path = './bert_preprocess_model/'

In [11]:
bert_preprocess_model=hub.KerasLayer(preprocessor_path,)
bert_encoder_model = hub.KerasLayer(encoder_path, trainable=False)

ValueError: Unknown signature default in https://tfhub.dev/google/small_bert/bert_uncased_L-4_H-512_A-8/2 (available signatures: _SignatureMap({'mlm': <ConcreteFunction pruned(input_ids, input_mask, mlm_positions, segment_ids) at 0x143692506A0>, 'tokenization_info': <ConcreteFunction pruned() at 0x1436A8BE040>, 'tokens': <ConcreteFunction pruned(input_ids, input_mask, segment_ids) at 0x1436A885C40>})).

In [None]:
example_text = ['This movie is good']
example_preprocessed=bert_preprocess_model(example_text)
print(example_preprocessed.keys())

dict_keys(['input_type_ids', 'input_word_ids', 'input_mask'])


In [None]:
example_preprocessed['input_word_ids']

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[ 101, 2023, 3185, 2003, 2204,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [None]:
example_results = bert_encoder_model(example_preprocessed)
print(example_results.keys())

dict_keys(['sequence_output', 'encoder_outputs', 'pooled_output', 'default'])


In [None]:
example_results['sequence_output']

<tf.Tensor: shape=(1, 128, 512), dtype=float32, numpy=
array([[[ 0.27502304,  0.2389847 ,  0.59988314, ...,  0.20327666,
          0.5059539 , -0.06914642],
        [-0.06251876,  0.3401326 , -0.33693755, ...,  0.32076624,
         -0.38289124,  0.75448704],
        [ 0.05320063,  0.01017252, -0.201627  , ...,  0.63700867,
          0.91227466,  1.0662935 ],
        ...,
        [-0.03345858, -0.77042055, -0.11749834, ...,  0.55917525,
          0.9602638 ,  0.7657006 ],
        [ 0.23096976, -0.59058255, -0.37921903, ...,  0.24811526,
          0.9153042 ,  0.6856426 ],
        [ 0.70448947, -0.95711213, -0.2863833 , ...,  0.28426343,
          0.7088048 , -0.00712512]]], dtype=float32)>

In [None]:
def classifier_model(dropout_rate):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessor_model=bert_preprocess_model(text_input)
    encoder_model=bert_encoder_model(preprocessor_model)
    dropout_layer = tf.keras.layers.Dropout(dropout_rate)(encoder_model['pooled_output'])
    result = tf.keras.layers.Dense(1, activation='sigmoid', name = 'classifier')(dropout_layer)
    return tf.keras.Model(text_input, result)

In [None]:
model = classifier_model(0.15)

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
tf.keras.utils.plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(x=train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a2eda32b80>

In [None]:
model.evaluate(test_ds)



[0.517680287361145, 0.7460399866104126]

In [None]:
y_pred = model.predict(test_ds)



In [None]:
test_ds[0]