In [9]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers

In [10]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1", 
    url,
    untar=True,
    cache_dir='.',
    cache_subdir=''
)

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [11]:
os.listdir(dataset_dir)


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [12]:
train_dir = os.path.join(dataset_dir,"train")
os.listdir(train_dir)


['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [13]:
shutil.rmtree(os.path.join(train_dir,"unsup"))

In [14]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [15]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [16]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
'aclImdb/test',
batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [21]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', '')
    return tf.strings.regex_replace(
        stripped_html,
        f'[{re.escape(string.punctuation)}]',
        ""
    )

In [22]:


max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)



In [23]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [24]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)  # Add an extra dimension to the text
    return vectorize_layer(text), label


In [26]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds =raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds =raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds =raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [27]:
embedding_dim = 16
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Conv1D(8, 7, activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(8, activation="relu"),
    layers.Dense(1)
])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 conv1d (Conv1D)             (None, None, 8)           904       
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout (Dropout)           (None, 8)                 0         
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                        

In [28]:
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.TensorBoard(log_dir="logs")
    ]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x175a9685310>

In [1]:
%load_ext tensorboard
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 9228), started 6 days, 23:02:15 ago. (Use '!kill 9228' to kill it.)

In [30]:
loss,accuracy = model.evaluate(test_ds)
print("Loss:",loss)
print("Accuracy:",accuracy)


Loss: 0.4672938585281372
Accuracy: 0.8461599946022034


In [31]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])

export_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=['accuracy']
)

loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)


0.846560001373291


In [32]:
export_model.save("sentence_classificati`on_model")



INFO:tensorflow:Assets written to: sentence_classificati`on_model\assets


INFO:tensorflow:Assets written to: sentence_classificati`on_model\assets


In [33]:
reviews = tf.constant(['The movie is very boring', 'A Good Movie', 'very bad worst movie', 'Worst movie, boring'])

print("## Inference")
res = export_model(reviews)

for review, val in zip(reviews, res):
    review = review.numpy().decode()
    val = val.numpy().squeeze()
    print(f"{review:<30}: {val:>.3f}")


## Inference
The movie is very boring      : 0.455
A Good Movie                  : 0.756
very bad worst movie          : 0.343
Worst movie, boring           : 0.483
