In [1]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  2095k      0  0:00:39  0:00:39 --:--:-- 2298k0   147k      0  0:09:17  0:00:03  0:09:14  147k


In [2]:
# Remove the extra `unsup` folder from train/ so that only the two class
# folders used by the rest of the notebook ("neg" and "pos") remain there.
# presumably `unsup` holds unlabeled reviews - verify against the dataset README.
!rm -r aclImdb/train/unsup

In [5]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

# Carve a 20% validation split out of the training data, one sentiment class
# at a time. The cell is idempotent: running it again (with or without
# re-extracting the archive first) always ends with the same files in val/.
for category in ("neg", "pos"):
    train_category_dir = train_dir / category
    val_category_dir = val_dir / category

    # exist_ok: a re-run must not crash on the directory a previous run made.
    os.makedirs(val_category_dir, exist_ok=True)

    # Undo any earlier split so we always start from the full training set.
    # os.replace overwrites an existing target, which also covers the case
    # where the archive was re-extracted and train/ again holds copies of the
    # files sitting in val/ (otherwise those copies would leak validation
    # data into training).
    for fname in os.listdir(val_category_dir):
        os.replace(val_category_dir / fname, train_category_dir / fname)

    # os.listdir returns names in arbitrary order; sort first so the seeded
    # shuffle selects the same validation files on every machine and run.
    files = sorted(os.listdir(train_category_dir))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))

    # Slice from an explicit start index: with num_val_samples == 0 the
    # expression files[-0:] would be the WHOLE list and move every file.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_category_dir / fname, val_category_dir / fname)

In [7]:
from tensorflow import keras
batch_size = 32

# Build one batched, labelled text dataset per split directory. The generator
# is consumed left to right by the tuple unpacking, so the directories are
# read in train -> val -> test order.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(
        f"aclImdb/{split}", batch_size=batch_size
    )
    for split in ("train", "val", "test")
)

Found 20000 files belonging to 2 classes.


2022-10-21 22:47:54.724794: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: system has unsupported display driver / cuda driver combination
2022-10-21 22:47:54.724930: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: v2fftb-HP-Pavilion-Gaming-Laptop-15-dk1xxx
2022-10-21 22:47:54.724946: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: v2fftb-HP-Pavilion-Gaming-Laptop-15-dk1xxx
2022-10-21 22:47:54.725316: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.85.2
2022-10-21 22:47:54.725366: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2022-10-21 22:47:54.725379: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 515.65.1 does not match DSO version 510.85.2 -- cannot find working devices in this configuration
2022-10-21 22:47:54.727696: 

Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [8]:
from tensorflow import keras
from tensorflow.keras import layers
def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        max_tokens: Width of each input vector (one slot per vocabulary entry).
        hidden_dim: Number of units in the hidden ReLU layer.

    Returns:
        A compiled ``keras.Model`` with a single sigmoid output, using the
        rmsprop optimizer, binary cross-entropy loss and an accuracy metric.
    """
    features = keras.Input(shape=(max_tokens,))

    # Hidden representation, with dropout applied before the output layer.
    hidden = layers.Dense(hidden_dim, activation="relu")(features)
    hidden = layers.Dropout(0.5)(hidden)

    # One sigmoid unit for the two-class decision.
    prediction = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = keras.Model(features, prediction)
    classifier.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

In [10]:
from tensorflow.keras.layers import TextVectorization
# Bag-of-bigrams encoder producing one multi-hot vector per review.
# max_tokens is 20000 to match the default input width of get_model();
# change both together.
text_vectorization = TextVectorization(
    max_tokens=20000, ngrams=2, output_mode="multi_hot"
)

In [11]:
# Keep only the raw text of each (text, label) batch; this label-free view of
# the training data is what the vectorizer is adapted on.
text_only_train_ds = train_ds.map(lambda text, _label: text)

In [15]:
# Fit the vectorizer on the training text only.
text_vectorization.adapt(text_only_train_ds)


def _vectorize(text, label):
    """Encode a batch of raw reviews; the labels pass through unchanged."""
    return text_vectorization(text), label


# Apply the same encoding to every split.
binary_2gram_train_ds = train_ds.map(_vectorize, num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(_vectorize, num_parallel_calls=4)
# NOTE: "binray" is a misspelling of "binary". The name is kept as-is because
# the evaluation cell reads this exact variable; rename both places together.
binray_2gram_test_ds = test_ds.map(_vectorize, num_parallel_calls=4)


2022-10-21 23:00:12.486337: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 32613480 exceeds 10% of free system memory.


In [17]:
# Build a fresh classifier with get_model()'s defaults (20000 inputs,
# 16 hidden units) and print its layer / parameter-count table.
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [19]:

# Checkpoint to "binary_2gram.keras", overwriting the file only when an epoch
# beats the best one so far (save_best_only). "Best" is judged on
# ModelCheckpoint's default monitored quantity - val_loss per the Keras docs;
# confirm for the installed version.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="binary_2gram.keras",
        save_best_only=True,
    ),
]


In [20]:
# Train for 10 epochs on the vectorized training set, evaluating on the
# validation set after each epoch and checkpointing through `callbacks`.
model.fit(
    binary_2gram_train_ds,
    validation_data=binary_2gram_val_ds,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42f4445120>

In [24]:
# Reload the checkpoint written during training instead of using the
# final-epoch weights still held in `model`, then score it on the test split.
# (`binray_2gram_test_ds` is the - misspelled - name defined in the
# vectorization cell.)
model = keras.models.load_model("binary_2gram.keras")
test_metrics = model.evaluate(binray_2gram_test_ds)
print(f"Test [loss, acc]: {test_metrics}")

Test [loss, acc]: [0.26725658774375916, 0.8978000283241272]
