## Download and pre-process data

In [None]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [None]:
# IMDB sentiment archive hosted at ai.stanford.edu.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the tarball into the current directory (cache_dir='.', cache_subdir='')
# and unpack it (untar=True).
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The 'aclImdb' folder is looked up next to the path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

In [None]:
# Training split lives in the 'train' sub-folder of the extracted dataset;
# list it to see which sub-directories are present.
train_dir = os.path.join(
    dataset_dir,
    'train',
)
os.listdir(train_dir)

In [None]:
# Drop the 'unsup' folder from the training directory so that only the
# remaining sub-folders are picked up as classes when the directory is loaded.
remove_dir = os.path.join(train_dir, 'unsup')

# Guard the removal: once the folder has been deleted, re-running this cell
# (or a later "Restart & Run All") must not fail with FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

## Data Loading

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
batch_size = 1024
seed = 12345

# NOTE(review): the download cell extracts under cache_dir='.', whereas this
# path points one directory up - confirm which location is intended.
imdb_train_path = '../aclImdb/train'

# Arguments shared by both subsets: same batch size, same 80/20 split and the
# same seed for the training and validation calls.
split_kwargs = dict(
    batch_size=batch_size,
    validation_split=0.2,
    seed=seed,
)

train_ds = tf.keras.utils.text_dataset_from_directory(
    imdb_train_path, subset='training', **split_kwargs)
val_ds = tf.keras.utils.text_dataset_from_directory(
    imdb_train_path, subset='validation', **split_kwargs)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
# Show the label and the raw review text of the first example in one batch.
for text_batch, label_batch in train_ds.take(1):
    print(label_batch[0].numpy(), text_batch[0].numpy().decode(), sep="\n")

1
How to lose friends and alienate people is decent comedy with a bit of romantic approach.<br /><br />It's actually a story of Sidney Young(Simon Pegg) breaking through in journalist and magazine writing business which is interpreted in a funny way. Simon Pegg made an OK appearance, slightly worse than his usual. Movie is not hilarious or funny all the way or anything like that but it has its moments, and those moments are really hilarious.<br /><br />I recommend this fun and worth watching American with English cream comedy to all people who just wanna sit, relax and enjoy movie for what it is. If you're about to watch this movie with critical approach then you should pass unless you want to be disappointed and start trashing it.


In [7]:
def custom_standardization(input_data):
    """Lowercase text, swap ``<br />`` tags for a space, and drop punctuation.

    Args:
        input_data: String data accepted by ``tf.strings.lower`` (a Python
            string or a string tensor).

    Returns:
        The result of the final ``tf.strings.regex_replace`` call, i.e. the
        lowercased text with ``<br />`` replaced by a space and every
        character of ``string.punctuation`` removed.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

In [9]:
# Run the standardization on the first review of the last batch that was read
# and print the cleaned-up text.
raw_review = text_batch[0].numpy().decode()
output = custom_standardization(raw_review)
print(output.numpy().decode())

how to lose friends and alienate people is decent comedy with a bit of romantic approach  its actually a story of sidney youngsimon pegg breaking through in journalist and magazine writing business which is interpreted in a funny way simon pegg made an ok appearance slightly worse than his usual movie is not hilarious or funny all the way or anything like that but it has its moments and those moments are really hilarious  i recommend this fun and worth watching american with english cream comedy to all people who just wanna sit relax and enjoy movie for what it is if youre about to watch this movie with critical approach then you should pass unless you want to be disappointed and start trashing it


## Vectorization

In [10]:
# Passed as `max_tokens` to the TextVectorization layer below.
# NOTE(review): probably meant to be spelled `vocab_size`; the name is kept
# as-is because other cells may reference it.
vocal_size = 20_000
# Passed as `output_sequence_length` to the TextVectorization layer below.
sequence_len = 200

# NOTE(review): this repeats the `custom_standardization` definition from the
# earlier cell; consider keeping a single definition.
def custom_standardization(input_data):
    """Lowercase text, swap ``<br />`` tags for a space, and drop punctuation.

    Args:
        input_data: String data accepted by ``tf.strings.lower`` (a Python
            string or a string tensor).

    Returns:
        The result of the final ``tf.strings.regex_replace`` call, i.e. the
        lowercased text with ``<br />`` replaced by a space and every
        character of ``string.punctuation`` removed.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

# Layer that turns raw review strings into integer token ids, using the custom
# standardization above, a vocabulary capped at `vocal_size` entries and an
# output length of `sequence_len`.
vectorization = tf.keras.layers.TextVectorization(
    max_tokens=vocal_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_len,
    output_mode="int",
)

# Fit the layer's vocabulary on the training texts only (labels are dropped).
train_text_ds = train_ds.map(lambda text, label: text)
vectorization.adapt(train_text_ds)

In [11]:
def vectorize_text(text, label):
    """Encode `text` with the `vectorization` layer, passing `label` through.

    Args:
        text: Text element(s) from the dataset; a trailing axis is appended
            before the layer is applied.
        label: Label element(s), returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorization(text_column)
    return token_ids, label

# Replace the raw-text datasets with their vectorized counterparts.
# NOTE(review): this rebinds `train_ds` / `val_ds`, so the cell is meant to be
# run once per kernel session; re-running it maps `vectorize_text` a second time.
train_ds, val_ds = (
    split.map(vectorize_text) for split in (train_ds, val_ds)
)

In [13]:
# Show the label and the encoded token ids of the first training example.
for text_batch, label_batch in train_ds.take(1):
    print(label_batch[0].numpy(), text_batch.numpy()[0], sep="\n")

1
[  280    43   457  1037   107    31     2   201    58    31     2   201
   268     9    13    37   270    15    11   647    12    10    67  4890
    42     2     1     5     2   134     3   199     9  1823   368    46
   113    18     9     7    21     9     7    30   146     2    62     7
   160   156    15     2    80     5     2  4434  2751   326  3058     9
     1   708    20     2  8875     5     2    80    32  4899  1714     2
 18710     2  4535     1     5     2    80    32  4149  1829  1072    20
     2  1837   505     6     2   212    12     2  1242  1588  6270    41
    54     2     1     1    32     2   166  1059  4751    36  1207    99
   519     6 11394     2  4822    82   574     3  1597 16682    65 18015
     4  7910  1902     8     2    19  4679    11    77    12    34   456
     6    66     2  1873  6245    20     2  1837    30     5   128  6705
    25    75   249     8     1  2837   862     8     2   751  3175   450
    29   484    36    43  2377   447 18015     3 

In [14]:
# Show the label and the encoded token ids of the first validation example.
for text_batch, label_batch in val_ds.take(1):
    print(label_batch[0].numpy(), text_batch.numpy()[0], sep="\n")

1
[    4  2413  1262    17    12    43     4     1   496  4734 11601     7
    14   202   873  6500     1    43   349   104    85   341    18    47
     7    40   139    42    11    28    74  6861   122  1782    87     5
    30   822   182     6    66    11   469     1   162     1     1    10
    13    37  7521    32    39  1297    10    67     6   810     2    17
    37    10    97   103    39   170     3   170    53     7     4  4328
     5  2981  3638  4224    32     2   320   970   683   142    21     2
    60    28    36    13 13690    32    11   180   509     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0 

In [None]:
# Cache the datasets after their first pass and prefetch upcoming batches.
# tf.data.AUTOTUNE lets tf.data choose the prefetch depth at runtime in place
# of a hard-coded buffer size; the elements produced are unchanged.
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)