# Phân loại IMDB với Regular Expressions và Embedding

In [1]:
import tensorflow as tf
import numpy as np

## Tạo raw dataset

In [2]:
batch_size = 32

# The training and validation subsets are both read from the same directory
# with the same validation_split and seed; only `subset` differs.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    "Data/aclImdb/train", subset="training", **split_options
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    "Data/aclImdb/train", subset="validation", **split_options
)
# The test directory is loaded whole, without any split.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    "Data/aclImdb/test", batch_size=batch_size
)

# Report how many batches each dataset yields.
for split_name, split_ds in (
    ("raw_train_ds", raw_train_ds),
    ("raw_val_ds", raw_val_ds),
    ("raw_test_ds", raw_test_ds),
):
    print(f"Number of batches in {split_name}: {split_ds.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [3]:
# Pull a single batch from the training set and show its raw contents:
# first the review texts, then the matching labels.
for text_batch, label_batch in raw_train_ds.take(1):
    for batch_part in (text_batch, label_batch):
        print(batch_part.numpy())


[b'I\'ve seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylent Green fits into the latter category. Yes, at times it\'s a little campy, and yes, the furniture is good for a giggle or two, but some of the film seems awfully prescient. Here we have a film, 9 years before Blade Runner, that dares to imagine the future as somthing dark, scary, and nihilistic. Both Charlton Heston and Edward G. Robinson fare far better in this than The Ten Commandments, and Robinson\'s assisted-suicide scene is creepily prescient of Kevorkian and his ilk. Some of the attitudes are dated (can you imagine a filmmaker getting away with the "women as furniture" concept in our oh-so-politically-correct-90s?), but it\'s rare to find a film from the Me Decade that actually can make you think. This is one I\'d love to see on the big screen, because even in a widescreen presentation, I don\'t think the overall scope of this film would receive it

## Regular Expression for preprocessing

In [4]:
import string

# Characters that the standardization step strips from every review.
punctuation = string.punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
import re
# Hàm regex text

def custom_standardization(input_data):
    """Lowercase text, remove HTML line-break tags, and strip punctuation.

    Args:
        input_data: A string or string tensor accepted by ``tf.strings.lower``.

    Returns:
        A string tensor with the text lowercased, every ``<br>`` variant
        replaced by a single space, and every character listed in
        ``punctuation`` removed.
    """
    lowercase = tf.strings.lower(input_data)
    # Matches <br>, <br/>, <br /> and spaced-out variants such as "< br / >".
    pattern = r"<\s*br\s*/?\s*>"
    # Replace the tag with a space, not an empty string: reviews often contain
    # "word.<br /><br />Next", and deleting the tag outright would fuse the
    # neighbouring words into one token ("wordnext") once punctuation is gone.
    stripped_html = tf.strings.regex_replace(lowercase, pattern, " ")
    # Remove punctuation; re.escape keeps characters such as ']' and '\\'
    # literal inside the character class.
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(punctuation)}]", "")


In [6]:
# Sanity-check the standardization function on a sample review that contains
# mixed case, punctuation, and spaced-out "< br / >" tags.
test_sentence = "I must confess to not having read the original M R James story although I have read many of his other supernatural tales. I've also seen most of the previous BBC Christmas Ghost Stories and this one, in my opinion, surpasses most of them, only equalling The Signalman. < br / > < br / >I can't really fault A View From a Hill - the direction and 'mood' is perfect, as is the acting, lighting and, of course, the story and writing. I thoroughly enjoyed this and can only hope for more of this quality from the same director and production team. I understand that the BBC plan to make some more (not necessarily based on M R James stories) so that's promising. < br / > < br / >10/10"
custom_standardization(test_sentence).numpy()


b'i must confess to not having read the original m r james story although i have read many of his other supernatural tales ive also seen most of the previous bbc christmas ghost stories and this one in my opinion surpasses most of them only equalling the signalman  i cant really fault a view from a hill  the direction and mood is perfect as is the acting lighting and of course the story and writing i thoroughly enjoyed this and can only hope for more of this quality from the same director and production team i understand that the bbc plan to make some more not necessarily based on m r james stories so thats promising  1010'

## Thích ứng layer TextVectorization

In [7]:
from keras.layers import TextVectorization
# Vocabulary size: upper bound on the number of tokens the layer keeps.
VOCAL_SIZE = 20000
# Length of every vectorized sequence produced by the layer.
# NOTE(review): 5000 looks very long for movie reviews and makes every batch
# 5000 tokens wide — confirm it is intentional given the slow CPU training.
SEQUENCE_LENGTH = 5000
# Size of each embedding vector used by the model's Embedding layer.
EMBEDD_DIM = 64

# Text -> integer-sequence layer, using the custom regex standardization.
vectorize_layer = TextVectorization(
    max_tokens=VOCAL_SIZE,
    standardize=custom_standardization,
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode="int",
)


In [8]:
# adapt() only needs the review text, so drop the label from each
# (text, label) pair of the training set.
text_ds = raw_train_ds.map(lambda text, _label: text)

# Fit the vectorization layer on the training text.
vectorize_layer.adapt(text_ds)

In [9]:
# Inspect the adapted vocabulary. Only a short preview is printed: dumping
# every token and the full lookup table floods the notebook output.
VOCAL_PREVIEW_COUNT = 20

vocal = vectorize_layer.get_vocabulary()
print("Vocal: ", vocal[:VOCAL_PREVIEW_COUNT])
print("Vocal Length: ", len(vocal))

# Token -> integer index, following the order returned by get_vocabulary().
vocabulary_to_index = {word: index for index, word in enumerate(vocal)}
print(
    "Index: ",
    {word: vocabulary_to_index[word] for word in vocal[:VOCAL_PREVIEW_COUNT]},
)


Vocal Length:  20000


In [10]:
# Vectorize one hand-written sentence containing three <br> tag variants.
demo_sentence = 'New sentence <br/> New line <br> Next line <br />'
vectorize_layer(demo_sentence)

<tf.Tensor: shape=(5000,), dtype=int64, numpy=array([ 155, 4345,  155, ...,    0,    0,    0], dtype=int64)>

## Apply Vectorization to Dataset

In [35]:
def vectorize_text(text, label):
    """Run ``text`` through ``vectorize_layer`` and pass ``label`` through unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple, suitable for ``Dataset.map``.
    """
    token_ids = vectorize_layer(text)
    return token_ids, label

# Convert every review in each split to integer token ids using the adapted
# vocabulary; each sequence is padded out to SEQUENCE_LENGTH (5000).
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [44]:
# Show the vectorized features of one training batch (labels are not printed).
for features, _labels in train_ds.take(1):
    print(features)

tf.Tensor(
[[  619   356     7 ...     0     0     0]
 [    2 11730     7 ...     0     0     0]
 [  914   201   278 ...     0     0     0]
 ...
 [   45    10     7 ...     0     0     0]
 [    1 13864     7 ...     0     0     0]
 [    4  2888     1 ...     0     0     0]], shape=(32, 5000), dtype=int64)


In [43]:
# Cache each vectorized split and prefetch it asynchronously (buffer of 10)
# for best performance on GPU.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=10)
    for split in (train_ds, val_ds, test_ds)
)

## Build Model

In [16]:
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv1D, GlobalMaxPooling1D, Dropout


In [48]:
# Variable-length sequences of token ids.
inputs = tf.keras.Input(shape=(None,))

# Look up an EMBEDD_DIM-sized vector for each token id, then apply dropout.
x = Embedding(VOCAL_SIZE, EMBEDD_DIM)(inputs)
x = Dropout(0.5)(x)

# Two identical strided Conv1D layers, followed by global max pooling over
# the sequence axis.
for conv_index in range(2):
    x = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = GlobalMaxPooling1D()(x)

# Dense hidden layer with dropout.
x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)

# Single sigmoid unit for the two-class (binary) prediction.
predictions = Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [49]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_4 (Embedding)     (None, None, 64)          1280000   
                                                                 
 dropout_6 (Dropout)         (None, None, 64)          0         
                                                                 
 conv1d_7 (Conv1D)           (None, None, 128)         57472     
                                                                 
 conv1d_8 (Conv1D)           (None, None, 128)         114816    
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                           

In [50]:
# No GPU is available, so training takes a long time.
# Train for 3 epochs, evaluating on the validation split after each epoch.
model.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c2013ad090>