In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np

In [0]:
# Load the movie-review CSV into a DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount -- presumably
# Drive has to be mounted before this cell runs; confirm.
csv_path = '/content/drive/My Drive/Sentiment Review Movie data/movie_data.csv'
df = pd.read_csv(csv_path, encoding='utf-8')


In [0]:
# Separate the labels from the remaining feature column(s) WITHOUT mutating
# `df`: the previous `df.pop('sentiment')` removed the column in place, so
# running this cell a second time failed with a KeyError.
target = df['sentiment']
features = df.drop(columns=['sentiment'])

# Each dataset element is a (feature-row, label) pair; the feature row is a
# 1-D array with one entry per remaining column of `df`.
ds_raw = tf.data.Dataset.from_tensor_slices((features.values, target.values))


In [4]:
# Number of examples in the raw dataset. A dataset built from in-memory
# tensors knows its own size, so ask tf.data for it rather than iterating
# over every element just to count them.
count = int(tf.data.experimental.cardinality(ds_raw).numpy())
if count < 0:
    # Negative values mean the size is unknown or infinite; fall back to an
    # explicit pass over the data in that case.
    count = sum(1 for _ in ds_raw)
print(count)

50000


In [0]:
# Seed TensorFlow's global RNG, then shuffle once with a buffer spanning the
# whole dataset. reshuffle_each_iteration=False keeps that single ordering for
# every later pass over `ds_raw`.
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(buffer_size=count,
                        reshuffle_each_iteration=False)


In [0]:
# Carve the shuffled dataset into test / train / validation parts:
# the first 25,000 examples are the test set, the next 20,000 the training
# set, and whatever is left over becomes the validation set.
n_test = 25000
n_train = 20000

ds_raw_test = ds_raw.take(n_test)
ds_raw_train_valid = ds_raw.skip(n_test)

ds_raw_train = ds_raw_train_valid.take(n_train)
ds_raw_valid = ds_raw_train_valid.skip(n_train)

In [9]:
# Peek at two training examples: the first 20 bytes of the review text and
# the label tensor. (This code was commented out even though its output is
# still displayed below the cell, so the notebook could not reproduce it.)
for review, label in ds_raw_train.take(2):
    print(review.numpy()[0][:20])
    print(label)
    print('----')

b'okay, this movie f*c'
tf.Tensor(0, shape=(), dtype=int64)
----
b'The world is facing '
tf.Tensor(0, shape=(), dtype=int64)
----


## Thu gọn cấu trúc code (Streamlining the code structure)


In [0]:
from collections import Counter

def preprocessing_datasets(ds_raw_train,
                           ds_raw_valid,
                           ds_raw_test,
                           max_seq_length = None,
                           batch_size = 32):
    """Tokenize, integer-encode and batch the three raw datasets.

    The vocabulary is built from ``ds_raw_train`` only; the same encoder is
    then applied to the training, validation and test datasets.

    Args:
        ds_raw_train, ds_raw_valid, ds_raw_test: ``tf.data.Dataset`` objects
            yielding ``(features, label)`` pairs, where ``features[0]`` is
            the text to encode.
        max_seq_length: if not ``None``, only the LAST ``max_seq_length``
            tokens of every text are kept (both when counting tokens for the
            vocabulary and when encoding). Must be a positive integer.
        batch_size: number of examples per padded batch.

    Returns:
        ``(train_data, valid_data, test_data, n_tokens)`` where the first
        three are padded-batch datasets of ``(encoded_text, label)`` and
        ``n_tokens`` is the number of distinct tokens found in the training
        data.

    Raises:
        ValueError: if ``max_seq_length`` is given but is not positive.
    """
    # A zero or negative limit would make the `[-max_seq_length:]` slices
    # below silently keep the wrong part of every sequence (`x[-0:]` is the
    # whole list, `x[-(-5):]` drops the first five items), so reject it.
    if max_seq_length is not None and max_seq_length <= 0:
        raise ValueError(
            'max_seq_length must be a positive integer or None, got {!r}'
            .format(max_seq_length))

    # The text helpers are exposed as `tfds.deprecated.text` in newer
    # tensorflow_datasets releases and as `tfds.features.text` in older ones;
    # use whichever this installation provides.
    if hasattr(tfds, 'deprecated'):
        text_api = tfds.deprecated.text
    else:
        text_api = tfds.features.text

    ## find unique tokens
    tokenizer = text_api.Tokenizer()
    token_counts = Counter()

    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)

    print('Vocab-size: ', len(token_counts))

    ## encoding the texts
    encoder = text_api.TokenTextEncoder(token_counts)

    def encode(text_tensor, label):
        # Runs eagerly (via tf.py_function), so `.numpy()` is available here.
        text = text_tensor.numpy()[0]
        encoded_text = encoder.encode(text)
        if max_seq_length is not None:
            encoded_text = encoded_text[-max_seq_length:]
        return encoded_text, label

    def encode_map_fn(text, label):
        # Wrap the Python-side encoder so it can be used inside Dataset.map.
        return tf.py_function(encode, inp = [text, label],
                              Tout = (tf.int64, tf.int64))

    def encode_and_batch(ds_raw):
        # padded_shapes: pad the token axis (-1) to the longest sequence in
        # each batch; labels are scalars and need no padding.
        return ds_raw.map(encode_map_fn).padded_batch(
            batch_size, padded_shapes = ([-1], []))

    ## batching the datasets
    train_data = encode_and_batch(ds_raw_train)
    valid_data = encode_and_batch(ds_raw_valid)
    test_data = encode_and_batch(ds_raw_test)

    return (train_data, valid_data, test_data, len(token_counts))

In [0]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU
from tensorflow.keras.layers import Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [0]:
def build_rnn_model(embedding_dim, vocab_size,
                    recurrent_type = 'SimpleRNN',
                    n_recurrent_units = 64,
                    n_recurrent_layers = 1,
                    bidirectional = True):
    """Build an Embedding -> stacked RNN -> Dense binary classifier.

    Args:
        embedding_dim: output size of the embedding layer.
        vocab_size: input size (number of ids) of the embedding layer.
        recurrent_type: one of ``'SimpleRNN'``, ``'LSTM'`` or ``'GRU'``.
        n_recurrent_units: units in every recurrent layer.
        n_recurrent_layers: how many recurrent layers to stack.
        bidirectional: wrap every recurrent layer in ``Bidirectional``.

    Returns:
        An uncompiled ``Sequential`` model ending in a single sigmoid unit.

    Raises:
        ValueError: if ``recurrent_type`` is not one of the supported names.
    """
    # Layer-name prefix per supported recurrent type.
    name_prefixes = {'SimpleRNN': 'simpRNN',
                     'LSTM': 'LSTM',
                     'GRU': 'GRU'}

    # Validate before building anything: an unknown type used to fall through
    # the if/elif chain and fail later with an opaque UnboundLocalError.
    if recurrent_type not in name_prefixes:
        raise ValueError(
            'recurrent_type must be one of {}, got {!r}'.format(
                sorted(name_prefixes), recurrent_type))

    # Set the global TensorFlow seed before any layer is created.
    tf.random.set_seed(1)

    layer_classes = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    layer_cls = layer_classes[recurrent_type]
    name_prefix = name_prefixes[recurrent_type]

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim,
                        name = 'embed-layer'))

    for i in range(n_recurrent_layers):
        # Every layer but the last must emit the full sequence so the next
        # recurrent layer has a sequence to consume.
        return_sequences = (i < n_recurrent_layers - 1)
        recurrent_layer = layer_cls(
            n_recurrent_units,
            return_sequences = return_sequences,
            name = '{}-layer-{}'.format(name_prefix, i))
        if bidirectional:
            recurrent_layer = Bidirectional(
                recurrent_layer,
                name = 'bidir_' + recurrent_layer.name)
        model.add(recurrent_layer)

    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation = 'sigmoid'))

    return model

In [21]:
# Hyperparameters for this run.
batch_size, embedding_dim, max_seq_length = 32, 20, 100

# Encode and batch the three splits; `n` is the number of distinct tokens
# found in the training data.
train_data, valid_data, test_data, n = preprocessing_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
)

# Two ids beyond the counted tokens.
# NOTE(review): presumably one for the padding value 0 and one for
# out-of-vocabulary tokens produced by the encoder -- confirm.
vocab_size = n + 2

# Two stacked bidirectional GRU layers with 64 units each.
rnn_model = build_rnn_model(
    embedding_dim=embedding_dim,
    vocab_size=vocab_size,
    recurrent_type='GRU',
    n_recurrent_units=64,
    n_recurrent_layers=2,
    bidirectional=True,
)
rnn_model.summary()

Vocab-size:  58063
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1161300   
_________________________________________________________________
bidir_GRU-layer-0 (Bidirecti (None, None, 128)         33024     
_________________________________________________________________
bidir_GRU-layer-1 (Bidirecti (None, 128)               74496     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,277,141
Trainable params: 1,277,141
Non-trainable params: 0
_________________________________________________________________


In [0]:
# from_logits=False: the loss is told it receives probabilities rather than
# raw logits.
rnn_model.compile(
    optimizer=Adam(learning_rate=0.003),
    loss=BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [23]:
# Train for 10 epochs, evaluating on the validation set after each one.
history = rnn_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# `results[1]` is reported as the test accuracy, converted to a percentage.
results = rnn_model.evaluate(test_data)
test_accuracy = results[1]
print('Test Acc.: {:.2f}%'.format(test_accuracy * 100))

Test Acc.: 83.78%
