In [2]:
import os
import sys
from datasets import load_dataset
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "stanfordnlp/imdb" dataset and pull out its two splits.
ds = load_dataset("stanfordnlp/imdb")
train_data, test_data = ds['train'], ds['test']

# Features come from the 'text' column, targets from the 'label' column.
X_train, y_train = train_data['text'], train_data['label']
X_test, y_test = test_data['text'], test_data['label']


In [4]:
# NOTE(review): this rebinds the imported class name `Tokenizer` to an instance,
# and X_train / X_test are overwritten below, so the cell only runs once per
# kernel session. Later cells read the instance through this name.
Tokenizer = Tokenizer(num_words=100000)
Tokenizer.fit_on_texts(X_train)

# Pad/truncate every review to the largest whitespace-split word count seen in
# the training texts.
maxlen = max(len(review.split()) for review in X_train)

# Raw strings -> integer id sequences -> fixed-length arrays.
X_train = Tokenizer.texts_to_sequences(X_train)
X_test = Tokenizer.texts_to_sequences(X_test)
X_train, X_test = (
    pad_sequences(X_train, maxlen=maxlen),
    pad_sequences(X_test, maxlen=maxlen),
)


In [5]:
# Convert the padded id arrays and the label lists to TensorFlow tensors,
# one (features, labels) pair per split.
X_train_tensor, y_train_tensor = tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)
X_test_tensor, y_test_tensor = tf.convert_to_tensor(X_test), tf.convert_to_tensor(y_test)

In [16]:
def embedding_model(embedding_dim=16, rnn_units=16, dense_units=16):
    """Build the uncompiled classifier: Embedding -> SimpleRNN -> Dense(relu) -> Dense(sigmoid).

    The vocabulary size is read from the notebook-level fitted tokenizer, which
    is bound to the name ``Tokenizer``.

    Args:
        embedding_dim: Length of each learned token vector.
        rnn_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the hidden ReLU Dense layer.

    Returns:
        A ``keras.Sequential`` model with a single sigmoid output unit. The
        caller is responsible for compiling it.
    """
    # +1 because the tokenizer's word indices start at 1 (0 is left for padding,
    # per the Keras Tokenizer docs).
    word_count = len(Tokenizer.word_index) + 1
    num_words = Tokenizer.num_words
    # num_words is None when the tokenizer was created without a cap; calling
    # min() with None would raise TypeError, so fall back to the full vocabulary.
    vocab_size = word_count if num_words is None else min(num_words, word_count)

    # No hard-coded input_length: the previous literal (2470) had to be kept in
    # sync with the padding length by hand, and the argument is deprecated in
    # Keras 3. The layer infers the sequence length from its input.
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        keras.layers.SimpleRNN(rnn_units),
        keras.layers.Dense(dense_units, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model


# NOTE(review): this rebinds `embedding_model` from the factory function to the
# model instance, so the cell cannot be re-run without re-running the cell that
# defines the function. Later cells use the model through this name.
embedding_model = embedding_model()

embedding_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the test split is passed as validation data, so the per-epoch
# val_* numbers and the final test evaluation are computed on the same examples.
embedding_model.fit(
    X_train_tensor,
    y_train_tensor,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(X_test_tensor, y_test_tensor),
)

# Persist the model as it stands after the final epoch.
embedding_model.save('imdb_embedding_model.h5')


Epoch 1/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 263ms/step - accuracy: 0.6167 - loss: 0.6373 - val_accuracy: 0.8254 - val_loss: 0.4177
Epoch 2/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 264ms/step - accuracy: 0.8698 - loss: 0.3378 - val_accuracy: 0.8382 - val_loss: 0.3787
Epoch 3/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 263ms/step - accuracy: 0.9243 - loss: 0.2157 - val_accuracy: 0.8601 - val_loss: 0.3640
Epoch 4/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 260ms/step - accuracy: 0.9605 - loss: 0.1207 - val_accuracy: 0.8446 - val_loss: 0.3954
Epoch 5/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 262ms/step - accuracy: 0.9787 - loss: 0.0735 - val_accuracy: 0.8520 - val_loss: 0.4429
Epoch 6/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 264ms/step - accuracy: 0.9918 - loss: 0.0317 - val_accuracy: 0.8472 - val_loss: 0.5336
Epoch 7/10



In [17]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# single metric ('accuracy') passed to compile().
test_metrics = embedding_model.evaluate(X_test_tensor, y_test_tensor, verbose=0)
print('accuracy:', test_metrics[1])


accuracy: 0.8452399969100952
