In [13]:
import os
import sys
from datasets import load_dataset
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [14]:
ds = load_dataset("stanfordnlp/imdb")
train_data = ds['train']
test_data = ds['test']
X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']


In [15]:
print("X_train shape:", len(X_train))

X_train shape: 25000


In [16]:
Tokenizer = Tokenizer(num_words=100000)
Tokenizer.fit_on_texts(X_train)
maxlen = max([len(x.split()) for x in X_train])
X_train = Tokenizer.texts_to_sequences(X_train)
X_test = Tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)


In [18]:
print("X_train shape:", X_train.shape)

X_train shape: (25000, 2470)


In [19]:
X_train_tensor = tf.convert_to_tensor(X_train)
y_train_tensor = tf.convert_to_tensor(y_train)
X_test_tensor = tf.convert_to_tensor(X_test)
y_test_tensor = tf.convert_to_tensor(y_test)
print("X_train shape:", X_train_tensor.shape)

X_train shape: (25000, 2470)


In [34]:
def embedding_model():
    vocab_size = min(Tokenizer.num_words,len(Tokenizer.word_index) + 1)
    embedding_dim = 1028
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=2470),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model


embedding_model = embedding_model()
embedding_model.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

embedding_model.fit(X_train_tensor, y_train_tensor, epochs=25, batch_size=128, validation_data=(X_test_tensor, y_test_tensor), verbose=1)
embedding_model.save('imdb_embedding_model.h5')


Epoch 1/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 748ms/step - accuracy: 0.5021 - loss: 0.6932 - val_accuracy: 0.5002 - val_loss: 0.6931
Epoch 2/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 789ms/step - accuracy: 0.4980 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 760ms/step - accuracy: 0.5034 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 4/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 682ms/step - accuracy: 0.5045 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 5/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 692ms/step - accuracy: 0.5052 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 6/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 688ms/step - accuracy: 0.4987 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoc

KeyboardInterrupt: 

In [7]:
print('accuracy:', embedding_model.evaluate(X_test_tensor, y_test_tensor, verbose=0)[1])


accuracy: 0.5
