In [None]:
import tensorflow_datasets as tfdb

# Load the raw-text IMDB reviews together with their dataset info object.
imdb_plaintext, info_plaintext = tfdb.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

# Load the variant of the same reviews that is already encoded as subword ids.
imdb_subword, info_subword = tfdb.load(
    "imdb_reviews/subwords8k",
    with_info=True,
    as_supervised=True,
)

In [None]:
import numpy as np
# Display the feature specification carried by the plain-text dataset's info object.
info_plaintext.features 

In [None]:
# Peek at the first two training reviews; only the text half of each pair is shown.
for review_text, _label in imdb_plaintext['train'].take(2):
    print(review_text.numpy())

In [None]:
# A notebook cell only auto-displays its LAST expression, so the feature spec
# must be printed explicitly to show up ahead of the loop output below.
print(info_subword.features)

# Show the first two training elements of the subword-encoded dataset as-is.
for example in imdb_subword['train'].take(2):
    print(example)

In [None]:
# The text feature of the subword dataset carries the encoder used to produce its ids.
tokenizer_subword = info_subword.features['text'].encoder

# Turn the first two encoded training reviews back into readable text.
for token_ids, _label in imdb_subword['train'].take(2):
    print(tokenizer_subword.decode(token_ids))

In [None]:
train_data = imdb_plaintext['train']

# Collect every training review as a Python string; the label is not needed here.
training_seq = [
    review.numpy().decode('utf8')
    for review, _ in train_data
]

In [None]:
# Report the corpus size and show only a couple of reviews: printing the whole
# list would flood the cell output and bloat the saved notebook.
print(len(training_seq))
print(training_seq[:2])

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap and the placeholder token used for words outside that vocabulary.
vocab_size = 10000
oov_tok = "<OOV>"

# Use the named constant rather than repeating the literal, so the tokenizer's
# vocabulary cap cannot silently drift away from `vocab_size`.
tokenizer_context = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Build the word index from the training reviews, then encode those same reviews.
tokenizer_context.fit_on_texts(training_seq)
sequences = tokenizer_context.texts_to_sequences(training_seq)


In [None]:
# Map the first encoded review back to words to see what the word-level tokenizer kept.
tokenizer_context.sequences_to_texts(sequences[:1])

In [None]:
# Round-trip the first training review through the subword encoder.
token_string = tokenizer_subword.encode(training_seq[0])
original_string = tokenizer_subword.decode(token_string)

# Encoded ids first, then the text reconstructed from them.
print(token_string)
print(original_string)

In [None]:
sample_string = 'This is just a sample.'

# Round-trip the sample sentence through the word-level tokenizer.
tokenized_string = tokenizer_context.texts_to_sequences([sample_string])
original_string = tokenizer_context.sequences_to_texts(tokenized_string)

print('Tokenized string is {}'.format(tokenized_string))
print('The original string: {}'.format(original_string))

In [None]:
# Round-trip the same sample sentence through the subword encoder.
tokenized_string = tokenizer_subword.encode(sample_string)
original_string = tokenizer_subword.decode(tokenized_string)

print('Tokenized string is {}'.format(tokenized_string))
print('The original string: {}'.format(original_string))

# Decode each id on its own to reveal the text fragment it stands for.
for token_id in tokenized_string:
    fragment = tokenizer_subword.decode([token_id])
    print('{} ----> {}'.format(token_id, fragment))

In [None]:
# Shuffle buffer size and number of reviews per padded batch.
BUFFER_SIZE = 10000
BATCH_SIZE = 128

# From here on the model works with the subword-encoded splits.
train_data = imdb_subword['train']
test_data = imdb_subword['test']

# Only the training split is shuffled; both splits are grouped into padded batches.
train_dataset = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)

In [None]:
import tensorflow as tf
# Width of each learned token vector.
embedding_dim = 64

# Assemble the classifier layer by layer: embed the subword ids, average over
# the sequence, then a small hidden layer feeding a single sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(tokenizer_subword.vocab_size, embedding_dim))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
num_epochs = 15

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the shuffled batches and evaluate on the test batches after every epoch.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  """Plot a metric and its validation counterpart against the epoch index.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value sequences (as assigned from ``model.fit`` in this notebook).
    string: name of the metric to plot; the ``'val_'``-prefixed key is
      plotted alongside it.
  """
  val_key = 'val_' + string
  # Training curve first, validation curve second, matching the legend order.
  for series_name in (string, val_key):
    plt.plot(history.history[series_name])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

# One figure per tracked metric, each paired with its validation curve.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# The embedding is the first layer of the model; its first weight array is the
# lookup table of token vectors.
embedding_layer = model.layers[0]

embedding_weights = embedding_layer.get_weights()[0]
print(embedding_weights.shape)

# NOTE(review): this index comes from the word-level `tokenizer_context`, while
# the model above was built and trained with `tokenizer_subword` ids, so these
# indices do not line up with the rows of `embedding_weights` - confirm which
# vocabulary is intended before exporting word/vector pairs.
reverse_word_index = tokenizer_context.index_word

# Show only the first few entries; printing the full index would flood the output.
print(list(reverse_word_index.items())[:10])

# import io

# out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
# out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# for word_num in range(1, vocab_size):

#   if(word_num<8085):
#     word_name = reverse_word_index[word_num]
#     word_embedding = embedding_weights[word_num]
#     out_m.write(word_name + "\n")
#     out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")
  
#   else:
#     break

# out_v.close()
# out_m.close()


In [27]:
user_input = input("Enter a movie review: ")

# Preprocess user input: encode the text into subword ids.
user_input_encoded = tokenizer_subword.encode(user_input)

if not user_input_encoded:
    # With no tokens there is nothing for the pooling layer to average over.
    print('Please enter a non-empty movie review.')
else:
    # Predict sentiment on an explicit batch of shape (1, sequence_length).
    # A bare nested Python list is ambiguous to Keras, which reads a top-level
    # list as "one entry per model input" rather than as a batch of sequences.
    predicted_prob = model.predict(np.array([user_input_encoded]))

    # Pull out the single scalar so the threshold test is an ordinary float
    # comparison instead of the truth value of a whole array.
    probability = float(predicted_prob[0][0])

    # Interpret the prediction
    sentiment = 'positive' if probability >= 0.5 else 'negative'

    print(f'The sentiment of the movie review is {sentiment} (Probability: {probability:.4f})')

The sentiment of the movie review is positive (Probability: 1.0000)
