In [1]:
import tensorflow as tf 
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np 
import io

# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.4.0-rc0


In [2]:
# Confirm eager mode is active; the loops below call `.numpy()` on tensors, which requires it.
tf.executing_eagerly() # on TF 1.x call `tf.enable_eager_execution()` first

True

In [3]:
# Load the IMDB reviews dataset. `as_supervised=True` yields (text, label)
# pairs, and `with_info=True` additionally returns the dataset metadata.
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

In [5]:
# Peek at the first five names from the full list of datasets known to TFDS.
tfds.list_builders()[:5]

['abstract_reasoning', 'accentdb', 'aeslc', 'aflw2k3d', 'ag_news_subset']

In [7]:
train_data, test_data = imdb['train'], imdb['test'] # 25k train and 25k testing


def _to_python_lists(dataset):
    """Materialise a (text, label) dataset into two parallel Python lists.

    Each text tensor is converted with `.numpy()` and decoded from UTF-8
    bytes to `str`; each label is stored as whatever `.numpy()` returns.
    """
    texts, labels = [], []
    for text_tensor, label_tensor in dataset:
        texts.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return texts, labels


training_sentences, training_labels = _to_python_lists(train_data)
testing_sentences, testing_labels = _to_python_lists(test_data)

In [8]:
# Show one raw review together with its sentiment label.
sample_idx = 1
print(training_sentences[sample_idx])
print(">> label", training_labels[sample_idx]) # 0 negative, 1 pos

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.
>> label 0


In [9]:
# Sanity check: sentences and labels must have matching sizes in each split.
for collection in (training_sentences, training_labels, testing_sentences, testing_labels):
    print(len(collection))

25000
25000
25000
25000


In [10]:
# Keras `fit` is fed NumPy arrays rather than Python lists, so convert both label lists.
training_labels_final, testing_labels_final = map(
    np.array, (training_labels, testing_labels)
)

In [11]:
# One label per training review, stored as a 1-D array.
training_labels_final.shape

(25000,)

In [12]:
# Preparing data for training by tokenizing

vocab_size = 10000    # only the most frequent words keep their own id in the sequences
embedding_dim = 16
max_length = 120
# Reviews longer than max_length lose tokens from the END ('post').
# Padding is left at the pad_sequences default ('pre'), so shorter reviews
# get their zeros at the FRONT: [0, 0, 0, ..., 4, 4, 5, 6].
trunc_type = 'post'
oov_tok = "<OOV>"     # out of vocabulary

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
# word -> id for EVERY word seen while fitting (far more than vocab_size);
# the num_words limit is only applied when texts are converted to sequences.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)  # sentences as lists of token ids
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)  # make all sentences the same size

# The testing set must be preprocessed exactly like the training set: reuse the
# fitted tokenizer AND the same truncation side, otherwise long test reviews
# would be cut from the front while training reviews are cut from the end.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

padded.shape

(25000, 120)

In [89]:
# Invert the tokenizer vocabulary: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids missing from `reverse_word_index` (for example the padding value 0)
    are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Decoded padded sequence, a blank line, then the original review for comparison.
print(decode_review(padded[1]), training_sentences[1], sep="\n\n")

? ? ? ? ? ? ? ? i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what was causing them or why i admit i may have missed part of the film but i watched the majority of it and everything just seemed to happen of its own <OOV> without any real concern for anything else i cant recommend this film at all

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of wha

In [64]:
# Follow one review through the pipeline: raw text -> token ids -> padded/truncated ids.
print(training_sentences[1])
print(">> original length", len(training_sentences[1]))  # length in characters
print(">> label", training_labels[1])

print()
print(sequences[1])
print(">> sequence length", len(sequences[1]))  # length in tokens, before padding/truncation
print()
print(padded[1])
padded[1].shape

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.
>> original length 617
>> label 0

[11, 26, 75, 571, 6, 805, 2354, 313, 106, 19, 12, 7, 629, 686, 6, 4, 2219, 5, 181, 584, 64, 1454, 110, 2263, 3, 3951, 21, 2, 1, 3, 258, 41, 4677, 4, 174, 188, 21, 12, 4078, 11, 1578, 2354, 86, 2, 20, 14, 1907, 2, 112, 940, 14, 1811, 1340, 548, 3, 355, 181, 466, 6, 591, 19, 17, 55, 1817, 5, 49, 14, 4044, 96, 40, 136, 11, 972, 11, 201, 26, 1046, 1

(120,)

In [56]:
# word_index holds every distinct word seen by fit_on_texts (roughly 90,000 here
# per an earlier len() check), not just vocab_size of them, so an index far
# beyond 10000 is still valid.
list(word_index)[57565]
# The num_words=n limit only takes effect when texts are converted to sequences:
# the most common words keep their ids and the rest are mapped to the OOV token.

'bintang'

In [71]:
# Small sentiment classifier: embedding lookup -> flatten -> hidden layer -> probability.
sentiment_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),  # alternative: GlobalAveragePooling1D()
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),  # single unit: probability of a positive review
]
model = tf.keras.Sequential(sentiment_layers)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [106]:
# Train the model, validating on the held-out test split after every epoch.
num_epochs = 10
model.fit(
    padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29b991ee0>

In [76]:
# Inspect the model's layer objects in order (embedding, flatten, two dense layers).
e = model.layers
e

[<tensorflow.python.keras.layers.embeddings.Embedding at 0x29bb18880>,
 <tensorflow.python.keras.layers.core.Flatten at 0x29bb18460>,
 <tensorflow.python.keras.layers.core.Dense at 0x29bb18070>,
 <tensorflow.python.keras.layers.core.Dense at 0x29bb20370>]

In [79]:
# Layer 0 is the Embedding layer; the first array it returns is the embedding
# matrix with one row per token id.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [87]:
# Row i of the matrix is the embedding vector of token id i; id 1 is the "<OOV>" token.
weights[1] # each word has its own weight vector of length embedding_dim

array([-0.08942658,  0.00486923, -0.05935808, -0.06226563, -0.04867279,
        0.04237117,  0.04769849,  0.03356505, -0.03730453,  0.00785854,
        0.03105144,  0.0776749 ,  0.05284716,  0.025134  , -0.03554538,
       -0.04298926], dtype=float32)

In [86]:
# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one line per word, the embedding values separated by tabs
#   meta.tsv - one line per word, the word itself (same order as vecs.tsv)
# Token id 0 is skipped because it is the padding value and has no word.

# Stop at the last id that actually has a word, so a vocabulary smaller than
# vocab_size does not raise KeyError on reverse_word_index.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are flushed and closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]

        # Echo the first few entries as a quick visual check.
        if word_num < 5:
            print(f">> word {word_num}", word)
            print(">> embeddings", embeddings)

        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

>> word 1 <OOV>
>> embeddings [-0.08942658  0.00486923 -0.05935808 -0.06226563 -0.04867279  0.04237117
  0.04769849  0.03356505 -0.03730453  0.00785854  0.03105144  0.0776749
  0.05284716  0.025134   -0.03554538 -0.04298926]
>> word 2 the
>> embeddings [-0.08670148  0.01641071 -0.02393427 -0.07146466  0.01603186  0.06126428
  0.06148115  0.00766911  0.04187395  0.05556076  0.01930173  0.0744463
  0.01907398  0.01339489  0.00941497 -0.0138381 ]
>> word 3 and
>> embeddings [ 0.01113727 -0.03538265 -0.05725451 -0.01636735 -0.00596739 -0.00635358
  0.03053617  0.05559737  0.0871934   0.04494542  0.02274616  0.07229666
  0.01994341  0.01223046 -0.05789011 -0.04256919]
>> word 4 a
>> embeddings [-0.05104827 -0.01813413 -0.04630557 -0.02343593 -0.03323779  0.06510878
 -0.00737528  0.02424134  0.0825871   0.00570629 -0.01472468  0.12047923
  0.01702527 -0.04734353 -0.05681538 -0.06954415]


In [99]:
# Report the default GPU device, if TensorFlow can see one.
# gpu_device_name() returns an empty string when no GPU is available.
gpu_name = tf.test.gpu_device_name()  # query once and reuse the result
if gpu_name:
    # The original format string had no `{}` placeholder, so the device name was never printed.
    print('Default GPU Device: {}'.format(gpu_name))
else:
    print("Please install GPU version of TF")

Please install GPU version of TF


In [102]:
# GPUs visible to TensorFlow; an empty list means none were detected.
print(tf.config.list_physical_devices('GPU'))
# All physical devices of any type (displayed as the cell result).
tf.config.list_physical_devices()

[]


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]