- ### Import Libraries and APIs

In [1]:
# import the required libraries and APIs
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




  from .autonotebook import tqdm as notebook_tqdm


- ### Downloading the TensorFlow `imdb_reviews` dataset

In [2]:
# Fetch the IMDB reviews dataset through TensorFlow Datasets.
# `as_supervised=True` makes every example a (text, label) pair, and
# `with_info=True` additionally returns the dataset metadata object.
data, info = tfds.load(
    name="imdb_reviews",
    as_supervised=True,
    with_info=True,
)

- ### Segregating training and testing sets

In [3]:
# Pull the two predefined splits out of the loaded dataset dictionary.
train_data = data['train']
test_data = data['test']

# Empty containers that will receive the review texts ...
train_sentences, test_sentences = [], []

# ... and their labels, one list per split.
train_labels, test_labels = [], []

In [4]:
def extract_sentences_and_labels(dataset):
    """Collect the texts and labels of a supervised dataset into two lists.

    Args:
        dataset: iterable yielding ``(sentence, label)`` pairs whose items
            expose ``.numpy()``; the sentence value must be UTF-8 bytes.

    Returns:
        A ``(sentences, labels)`` tuple: a list of decoded ``str`` texts and
        a list with the corresponding ``label.numpy()`` values.
    """
    sentences, labels = [], []
    for sent, label in dataset:
        sentences.append(sent.numpy().decode('utf8'))
        labels.append(label.numpy())
    return sentences, labels


# Build the lists from scratch on every run instead of appending to lists
# created in an earlier cell: re-running this cell then no longer duplicates
# the data, and it still works after the labels were turned into arrays.
train_sentences, train_labels = extract_sentences_and_labels(train_data)
test_sentences, test_labels = extract_sentences_and_labels(test_data)

In [5]:
# Turn both label containers into NumPy arrays in one step.
train_labels, test_labels = (
    np.array(labels) for labels in (train_labels, test_labels)
)

- ### Data preparation `&` setting up the tokenizer

In [6]:
# Settings shared by the tokenizer, the padding step and the model.

# -- tokenizer --
vocab_size = 10000   # passed to the tokenizer as `num_words`
oov_tok = "<oov>"    # placeholder token for out-of-vocabulary words

# -- padding / truncation --
max_length = 150     # length every sequence is padded or cut to
trunc_type = "post"  # over-long sequences are cut at the end

# -- model --
embedding_dim = 16   # size of each learned word vector

In [7]:
# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_options = dict(maxlen=max_length, truncating=trunc_type)

# training split: texts -> integer sequences -> fixed-length matrix
train_seqs = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_seqs, **pad_options)

# test split: encoded with the tokenizer fitted on the training texts
test_seqs = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_seqs, **pad_options)

In [8]:
# Invert the tokenizer vocabulary: integer index -> word.
reverse_word_index = {idx: word for word, idx in word_index.items()}


def decode_review(text):
    """Turn a sequence of word indices back into a space-separated string.

    Indices that have no entry in ``reverse_word_index`` are shown as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


# Show one review in its three forms: raw text, padded indices, decoded text.
sample_idx = 1
print(train_sentences[sample_idx])
print(train_padded[sample_idx])
print(decode_review(train_padded[sample_idx]))

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0   11   26   75  571
    6  805 2354  313  106   19   12    7  629  686    6    4 2219    5
  181  584   64 1454  110 2263    3 3951   21    2    1    3  258   41
 4677    4  174  188   21  


<h3 style="color:orange; font-weight:bold" > - Define the Neural Network with Embedding layer </h3>

1. Use the Sequential API
2. Add an embedding input layer of input size equal to vocabulary size.
3. Add a flatten layer, and two dense layers.

In [9]:
# Seed Python, NumPy and TensorFlow before any weights are created so that
# the initial embedding values and the subsequent training run are repeatable
# after a kernel restart (exact repeatability may still depend on the
# hardware/software setup).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Embedding -> Flatten -> small hidden Dense layer -> sigmoid output unit.
model = tf.keras.Sequential([
    # one `embedding_dim`-sized vector per vocabulary index
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # concatenate the per-token vectors into a single feature vector
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    # single probability output for the binary label
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile the model with loss function, optimizer and metrics
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()













Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 2400)              0         
                                                                 
 dense (Dense)               (None, 6)                 14406     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 174413 (681.30 KB)
Trainable params: 174413 (681.30 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Weights of the first layer as they are right now; get_weights() returns a
# list, of which the first entry is kept and printed.
first_layer = model.layers[0]
first_layer_weights = first_layer.get_weights()[0]
print(first_layer_weights)

[[ 0.04799053 -0.01425347 -0.03507526 ...  0.02928877  0.02433933
  -0.00768876]
 [ 0.03385986  0.04334049 -0.04110943 ... -0.02994746  0.01894024
  -0.00337299]
 [ 0.03901422  0.02907815 -0.00890483 ... -0.01555584  0.01566258
   0.0306213 ]
 ...
 [ 0.01779375 -0.02965666  0.04183738 ...  0.01348314  0.01223607
   0.01033864]
 [-0.0099184   0.00027473 -0.01601888 ... -0.02756344 -0.00509614
   0.0173244 ]
 [-0.03014993  0.03659819  0.01048886 ...  0.04039791 -0.02833651
   0.00141752]]


<h3 style="color:orange; font-weight:bold" > - Model Training </h3>

In [11]:
# number of passes over the training data
num_epochs = 10

# train on the training split; the test split is used as validation data
model.fit(
    x=train_padded,
    y=train_labels,
    epochs=num_epochs,
    validation_data=(test_padded, test_labels),
)

Epoch 1/10














Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22f73e9df50>

<h3 style="color:orange; font-weight:bold" > - Deriving weights from the embedding layer </h3>

In [12]:
# the embedding layer is the first layer of the model
l1 = model.layers[0]

# get_weights() returns a list; its first entry is the embedding matrix
layer_weight_list = l1.get_weights()
weights = layer_weight_list[0]

# show the matrix shape followed by its values
print(weights.shape, weights, sep="\n")

(10000, 16)
[[ 0.03669327 -0.01652714 -0.00586961 ... -0.00137452 -0.01815873
  -0.02270113]
 [-0.00842062  0.03190735 -0.03573254 ... -0.07154389 -0.04187592
  -0.03279363]
 [ 0.06009711  0.01284993 -0.01011654 ... -0.02180055  0.06905068
   0.02049881]
 ...
 [ 0.04614146  0.00052335 -0.05625988 ...  0.08298224 -0.00501019
   0.131492  ]
 [ 0.07467708 -0.04374067 -0.09197625 ... -0.01386266  0.16538572
   0.1335269 ]
 [ 0.02015016 -0.1445324  -0.09643715 ...  0.10487367 -0.05435461
   0.07187508]]


<h3 style="color:orange; font-weight:bold" > Downloading the vectors and metadata </h3>

In [14]:
# import I/O module in python
import io

# Word indices start at 1 (index 0 has no word and is skipped). Stop at the
# smaller of the number of rows in the embedding matrix (vocab_size) and the
# number of words the tokenizer actually knows, so a small corpus cannot
# trigger a KeyError.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# `with` closes both files even if writing fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open('meta.tsv', 'w', encoding='utf-8') as meta:
    # write each word and its corresponding embedding, one per line;
    # line i of meta.tsv belongs to line i of vectors.tsv
    for index in range(1, last_index):
        word = reverse_word_index[index]  # index -> word lookup
        embeddings = weights[index]
        meta.write(word + "\n")
        vectors.write('\t'.join(str(x) for x in embeddings) + "\n")