In [40]:
import re
import os
import string
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
import datetime
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Input, Embedding, SimpleRNN, Dense, Bidirectional, LSTM, Dropout, GRU, Conv1D, Flatten, MultiHeadAttention, LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import Layer
from tensorflow.keras.optimizers import Adam
import gensim.downloader as api
from tensorboard.plugins import projector

In [None]:
# Load IMDB reviews as (text, label) pairs: the full train split, plus the
# test split cut in half to serve as validation (first 50%) and test (rest).
(train_ds, val_ds, test_ds) = tfds.load(
    name="imdb_reviews",
    as_supervised=True,
    split=["train", "test[:50%]", "test[50%:]"],
)

In [None]:
# Display the dataset object returned for the train split.
train_ds

In [None]:
# Peek at the first two raw (review, label) examples of the train split.
for review, label in train_ds.take(2):
  print(review)
  print(label)

In [None]:
def standardization(input_data):
  """Normalize raw review text before tokenization.

  Steps: lowercase, strip HTML tags, strip ASCII punctuation.

  Args:
    input_data: string tensor of raw review text.

  Returns:
    String tensor with the same shape as `input_data`.
  """
  lower_case = tf.strings.lower(input_data)
  # Replace tags with a space (not an empty string) so that words separated
  # only by markup, e.g. "end.<br /><br />Start", do not fuse into one token.
  no_tag = tf.strings.regex_replace(lower_case, "<[^>]+>", " ") # remove html tag from text
  output = tf.strings.regex_replace(no_tag, "[%s]"%re.escape(string.punctuation), "") # remove punctuation

  return output

In [None]:
# Hyperparameters shared by the cells below.
VOCB_SIZE = 10000  # passed as max_tokens to TextVectorization and as the Embedding input size
SEQUENCE_LENGTH = 250  # passed as output_sequence_length; also the model Input length
BATCH_SIZE = 64  # batch size applied to the train/validation datasets
EMBEDING_DIM = 300  # NOTE(review): reassigned to 64 by the model cells further down

In [None]:
# Text -> integer-id layer: applies `standardization`, keeps at most VOCB_SIZE
# tokens and emits sequences of exactly SEQUENCE_LENGTH ids.
vectorize_layer = TextVectorization(
    max_tokens=VOCB_SIZE,
    standardize=standardization,
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode="int",
)

In [None]:
# lengths = []
# words = []

# for review, label in train_ds.take(10):
#   for word in tf.strings.split(review, sep=" "):
#     if word in words:
#       pass
#     else:
#       words.append(word)
#   lengths.append(len(tf.strings.split(review, sep=" ")))

In [None]:
# Build the vocabulary from the review text only (labels are dropped).
training_data = train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(training_data)

In [None]:
# Show only the head of the vocabulary; the full list holds up to VOCB_SIZE
# tokens and would flood the cell output.
vectorize_layer.get_vocabulary()[:20]

In [None]:
# Number of tokens in the adapted vocabulary.
len(vectorize_layer.get_vocabulary())

In [None]:
# Show one raw example before vectorization.
for review, label in train_ds.take(1):
  print(review)
  print(label)

In [None]:
def vectorizer(review, label):
  """Map a (review, label) pair to (vectorize_layer(review), label)."""
  token_ids = vectorize_layer(review)
  return token_ids, label

In [None]:
# Vectorize the train and validation splits with `vectorizer`.
train_dataset = train_ds.map(vectorizer)
val_dataset = val_ds.map(vectorizer)

In [None]:
# Show one example after vectorization.
for review, label in train_dataset.take(1):
  print(review)
  print(label)

In [None]:
# Batch both datasets and prefetch with an autotuned buffer.
# NOTE(review): rebinds train_dataset/val_dataset, so re-running this cell
# alone batches the already-batched datasets again.
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Baseline: embedding -> SimpleRNN -> sigmoid classifier.
# NOTE(review): `model` is rebound by each of the following model cells.
EMBEDING_DIM = 64
model = tf.keras.Sequential(
    [
        Input(shape=(SEQUENCE_LENGTH,)),
        Embedding(input_dim=VOCB_SIZE, output_dim=EMBEDING_DIM),
        SimpleRNN(units=32),
        Dense(units=1, activation="sigmoid"),
    ]
)

model.summary()

In [None]:
# Variant: two stacked bidirectional LSTM layers with a dense head.
EMBEDING_DIM = 64
model = tf.keras.Sequential(
    [
        Input(shape=(SEQUENCE_LENGTH,)),
        Embedding(input_dim=VOCB_SIZE, output_dim=EMBEDING_DIM),
        # The first recurrent layer returns the whole sequence so that the
        # second one can consume it.
        Bidirectional(layer=LSTM(units=64, return_sequences=True)),
        Bidirectional(layer=LSTM(units=32)),
        Dense(units=64, activation="relu"),
        Dropout(rate=0.5),
        Dense(units=1, activation="sigmoid"),
    ]
)

model.summary()

In [None]:
# Variant: two stacked bidirectional GRU layers with a dense head.
EMBEDING_DIM = 64
model = tf.keras.Sequential(
    [
        Input(shape=(SEQUENCE_LENGTH,)),
        Embedding(input_dim=VOCB_SIZE, output_dim=EMBEDING_DIM),
        # The first recurrent layer returns the whole sequence so that the
        # second one can consume it.
        Bidirectional(layer=GRU(units=64, return_sequences=True)),
        Bidirectional(layer=GRU(units=32)),
        Dense(units=64, activation="relu"),
        Dropout(rate=0.5),
        Dense(units=1, activation="sigmoid"),
    ]
)

model.summary()

In [None]:
# Variant: 1D convolution over the embedded sequence, flattened into a dense head.
# NOTE(review): this cell sets EMBEDDING_DIM (two D's); the cells above use EMBEDING_DIM.
EMBEDDING_DIM = 300
model = tf.keras.Sequential(
    [
        Input(shape=(SEQUENCE_LENGTH,)),
        Embedding(input_dim=VOCB_SIZE, output_dim=EMBEDDING_DIM),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        Flatten(),
        Dense(units=32, activation='relu'),
        Dropout(rate=0.5),
        Dense(units=1, activation='sigmoid'),
    ]
)
model.summary()

In [None]:
# Callback that keeps only the weights of the epoch with the highest
# validation accuracy.
# NOTE(review): the path is an absolute '/content/...' location, and this
# callback is not included in the `callbacks` list of the model.fit call below.
checkpoint_filepath = '/content/rnn.weights.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Binary classification setup: cross-entropy loss, Adam with a 1e-4 learning
# rate, accuracy as the reported metric.
model.compile(
    optimizer=Adam(1e-4),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# One timestamped run directory *inside* logs/imdb/fit/. The separator after
# "fit" matters: without it the run is written to "logs/imdb/fit<timestamp>/",
# a sibling of the directory that `%tensorboard --logdir logs/imdb/fit/` reads.
log_dir = 'logs/imdb/fit/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '/'

In [None]:
# TensorBoard logging into log_dir, with histograms computed every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Train for 5 epochs, validating after each one and logging to TensorBoard.
history = model.fit(
    x=train_dataset,
    epochs=5,
    validation_data=val_dataset,
    callbacks=[tensorboard_callback],
)

In [None]:
# Write one "<index> <token>" label per vocabulary entry for the embedding
# projector. The vocabulary is fetched once: get_vocabulary() rebuilds the
# whole list on every call, so calling it inside the loop is quadratic.
# Iterating over the list itself also avoids an IndexError when the adapted
# vocabulary is shorter than VOCB_SIZE.
vocabulary = vectorize_layer.get_vocabulary()
with open(os.path.join(log_dir,'metadata.tsv'),"w",encoding="utf-8") as f:
  for i, token in enumerate(vocabulary):
    f.write("{} {}\n".format(i, token))

In [None]:
# Copy the first weight array of the model's first layer into a tf.Variable.
# NOTE(review): presumably layers[0] is the Embedding layer — confirm.
embedding_weights=tf.Variable(model.layers[0].get_weights()[0])
print(embedding_weights.shape)

In [None]:
# Save the embedding variable as a checkpoint inside log_dir, tracked under
# the attribute name "embedding".
checkpoint=tf.train.Checkpoint(embedding=embedding_weights)
checkpoint.save(os.path.join(log_dir,"embedding.ckpt"))

# Start a projector configuration with a single embedding entry.
config=projector.ProjectorConfig()
embedding=config.embeddings.add()

In [None]:
# Tell the projector which checkpointed tensor to display. The variable was
# saved through tf.train.Checkpoint(embedding=...), so its key in the
# checkpoint is "embedding/.ATTRIBUTES/VARIABLE_VALUE".
embedding.tensor_name="embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Labels file, relative to log_dir.
embedding.metadata_path='metadata.tsv'
projector.visualize_embeddings(log_dir,config)

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Open TensorBoard on the logs/imdb/fit/ directory.
%tensorboard --logdir logs/imdb/fit/

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title("model_loss")
plt.xlabel('epoch')
plt.ylabel('loss')
# Legend order follows the plot order above; the label was misspelled 'tain'.
plt.legend(['train', "val"], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title("model_accuracy")
plt.xlabel('epoch')
plt.ylabel('accuracy')
# Legend order follows the plot order above; the label was misspelled 'tain'.
plt.legend(['train', "val"], loc='upper left')
plt.show()

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
word2vec = api.load("word2vec-google-news-300")

In [None]:
# Shape of the pretrained vector matrix.
word2vec.vectors.shape

In [None]:
# Mapping from each word to its row index.
# NOTE(review): this displays the entire mapping; consider showing a slice.
word2vec.key_to_index

In [None]:
# Length of a single word vector.
len(word2vec["The"])

In [None]:
# Nearest neighbours of 'Man' in the pretrained space.
word2vec.most_similar('Man')

In [None]:
def first_cap(word):
  """Return `word` with its first character upper-cased.

  The rest of the string is left untouched (unlike str.capitalize, which
  lower-cases it). An empty string is returned unchanged; the previous
  `word[0]` indexing raised IndexError for it, and the vocabulary contains
  the empty padding token.
  """
  return word[:1].upper() + word[1:]

In [None]:
# Build one pretrained vector per vocabulary token, in vocabulary order.
# Lookup order: exact token, then the token with its first letter capitalized,
# then a random normal vector as a last resort.
vocabulary = vectorize_layer.get_vocabulary()  # fetched once; each call rebuilds the list
# Size the random fallback from the word2vec model itself. EMBEDING_DIM is
# reassigned to 64 by earlier model cells, which produced fallback vectors of
# a different length than the pretrained ones and therefore a ragged matrix.
embedding_size = word2vec.vector_size
pretrained_embedding = []
for i, word in enumerate(vocabulary):
  # Membership tests replace the bare `except:` clauses, which also hid
  # unrelated errors.
  if word in word2vec:
    pretrained_embedding.append(word2vec[word])
  else:
    print(word)
    # Skip capitalization for the empty padding token.
    capitalized = first_cap(word) if word else word
    if capitalized in word2vec:
      pretrained_embedding.append(word2vec[capitalized])
      print("toupper")
    else:
      print("nosloution")
      pretrained_embedding.append(random.normal(loc=0,scale=1,size=(embedding_size,)))

  # Progress marker at loop level; it used to sit inside the fallback branch
  # and so only fired for tokens missing from word2vec.
  if i%1000==0:
    print("i is=== ", i)

In [None]:
# Stack the per-token vectors into a single array.
pretrained_embedding_array = np.array(pretrained_embedding)

In [None]:
# Conv1D classifier whose embedding layer starts from the word2vec matrix and
# is fine-tuned during training (trainable=True).
EMBEDING_DIM = 300
model = tf.keras.Sequential([
    Input(shape=(SEQUENCE_LENGTH,)),
    # Initialize from the stacked array built for this purpose rather than
    # from the Python list of per-token vectors.
    Embedding(VOCB_SIZE, EMBEDING_DIM, embeddings_initializer=tf.keras.initializers.Constant(pretrained_embedding_array), trainable=True),

    Conv1D(32, 3, activation="relu"),
    Flatten(),

    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

model.summary()

In [None]:
# Same training setup as before: binary cross-entropy, Adam(1e-4), accuracy.
model.compile(
    optimizer=Adam(1e-4),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Train the word2vec-initialized model for 10 epochs with per-epoch validation.
history = model.fit(x=train_dataset, epochs=10, validation_data=val_dataset)

In [None]:
# Two hand-written reviews, each wrapped in its own one-element list, used as
# ad-hoc prediction inputs.
test_data=tf.data.Dataset.from_tensor_slices([["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! "],
                                              ["very good start, but movie started becoming interesting at some point and fortunately at some point it started becoming much more fun, though there was too much background noise, so in all i liked this movie "],])


In [None]:
def vectorizer_test(review):
    """Vectorize unlabeled review text with the adapted vectorize_layer."""
    token_ids = vectorize_layer(review)
    return token_ids
test_dataset=test_data.map(vectorizer_test)

In [None]:
# Predict on the two vectorized sample reviews.
model.predict(test_dataset)

In [None]:
# Wrap vectorization and the trained classifier into one model that accepts
# raw strings directly.
inputs = Input(shape=(1,), dtype="string")
vectorized_inputs=vectorize_layer(inputs)
outputs = model(vectorized_inputs)
inference_ready_model = tf.keras.Model(inputs, outputs)
inference_ready_model.summary()

In [None]:
# Predict on the same two sample reviews, passed as raw strings.
inference_ready_model.predict(["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! ",
                               "very good start, but movie started becoming interesting at some point and fortunately at some point it started becoming much more fun, though there was too much background noise, so in all i liked this movie "])

In [None]:
def positional_encoding(model_size,SEQUENCE_LENGTH):
  """Build the sinusoidal positional-encoding table.

  For position `pos` and dimension `i`:
    even i: sin(pos / 10000**(i / model_size))
    odd  i: cos(pos / 10000**((i - 1) / model_size))

  The table is computed with vectorized numpy operations instead of nested
  Python loops, so the per-dimension frequencies are computed once rather
  than once per position.

  Args:
    model_size: number of encoding dimensions (size of the last axis).
    SEQUENCE_LENGTH: number of positions to encode.

  Returns:
    float32 tensor of shape (1, SEQUENCE_LENGTH, model_size).
  """
  positions = np.arange(SEQUENCE_LENGTH, dtype=np.float64)[:, np.newaxis]
  dims = np.arange(model_size)
  # Each (sin, cos) pair shares the exponent of its even index.
  paired_dims = dims - (dims % 2)
  angles = positions / np.power(10000.0, paired_dims / model_size)
  table = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))
  # Leading axis of size 1 lets the table broadcast over the batch axis.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
class Embeddings(Layer):
  """Token embedding plus fixed sinusoidal positional encoding.

  Args:
    sequence_length: number of positions the positional encoding covers.
    vocab_size: input dimension of the token Embedding layer.
    embed_dim: output dimension of the token Embedding layer and of the
      positional encoding.
    **kwargs: forwarded to the base Layer (e.g. name, dtype). Accepting them
      lets the dictionary returned by get_config() rebuild the layer, since
      that dictionary also contains the base-layer keys.
  """

  def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
    super(Embeddings, self).__init__(**kwargs)
    self.token_embeddings=Embedding(
        input_dim=vocab_size, output_dim=embed_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim

  def call(self, inputs):
    """Return the token embeddings of `inputs` plus the positional table."""
    embedded_tokens = self.token_embeddings(inputs)
    # The table has a leading axis of size 1 and broadcasts over the batch.
    embedded_positions=positional_encoding(
        self.embed_dim,self.sequence_length)
    return embedded_tokens + embedded_positions

  def compute_mask(self, inputs, mask=None):
    """Mask is True wherever the token id differs from 0."""
    return tf.math.not_equal(inputs, 0)

  def get_config(self):
      """Return the base config extended with this layer's constructor arguments."""
      config = super().get_config()
      config.update({
        "sequence_length": self.sequence_length,
        "vocab_size": self.vocab_size,
        "embed_dim": self.embed_dim,
      })
      return config


In [None]:
class TransformerEncoder(Layer):
    """One Transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward projection;
    each sub-block is wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: output units of the feed-forward projection; also passed as
            key_dim to the attention layer.
        dense_dim: hidden units of the feed-forward projection.
        num_heads: number of attention heads.
        **kwargs: forwarded to the base Layer (e.g. name, dtype). Accepting
            them lets the dictionary returned by get_config() rebuild the
            layer, since that dictionary also contains the base-layer keys.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim,
        )
        self.dense_proj=tf.keras.Sequential(
            [Dense(dense_dim, activation="relu"),Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply the encoder block to `inputs`.

        Args:
            inputs: tensor to encode.
            mask: optional boolean mask over (batch, position). When omitted,
                attention runs unmasked.
        """
        # Default to no attention mask. Previously this name was only bound
        # inside the `if`, so a call without a mask raised UnboundLocalError.
        padding_mask = None
        if mask is not None:
            # Outer AND of the mask with itself: a (query, key) pair is kept
            # only when both positions are unmasked.
            mask1 = mask[:, :, tf.newaxis]
            mask2 = mask[:,tf.newaxis, :]
            padding_mask = tf.cast(mask1&mask2, dtype="int32")

        attention_output = self.attention(
            query=inputs, key=inputs,value=inputs,attention_mask=padding_mask
        )

        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the base config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [None]:
# Hyperparameters for the transformer classifier built below.
EMBEDDING_DIM=128  # embedding size passed to Embeddings and TransformerEncoder
D_FF=1024  # hidden units of the encoder's feed-forward projection
NUM_HEADS=8  # attention heads per encoder block
NUM_LAYERS=1  # number of stacked encoder blocks
NUM_EPOCHS=20  # NOTE(review): the transformer.fit call below hard-codes epochs=10 instead

In [None]:
# Transformer classifier: embeddings -> NUM_LAYERS encoder blocks -> flatten -> sigmoid.
# The input length is declared as SEQUENCE_LENGTH rather than None: the
# positional encoding added inside Embeddings has exactly SEQUENCE_LENGTH
# positions, and the Flatten -> Dense head needs a static length to size its
# weights, so other lengths were never actually supported.
encoder_input=Input(shape=(SEQUENCE_LENGTH,), dtype="int64", name="input")
x = Embeddings(SEQUENCE_LENGTH,VOCB_SIZE,EMBEDDING_DIM)(encoder_input)

for _ in range(NUM_LAYERS):
  x=TransformerEncoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x)

x = Flatten()(x)
output=Dense(1, activation="sigmoid")(x)

transformer = tf.keras.Model(
    encoder_input, output, name="transformer"
)
transformer.summary()

In [None]:
# Same training setup as the other models: binary cross-entropy, Adam(1e-4), accuracy.
transformer.compile(
    optimizer=Adam(1e-4),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Train the transformer for 10 epochs with per-epoch validation.
# NOTE(review): NUM_EPOCHS is defined in the hyperparameter cell but not used here.
history = transformer.fit(x=train_dataset, epochs=10, validation_data=val_dataset)