In [4]:
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub # Open repository and library for reusing trained models
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly()) # Dynamic graph just like Pytorch
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

Version:  2.9.1
Eager mode:  True
Hub version:  0.12.0
GPU is NOT AVAILABLE


In [24]:
# Bag of the words

# words = input('Enter the word :').lower().split(' ')
# print(words)
vocab = dict()

def bagofwords(words):
    words = words.lower().split(' ')
    i = 1
    bag = dict()
    for word in words:
        if word not in vocab.keys():
            vocab[word] = i
            i+=1
    for word in words:
        bag[vocab[word]] = words.count(word)
            
    return bag

print(bagofwords("I thought the movie was going to be bad but it was actually amazing"))
print(bagofwords("I thought the movie was going to be amazing but it was actually bad"))


{1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1}
{1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 8: 1, 13: 1, 10: 1, 11: 1, 12: 1, 9: 1}


In [1]:
# Integer Encoding - this keeps the order of the words intact 
vocab = dict()

def integerencoding(words):
    words = words.lower().split(' ')
    i = 1
    bag = []
    for word in words:
        if word not in vocab.keys():
            vocab[word] = i
            i+=1
    for word in words:
        bag.append(vocab[word])
            
    return bag

print(integerencoding("I thought the movie was going to be bad but it was actually amazing"))
print(integerencoding("I thought the movie was going to be amazing but it was actually bad"))

# Word embedding - to keep the track of order aswell as labelling similar word as same
# One hot encoding - Each word is represented by vector of size equal to dictionary and contains more sparse and doesn't take in account relationship

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5, 12, 13]
[1, 2, 3, 4, 5, 6, 7, 8, 13, 10, 11, 5, 12, 9]


In [13]:
from keras.datasets import imdb
from keras_preprocessing.sequence import pad_sequences
import keras
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584 # This much words will be only kept which are encountered during training, more repeated words are given preference

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)


In [14]:
train_data[1]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 10156,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 11596,
 349,
 2637,
 148,
 605,
 15358,
 8003,
 15,
 123,
 125,
 68,
 23141,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 36893,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 25249,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 46151,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 

In [15]:
train_data = pad_sequences(train_data, MAXLEN, padding = 'post') #Removes data from end and make all the tensors equals
test_data = pad_sequences(test_data, MAXLEN, padding = 'post')

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32), # Embedding allows similar words with similar encoding input_lenght = 250, 32 is the single word will be represented by vector of size 32
    tf.keras.layers.LSTM(32), # output of (batch_size, 32)
    tf.keras.layers.Dense(1, activation="sigmoid")
])


In [27]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 32)          2834688   
                                                                 
 lstm_10 (LSTM)              (None, 32)                8320      
                                                                 
 dense_10 (Dense)            (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.49467000365257263, 0.8303200006484985]
