In [1]:
# example from https://www.youtube.com/watch?v=6g4O5UOH304&t=3909s
# classify that whether the review thinks the movie is good or bad
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# imdb: Dataset of 25,000 movies reviews
data = keras.datasets.imdb
# use the data encoded by 10000-words tokenizer: choose 10000 most frequent words from the sentences to tokenize
(train_data, train_label), (test_data, test_label) = data.load_data(num_words=10000)

# returns a dictionary of key value pairs where 
# the key is the word in the sentence and the value is the label assigned to it.
word_index = data.get_word_index()

# reconstruct the dictionary to recover the original plaintext reviews
word_index = {k:(v+3) for k,v in word_index.items()}
# v+3 to spare the value for padding, start, unknown, and unused
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
# To recover the encoded data to plaintext
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])
plaintext = " ".join([reverse_word_index.get(i, '?') for i in test_data[0]])
print("test_data: \n", test_data[0])
print("origin review: \n", plaintext)

test_data: 
 [1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 2, 38, 32, 25, 7944, 451, 202, 14, 6, 717]
origin review: 
 <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss


In [3]:
# preprocessing data
# padding at the end of sequences that length is shorter than 300
# extraxt 300 words from the beginning of the sequences that lenths are longer than 300
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], 
                                                        padding="post", maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], 
                                                        padding="post", maxlen=256)

### Sigmoid or Logistic Activation Function
    - The main reason why we use sigmoid function is because it exists between (0 to 1). Therefore, it is especially used for models where we have to predict the probability as an output.Since probability of anything exists only between the range of 0 and 1, sigmoid is the right choice.
    - The softmax function is a more generalized logistic activation function which is used for multiclass classification.
### Tanh or hyperbolic tangent Activation Function
    - The range of the tanh function is from (-1 to 1). The tanh function is mainly used classification between two classes.
### ReLU (Rectified Linear Unit) Activation Function: Range: [ 0 to infinity)
    -The biggest advantage of ReLu is indeed non-saturation of its gradient, which greatly accelerates the convergence of stochastic gradient descent compared to the sigmoid / tanh functions (paper by Krizhevsky et al).
    -Because of the horizontal line in ReLu( for negative X ), the gradient can go towards 0.
### Leaky ReLU: range of the Leaky ReLU is (-infinity to infinity)

#### https://towardsdatascience.com/activation-functions-neural-networks-1cbd9f8d91d6
![title](activationFunction.JPG)

In [4]:
# build model
model = keras.Sequential()
# embedding layers group similar words to gether (word vectors that have small differences in degrees)
# represent each word in 1*16 word vector: [c1, c2, ..., c16] = a certain word
model.add(keras.layers.Embedding(10000,16)) # 10000 num_words = 10000 word vectors in 16 dimension
model.add(keras.layers.GlobalAveragePooling1D()) # average the value for each word vector
model.add(keras.layers.Dense(16, activation='relu'))
# sigmoid let the result between 0~1 to show the probabilities of 0:bad or 1:good
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


#### loss function: https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
1. Regression Loss Functions
    - Mean Squared Error Loss
    - Mean Squared Logarithmic Error Loss
    - Mean Absolute Error Loss
2. Binary Classification Loss Functions
    - Binary Cross-Entropy
    - Hinge Loss
    - Squared Hinge Loss
3. Multi-Class Classification Loss Functions
    - Multi-Class Cross-Entropy Loss
    - Sparse Multiclass Cross-Entropy Loss
    - Kullback Leibler Divergence Loss
    
#### The batch size is a number of samples processed before the model is updated. Epoch is the number of complete passes through the training dataset. The size of a batch must be more than or equal to one and less than or equal to the number of samples in the training dataset.

### Protocol of building a solid model: 
- splitting your data into three sets: One for training, one for validation and one for final evalution, which is the test set.
- you train on your training data and tune your model with the results of metrics (accuracy, loss etc) that you get from your validation set.
- Your model doesn't "see" your validation set and isn´t in any way trained on it, but you as the architect and master of the hyperparameters tune the model according to this data. Therefore it indirectly influences your model because it directly influences your design decisions. You nudge your model to work well with the validation data and that can possibly bring in a tilt.

#### verbose 0 (silent), 1 (progress bar) or 2 (number) you just say how do you want to 'see' the training progress for each epoch.

In [5]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# divide training data into training dataset and validation dataset
x_val, y_val = train_data[:10000], train_label[:10000]
x_train, y_train = train_data[10000:], train_label[10000:]
fitModel = model.fit(x_train,y_train, epochs=50, batch_size=512, 
                     validation_data=(x_val,y_val), verbose=0)

loss, accuracy = model.evaluate(test_data, test_label)
print("Loss: ", loss, " / Accuracy: ", accuracy)
print()

# print(test_data.shape)
prediction = model.predict(test_data)
for i in range(3):
    print()
    print("Review: \n", " ".join([reverse_word_index.get(i, '?') for i in test_data[i]]))
    print("Prediction: ", str(prediction[i]))
    print("Actual: ", str(test_label[i]))
    print()

# save trained model
model.save("Movie_Review_Model.h5")

Loss:  0.370737602519989  / Accuracy:  0.86784


Review: 
 <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [7]:
trained_model = keras.models.load_model("Movie_Review_Model.h5")

import re 
def multiple_replace(dict, text):
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

def review_encode(s):
    encoded = [1]
    for word in s:
        if word.lower() in word_index:
            encoded.append(word_index[word.lower()])
        else:
            encoded.append(2)
    return encoded

replace_dict = {",":"", ".":"", "(":"", ")":"", ":":"", "\"":""}
with open("five_star_review.txt", "r") as f:
    for line in f.readlines():
        cleaned_line = multiple_replace(replace_dict, line)
        encode = review_encode(cleaned_line)
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index["<PAD>"], 
                                                            padding="post", maxlen=256)
        predict = trained_model.predict(encode)
        print("Review: \n", line)
        print("eEncode: \n", encode)
        print(predict[0])
        

Review: 
 Fabulous movie. We saw it on the Philippines on the 20th and it made us want to leave our tropical paradise for snow covered mountains. Great to look at and a movie for the whole family from the young uns to grandparents. A must see for families. Probably not one for jaded snowflakes without kids
eEncode: 
 [[1657 1657   13 3363  963  590    2 1604 3363    2  830 2023  963    2
   241 2241  830 2023    2    6 3363 1095    2   13  830    2 1983    6
  1095  963    2 1206  590    2 1992    6 3363  830    2  830 1604    2
  2014  963    6 1964  963    2 1604 1206 1479    2  830 1479 1604 1657
    13 1148    6 2014    2 1657    6 1479    6 1095   13  590  963    2
  1209 1604 1479    2  590 3363 1604 1992    2 1148 1604 1964  963 1479
   963 1095    2 1983 1604 1206 3363  830    6   13 3363  590    2 1331
  1479  963    6  830    2  830 1604    2 2014 1604 1604 2295    2    6
   830    2    6 3363 1095    2    6    2 1983 1604 1964   13  963    2
  1209 1604 1479    2  830 2023  