In [1]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt

from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense

from keras.callbacks import History 

from keras import optimizers
from keras.datasets import imdb

In [9]:
# Load the IMDB dataset, which imdb.load_data returns as a (train, test)
# pair of (samples, labels) tuples. num_words=5000 is the vocabulary cap;
# the one-hot encoding and the model input further down use the same 5000.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=5000)

# Report how many examples each split contains.
print("Numero di esempi nel train set: %d" % len(X_train))
print("Numero di esempi nel test set: %d" % len(X_test))

Numero di esempi nel train set: 25000
Numero di esempi nel test set: 25000


In [10]:
# Encoding of the reviews: one fixed-length 0/1 vector per review
def onehot_encoding(data, size):
    """Encode index sequences as rows of a 0/1 matrix.

    Parameters
    ----------
    data : sequence
        One entry per sample; each entry holds the column indices to switch
        on for that sample.
    size : int
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), size)`` filled with zeros, except for a
        1.0 at every ``[row, index]`` named by ``data``. An index that occurs
        several times in a row still yields a single 1.0.
    """
    shape = (len(data), size)
    encoded = np.zeros(shape)
    for row, active_columns in enumerate(data):
        # Fancy indexing sets all listed columns of this row at once.
        encoded[row, active_columns] = 1.0
    return encoded

In [11]:
# Encode the train and test splits over the same 5000-column vocabulary.
X_train_oh, X_test_oh = (
    onehot_encoding(split, 5000) for split in (X_train, X_test)
)

# Show the shape of the encoded train matrix.
X_train_oh.shape


(25000, 5000)

In [12]:
# Implementation of the model: a stack of fully connected ReLU layers that
# narrows from 512 down to 8 units, followed by a single sigmoid output unit.
# The input width (5000) matches the column count of the encoded reviews.
model = Sequential([
    Dense(512, activation='relu', input_shape=(5000,)),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [13]:
# Configure training: Adamax optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adamax',
    metrics=['accuracy'],
)

In [14]:
# Train on the encoded train split: 10 epochs, mini-batches of 512 samples.
model.fit(
    x=X_train_oh,
    y=y_train,
    batch_size=512,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1335b7fc430>

In [None]:
# Implementation of dropout and L2 regularization to reduce overfitting.
# NOTE: this cell rebinds `model`, so any cell run afterwards predicts with
# this regularized network instead of the one trained above.
from keras.regularizers import l2
# The layer class is `Dropout`; the lowercase name `dropout` is not
# importable and made this cell fail before anything was built.
from keras.layers import Dropout

# Same layer widths as the baseline model, with an L2 penalty on every hidden
# layer's kernel and a 50% dropout after each hidden layer.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(5000,), kernel_regularizer=l2(0.1)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(8, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])
# Fit on the encoded matrix: the network expects rows of width 5000
# (input_shape above), whereas the raw X_train holds the un-encoded index
# sequences.
model.fit(X_train_oh, y_train, epochs=100, batch_size=512)

In [18]:
# Classification of a movie review from the internet
# word_index maps each vocabulary word to its integer index; preprocess()
# looks words up in it.
word_index = imdb.get_word_index()

# Sample review text to classify.
review = "what a waste of time and cash.. the movie was pointless. with no flow. no questions answered. just a waste. I never review movies but had to share how bad this was..compared to part 1- 2- and 3.... i don't know what else to say other than how misleading the commercial is.. the commercial was cut and spliced with video and audio that didn't even match what happened in the movie... you have been warned. when the movie was over.. people actually Boo'd. hopefully people will spread the word, and save others from throwing their money away. i know die-hard fans will go and give it a shot, but will be disappointed as well. Sinister was better and actually made you jump quite a few times."
from re import sub

def preprocess(review, num_words=5000, index_from=3):
    """Turn raw review text into the encoded row the model takes as input.

    The text is stripped of punctuation, lower-cased and split on whitespace.
    Each word found in the module-level ``word_index`` mapping is converted to
    the id used in the training data (its ``word_index`` value plus
    ``index_from``). Ids that fall outside the ``num_words`` columns are
    dropped, as are unknown words.

    Relies on two names defined in other cells: ``word_index`` (word -> index
    mapping) and ``onehot_encoding`` (index lists -> 0/1 matrix).

    Parameters
    ----------
    review : str
        The review text.
    num_words : int, optional
        Number of columns of the encoding; must match the vocabulary cap used
        when the training data was loaded and encoded. Defaults to 5000.
    index_from : int, optional
        Offset added to every ``word_index`` value to obtain the training id.
        Defaults to 3, the offset the original code applied.

    Returns
    -------
    The value of ``onehot_encoding([ids], num_words)``: a single encoded row
    for the review.
    """
    # Removing any punctuation
    review = sub(r'[^\w\s]', '', review)
    # Lowercase conversion, then split on any run of whitespace so that
    # newlines/tabs separate words and repeated spaces yield no empty tokens
    words = review.lower().split()

    # Collecting the ids of the known, in-vocabulary words
    review_ids = []
    for word in words:
        rank = word_index.get(word)
        if rank is None:
            # Word never seen in the training corpus
            continue
        word_id = rank + index_from
        # The bound applies to the shifted id, not the raw rank: an id equal
        # to or above num_words has no column in the encoding and would raise
        # IndexError there.
        if word_id < num_words:
            review_ids.append(word_id)

    # Performing the one hot encoding on the list of indices
    return onehot_encoding([review_ids], num_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [22]:
# Prediction: encode the review with preprocess(), run it through the model
# and keep the first row of the output (the input holds a single review).
x = preprocess(review)
y = model.predict(x)[0]
# Echo the review text, followed by blank lines.
print("REVIEW: %s" % review)
print("\n")


REVIEW: what a waste of time and cash.. the movie was pointless. with no flow. no questions answered. just a waste. I never review movies but had to share how bad this was..compared to part 1- 2- and 3.... i don't know what else to say other than how misleading the commercial is.. the commercial was cut and spliced with video and audio that didn't even match what happened in the movie... you have been warned. when the movie was over.. people actually Boo'd. hopefully people will spread the word, and save others from throwing their money away. i know die-hard fans will go and give it a shot, but will be disappointed as well. Sinister was better and actually made you jump quite a few times.




In [23]:
# Display the model output for the review: the value of the final sigmoid
# unit, so a number between 0 and 1.
# NOTE(review): presumably values near 0 mean negative sentiment — confirm
# against the label convention of the IMDB dataset.
y

array([0.00083727], dtype=float32)