In [51]:
import json
import re
import string
import numpy as np
import tensorflow as tf 
import tensorflow.keras.preprocessing.text as kpt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import model_from_json
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Model, Sequential 
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, Dropout

In [2]:
# we're still going to use a Tokenizer here, but we don't need to fit it
tokenizer = Tokenizer(num_words=3000)
# for human-friendly printing
labels = ['negative', 'positive']

# read in our saved dictionary
data = pd.read_csv("IMDB Dataset.csv")



In [3]:
# this utility makes sure that all the words in your input
# are registered in the dictionary
# before trying to turn them into a matrix.
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." %(word))
    return wordIndices



In [18]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [26]:
le = LabelEncoder()
sentiment_1 = le.fit_transform(data["sentiment"])
data["sentiment_2"] = sentiment_1

In [28]:
X = data.review
y = data.sentiment_2

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [34]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation), '')

custom_standardization(input_data=X_train)
custom_standardization(input_data=X_test)

<tf.Tensor: shape=(10000,), dtype=string, numpy=
array([b'this is in my opinion much better than either of the 2 1990s versions but is still not all that good it feels dated probably because it is but it does stand up well compared to other bbc 1980s period pieces such as mansfield park and northanger abbey  the length of this adaptation allows for a much better adaptation of the book than either of the 2 90s versions and st john rivers is at least covered although not very well timothy dalton is very good as rochester but the actress playing jane is much too old there is definitely scope for a tv adaptation of this length that has more than a tenner spent on it',
       b'paranormal state is an interesting show for most paranormal believers i enjoy watching what the team has to say and what they find however i know that the entire show along with its build ups and story lines are completely set up they go to real haunted locations and i suspect that they speak with actual witnesses i 

In [35]:
max_words = 1000
max_sentence = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
dictionary = tokenizer.word_index

In [36]:
with open("dictionary.json", "w+") as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [45]:
def convert_text(text):
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

In [46]:
indices = []
for text in X_train:
    wordindex = convert_text(text)
    indices.append(wordindex)

In [47]:
indices = np.asarray(indices)

In [48]:
X_train = tokenizer.sequences_to_matrix(indices, mode='binary')


In [49]:
y_train = tf.keras.utils.to_categorical(y_train, 2)
y_train.shape

(40000, 2)

In [56]:
model = Sequential()
model.add(Dense(128, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 128)               128128    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 130       
Total params: 136,514
Trainable params: 136,514
Non-trainable params: 0
_________________________________________________________________


In [58]:
model.compile(loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'])

In [59]:
model.fit(X_train, y_train, epochs=5, validation_split=0.2, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x11beb4e29d0>

In [60]:
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model_tokenizer.h5')

In [69]:
labels = ['negative', 'positive']

# this utility makes sure that all the words in your input
# are registered in the dictionary
# before trying to turn them into a matrix.
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." %(word))
    return wordIndices

# read in your saved model structure
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
# and create a model from that
model = model_from_json(loaded_model_json)
# and weight your nodes with your saved values
model.load_weights('model.h5')

OSError: Unable to open file (unable to open file: name = 'model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [68]:
# okay here's the interactive part
while True:
    sentence = input('Input a sentence to be evaluated, or Enter to quit: ')

    if len(sentence) == 0:
        break

    # format your input for the neural net
    testArr = convert_text_to_index_array(sentence)
    input = tokenizer.sequences_to_matrix([testArr], mode='binary')
    # predict which bucket your input belongs in
    pred = model.predict(input)
    # and print it for the humons
    print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))

TypeError: 'numpy.ndarray' object is not callable