In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as ddf
import re
from pymongo import MongoClient

# tensorflow imports
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Activation,
    Dropout,
    Dense,
    Flatten,
    GlobalMaxPooling1D,
    Embedding,
    LSTM
)
from tensorflow.keras.initializers import RandomUniform, glorot_uniform
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Collect all Data

In [2]:
# get data from database
client = MongoClient('localhost:27017')
db = client.hotels
collection = db.reviews.find()

# transform to pandas dataframe
all_reviews = pd.DataFrame(list(collection), columns=['review', 'positive'])

## Function to pre proces text

In [3]:
# this function is used for apply
def proces_text(row):
    
    the_text = row['review']

    row['review'] = proces_text_only(the_text)    
    
    return row

# this function is used for only text
def proces_text_only(text_only):
    # Remove all the special characters (pro_fea = processed feature)
    pro_fea = re.sub(r'\W', ' ', text_only)
    # remove all single characters
    pro_fea = re.sub(r'\s+[a-zA-Z]\s+', ' ', pro_fea)
    # Remove single characters from the start
    pro_fea = re.sub(r'\^[a-zA-Z]\s+', ' ', pro_fea) 
    # Substituting multiple spaces with single space
    pro_fea = re.sub(r'\s+', ' ', pro_fea, flags=re.I)
    # Removing prefixed 'b'
    pro_fea = re.sub(r'^b\s+', '', pro_fea)
    # Converting to Lowercase
    return pro_fea.lower()    

## Apply all the function to all rows

In [4]:
# proc_rev = all_reviews.apply(proces_text, axis=1)
ddf_reviews = ddf.from_pandas(all_reviews, npartitions=7)
ddf_rev_pr = ddf_reviews.apply(proces_text, axis=1, meta={'review': 'object', 'positive': 'int64'})

In [5]:
df_rev = ddf_rev_pr.compute()

## Set Axis info

In [6]:
# Max limit
REV_LIMIT = 100000
# Get negative and positive reviews
rev_negatives = df_rev[df_rev.positive == 0][:REV_LIMIT]
rev_positives = df_rev[df_rev.positive == 1][:REV_LIMIT]

# Concat positive and negative reviews
rev_balanced = pd.concat([rev_negatives, rev_positives]).reset_index(drop=True)

# Convert the column review and positive to np array
X = np.array(list(rev_balanced.loc[:, 'review']))
y = np.array(list(rev_balanced.loc[:, 'positive']))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=42)

## Tokenizer to save vocabulary

In [8]:
# Set max words of the vocabulary
max_words=5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Save the tokenizer as pickle file so it can be loaded from disk
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# ADD EXPLANATION HERE

In [9]:
# Convert into 2D numpy array
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Add layers to model

In [10]:
# the lower/higer the paramaters, the higher te contrast of random numbers, seed is used to reproduce the same effect
# this uniform is later used in the embedding layer of the model
e_init = RandomUniform(-0.01, 0.01, seed=1)
# This uniform is used for for the LSTM and Dense layer of the model
init = glorot_uniform(seed=1)
# used in compile as optimizer
simple_adam = Adam()
# Total weights is also calculated using this variable
embed_vec_len = 32  # values per word

In [11]:
model = Sequential()

model.add(
    Embedding(
        # input_dim = size of vocab
        input_dim=max_words,
        # Dimension of embedding
        output_dim=embed_vec_len,
        
        embeddings_initializer=e_init,
        mask_zero=True
    )
)

# Add LSTM network
model.add(
    LSTM(
        units=100,
        kernel_initializer=init,
        dropout=0.2,
        recurrent_dropout=0.2
    )
)

model.add(
    Dense(
        units=1,
        kernel_initializer=init,
        activation='sigmoid'
    )
)

# Compile Model

In [12]:
model.compile(
    loss='binary_crossentropy',
    optimizer=simple_adam,
    metrics=['acc']
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          160000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


## Start training model with given parameters

In [13]:
bat_size = 32
max_epochs = 3

sentiment_analyzer = model.fit(
    X_train,
    y_train,
    epochs=max_epochs,
    batch_size=bat_size,
    shuffle=True,
    verbose=1
) 

Train on 160000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
loss_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test data: loss = %0.6f  accuracy = %0.2f%% " % \
  (loss_acc[0], loss_acc[1]*100))

Test data: loss = 0.136676  accuracy = 95.18% 


## Save model + weights to disk

In [15]:
# serialize model to JSON
# model_json = model.to_json()
# with open("rev_sent_model.json", "w") as json_file:
#     json_file.write(model_json)
# serialize weights to HDF5
# model.save_weights("rev_sent_model.h5")
# print("Saved model to disk")

## Testing on single review

In [16]:
example_review = df_rev.iloc[220]['review']
example_review = proces_text_only(example_review)
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_review)

filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w)

the_rev = tokenizer.texts_to_sequences(filtered_sentence)

ent_list = []

for sublist in the_rev:
    for item in sublist:
        ent_list.append(item)

instance = pad_sequences([ent_list], padding='post', maxlen=maxlen)

print('This review is positive') if model.predict(instance)[0][0] > 0.5 else print('This review is negative')
print(model.predict(instance)[0][0])

This review is negative
0.003162171


In [25]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(sentiment_analyzer.history['acc'])
plt.plot(sentiment_analyzer.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(sentiment_analyzer.history['loss'])
plt.plot(sentiment_analyzer.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

KeyError: 'val_acc'