In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as ddf
import re, nltk
from numpy import array
from pymongo import MongoClient

# tensorflow imports
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Activation,
    Dropout,
    Dense,
    Flatten,
    GlobalMaxPooling1D,
    Embedding,
    LSTM
)
from tensorflow.keras.initializers import RandomUniform, glorot_uniform
from tensorflow.keras.optimizers import Adam
# from Tkeras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

## Collect all Data

In [2]:
# Open a connection to the MongoDB server on localhost and pick the `hotels` database.
client = MongoClient('localhost:27017')
db = client['hotels']
# `collection` is a cursor over every document in the `reviews` collection.
collection = db['reviews'].find()

# Drain the cursor into a pandas DataFrame, keeping only the two columns used below.
all_reviews = pd.DataFrame(data=list(collection), columns=['review', 'positive'])

## Function to pre-process text

In [3]:
def proces_text(row):
    """Clean the text stored under ``row['review']`` and return the row.

    The cleaning steps, in order:

    1. replace every non-word character with a space;
    2. drop single letters that stand alone between whitespace;
    3. drop a single letter at the very start of the text;
    4. collapse runs of whitespace into one space;
    5. drop a leading stand-alone ``b``;
    6. lowercase the result.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row['review']`` for reading and writing; the value
        must be a string.

    Returns
    -------
    The same ``row`` object, with ``row['review']`` replaced by the cleaned
    text. The row is modified in place.
    """
    the_text = row['review']
    # Remove all the special characters (pro_fea = processed feature)
    pro_fea = re.sub(r'\W', ' ', the_text)
    # Remove single letters surrounded by whitespace
    pro_fea = re.sub(r'\s+[a-zA-Z]\s+', ' ', pro_fea)
    # Remove a single letter at the start of the text. The caret must be an
    # unescaped anchor: an escaped caret matches a literal '^' character,
    # which the first substitution has already replaced, so it never matched.
    # Replacing with '' (not ' ') avoids introducing a leading space.
    pro_fea = re.sub(r'^[a-zA-Z]\s+', '', pro_fea)
    # Substitute multiple whitespace characters with a single space
    pro_fea = re.sub(r'\s+', ' ', pro_fea)
    # Remove a prefixed stand-alone 'b'
    pro_fea = re.sub(r'^b\s+', '', pro_fea)
    # Convert to lowercase and store back on the row
    row['review'] = pro_fea.lower()

    return row

## Apply the function to all rows (using Dask)

In [4]:
# Split the reviews into 7 Dask partitions and apply the cleaning function row by row.
# (Plain-pandas equivalent: all_reviews.apply(proces_text, axis=1))
ddf_reviews = ddf.from_pandas(all_reviews, npartitions=7)
ddf_rev_pr = ddf_reviews.apply(
    proces_text,
    axis=1,
    # Declares the output columns and their dtypes for Dask.
    meta={'review': 'object', 'positive': 'int64'},
)

In [5]:
# Run the Dask computation defined above and collect the processed reviews.
df_rev = ddf_rev_pr.compute()

## Set up the features (X) and labels (y)

In [6]:
# Features: the cleaned review texts; labels: the `positive` column.
X = np.array(df_rev['review'].tolist())
y = np.array(df_rev['positive'].tolist())

In [7]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

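## Tokenize the reviews

Fit a Keras `Tokenizer` on the training texts only (limited to `max_words` words), then convert the training and test reviews into sequences of integer word indices.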

In [8]:
# Vocabulary limit passed to the tokenizer as `num_words`.
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
# The word index is built from the training texts only.
tokenizer.fit_on_texts(X_train)

# Convert both splits from raw text to lists of integer word indices.
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

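## Pad the sequences to a fixed length

Every review is padded (at the end) or truncated to `maxlen` tokens so that all inputs have the same shape.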

In [9]:
# Index 0 is reserved, hence the +1 on top of the number of known words.
vocab_size = 1 + len(tokenizer.word_index)

# Length of every sequence after padding.
maxlen = 100

# Pad both splits identically, adding the padding after the tokens ('post').
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

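## Define the model

Set up the weight initializers, the optimizer and the embedding size, then stack an embedding layer, an LSTM layer and a single sigmoid output unit.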

In [10]:
# 2. define model: initializers, optimizer and embedding size
# Seeded uniform initializer in [-0.01, 0.01] for the embedding weights.
e_init = RandomUniform(minval=-0.01, maxval=0.01, seed=1)
# Seeded Glorot-uniform initializer for the layer kernels.
init = glorot_uniform(seed=1)
# Adam optimizer with its default settings.
simple_adam = Adam()
# Number of values in each word's embedding vector.
embed_vec_len = 32

In [11]:
# Build the network as one ordered list of layers.
model = Sequential([
    # Maps each word index to a vector of `embed_vec_len` values;
    # `mask_zero=True` tells later layers to treat index 0 as padding.
    Embedding(
        input_dim=max_words,
        output_dim=embed_vec_len,
        embeddings_initializer=e_init,
        mask_zero=True,
    ),
    # Recurrent layer with dropout on both the inputs and the recurrent state.
    LSTM(
        units=100,
        kernel_initializer=init,
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    # Single sigmoid unit for the binary output.
    Dense(
        units=1,
        kernel_initializer=init,
        activation='sigmoid',
    ),
])