In [1]:
from __future__ import print_function
import os

import numpy as np

# Choose the Keras backend through the environment. Keras reads this variable
# when it is first imported, so this must run before the keras imports below.
os.environ['KERAS_BACKEND'] = 'tensorflow'

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(1337)

In [2]:
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
# see preprocessor.py for the implementation of these helpers
from preprocessor import DocuemntTermMatrix, accuracy, indicator_to_matrix

In [4]:
# Build the document-term representation of the Yelp reviews (helper lives in preprocessor.py).
# NOTE(review): the arguments look like (JSON file, text field, label field, number of
# reviews to load) -- confirm against DocuemntTermMatrix's signature.
yelp_data = DocuemntTermMatrix("yelp_academic_dataset_review.json", "text", "stars", 500)

# MLP (Feed Forward Network) 

### specification
- One hidden layer with 512 units
- 50% dropout
- Rectified Linear Unit (ReLU) activation
- Adam optimiser

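### model summary (illustrative — parameter counts depend on the vocabulary size; the `model.summary()` output below shows the values for this run)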
____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
====================================================================================================
dense_2 (Dense)                    (None, 512)         493056      dense_input_2[0][0]              
____________________________________________________________________________________________________
activation_2 (Activation)          (None, 512)         0           dense_2[0][0]                    
____________________________________________________________________________________________________
dropout_2 (Dropout)                (None, 512)         0           activation_2[0][0]               
____________________________________________________________________________________________________
dense_3 (Dense)                    (None, 5)           2565        dropout_2[0][0]                  
____________________________________________________________________________________________________
activation_3 (Activation)          (None, 5)           0           dense_3[0][0]                    
====================================================================================================
Total params: 495621
____________________________________________________________________________________________________

In [5]:
# Number of entries in the vocabulary; used below as the input width of the first Dense layer.
vocab_size = len(yelp_data.docs_vocab)

In [6]:
# Number of entries in the label index; used below as the width of the softmax output layer.
nb_classes = len(yelp_data.docs_label_index)

In [10]:
print('Building model...')

model = Sequential()

# Layer stack, in order: a 512-unit hidden layer with ReLU, 50% dropout,
# then one output unit per class followed by a softmax.
mlp_layers = (
    Dense(512, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax'),
)
for mlp_layer in mlp_layers:
    model.add(mlp_layer)

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# as an extra metric alongside the loss.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])



Building model...


In [11]:
# Print the layer-by-layer table (output shapes and parameter counts) of the model built above.
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
dense_1 (Dense)                    (None, 512)         5032448     dense_input_1[0][0]              
____________________________________________________________________________________________________
activation_1 (Activation)          (None, 512)         0           dense_1[0][0]                    
____________________________________________________________________________________________________
dropout_1 (Dropout)                (None, 512)         0           activation_1[0][0]               
____________________________________________________________________________________________________
dense_2 (Dense)                    (None, 5)           2565        dropout_1[0][0]                  
___________________________________________________________________________________________

In [12]:
#from preprocessor import Dataset, pad_vec_sequences, pad_sent_sequences
from sklearn import preprocessing
from sklearn import cross_validation

In [13]:
# data split
x_train, x_test, y_train, y_test = cross_validation.train_test_split(yelp_data.X_docs,yelp_data.Y_docs,test_size=0.2)

# create appropirate matrix (hot encoded) response
y_train, y_test = [indicator_to_matrix(x,yelp_data.docs_label_index)  for x in (y_train, y_test)]

In [14]:
# Training hyper-parameters
batch_size = 32
nb_epoch = 5

# Train on the training split, holding back 10% of it for validation;
# the returned history object is kept for later inspection.
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    nb_epoch=nb_epoch,
    validation_split=0.1,
)
# NOTE: do not pass verbose=1 to this call

Train on 360 samples, validate on 40 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
# Evaluate on the held-out test split. The model was compiled with
# metrics=['accuracy'], so `score` holds the loss first and the accuracy second.
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

# score[0] is the loss measured on the test data (not a training score),
# so label it accordingly.
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Training score: 1.47659727097
Test accuracy: 0.39


In [None]:
# Write the fitted model to an .h5 file; the path is relative, so the file is
# created in the notebook's current working directory.
model.save('yelp_mlp_hidden_epocs.h5')

In [None]:
# Read the saved model file back into a separate variable; the in-memory
# `model` is left untouched.
from keras.models import load_model
current_model = load_model('yelp_mlp_hidden_epocs.h5')