In [1]:
from sklearn import preprocessing
from sklearn import cross_validation

import os
os.environ['KERAS_BACKEND']='tensorflow' # set backend

import numpy as np
np.random.seed(1337)  # for reproducibility



In [2]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, GRU
from keras.utils import np_utils, generic_utils

Using TensorFlow backend.


#### Data Setup

In [4]:
# load pre-trained vectors
import cPickle
with open('pickles/embeddings_preprocessed_29_4_64.pkl', 'rb') as fid:
    embeddings = cPickle.load(fid)
with open('pickles/dictionary_preprocessed_29_4_64.pkl', 'rb') as fid:
    dictionary = cPickle.load(fid)   

In [7]:
from preprocessor import Tensor_Sequence_W2V, accuracy, indicator_to_matrix

In [63]:
yelp_data = Tensor_Sequence_W2V("yelp_academic_dataset_review.json", "text", "stars", embeddings, dictionary, 100,30)

In [45]:
len(yelp_data.docs_vocab) # vocab size

1423

In [46]:
#class_balance = {1: 0, 2:0, 3:0, 4:0, 5:0}
class_balance = {}

for i in yelp_data.Y_doc_seq:
    if(i in class_balance): class_balance[i] += 1
    else: class_balance[i] = 1
class_balance

{1: 7, 2: 3, 3: 4, 4: 15, 5: 16}

In [47]:
yelp_data.X_doc_seq.shape[1:3]

(30, 64)

#### Hyper parameter testing with cross-validation

In [48]:
grid_models = []
models = ['' for i in xrange(4)] 
# models are training incrementally 
#   therefore we need multiple copies of the models 
#   otherwise we end up training on all the data (test and trianing)

In [49]:
maxlen = yelp_data.maxlen
hidden_dim = 128
nb_classes = len(yelp_data.docs_label_index)

In [50]:
yelp_data.X_doc_seq.shape[1:3]

(30, 64)

In [51]:
for cv in xrange(4):
    
    print('Building model...')
    current_model = Input(shape=yelp_data.X_doc_seq.shape[1:3], dtype='float32')
    
    # bidirectional LSTM
    forwards  = LSTM(hidden_dim,dropout_W=0.1,dropout_U=0.1)(current_model)
    backwards = LSTM(hidden_dim,dropout_W=0.1,dropout_U=0.1,go_backwards=True)(current_model)
    
    # merge LSTM's
    merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
    
    # add dropout
    after_dp = Dropout(0.1)(merged)
    
    # output
    output    = Dense(nb_classes, activation='softmax')(after_dp)
    models[cv] = Model(input=current_model, output=output)
    
    # compile model with adam
    models[cv].compile('adam', 'categorical_crossentropy')

Building model...
Building model...
Building model...
Building model...


In [16]:
models[0:2] # check that models are different

[<keras.engine.training.Model at 0x7ff776853490>,
 <keras.engine.training.Model at 0x7ff75cd84c90>]

In [18]:
models[0].summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
input_1 (InputLayer)               (None, 30, 64)      0                                            
____________________________________________________________________________________________________
lstm_1 (LSTM)                      (None, 128)         98816       input_1[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                      (None, 128)         98816       input_1[0][0]                    
____________________________________________________________________________________________________
merge_1 (Merge)                    (None, 256)         0           lstm_1[0][0]                     
                                                                   lstm_2[0][0]            

In [64]:
# data split - training and validation data (80, 20)

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(yelp_data.X_doc_seq,yelp_data.Y_doc_seq,test_size=0.2)

In [53]:
from preprocessor import Kfold_cv

In [65]:
full_index = [i for i in xrange(len(X_train))]
indices_cv = Kfold_cv(full_index,4)

In [66]:
x_train = []
x_test  = []

y_train = []
y_test  = []

for cv in xrange(4):
    x_train.append(X_train[indices_cv[cv]["train"]])
    x_test.append(X_train[indices_cv[cv]["test"]] )

    y_train.append([ Y_train[i] for i in indices_cv[cv]["train"] ] )
    y_test.append([ Y_train[i] for i in indices_cv[cv]["test"] ] )

In [23]:
# model specs
batch_size = 32
num_epoch = 10

In [56]:
# train model
out_sample_accuracies = []
in_sample_accuracies = []
    
for cv in xrange(4):
    np.random.seed(1337)  # for reproducibility
    
    # create appropirate matrix (hot encoded) response
        
    y_train_m, y_test_m = [indicator_to_matrix(x,yelp_data.docs_label_index)  for x in (y_train[cv], y_test[cv])]

    history = models[cv].fit(x_train[cv], y_train_m,
                        nb_epoch=num_epoch, batch_size=batch_size,
                        verbose=False) 
    
    # set validation split to 0 or none so all the traning data is used 
    #   the out of sample rate will be determined later

    # do not set verbose = 1
    
    out_sample_accuracies.append(accuracy(models[cv],x_test[cv],y_test_m))
    in_sample_accuracies.append(accuracy(models[cv],x_train[cv],y_train_m)) 
    
print([np.mean(in_sample_accuracies),np.mean(out_sample_accuracies)])

[48.333333333333336, 25.0]


In [25]:
from sklearn.metrics import confusion_matrix

In [None]:
def predict_classes(model,x_test):
    predictions = model.predict(x_test)
    return [ pred.argmax() for idx, pred in enumerate(predictions)]

In [57]:
# in and out scores over the cross folds

out_sample_accuracies = []
in_sample_accuracies = []
confustion_matrices = []

for cv in xrange(4):
        
    y_test_vec = [ yelp_data.docs_label_index[i] for i in y_test[cv] ] 

    # create appropirate matrix (hot encoded) response
    y_train_m, y_test_m = [indicator_to_matrix(x,yelp_data.docs_label_index)  for x in (y_train[cv], y_test[cv])]
    
    out_sample_accuracies.append(accuracy(models[cv],x_test[cv],y_test_m))
    in_sample_accuracies.append(accuracy(models[cv],x_train[cv],y_train_m))
    
    confustion_matrices.append(confusion_matrix(y_test_vec,predict_classes(models[cv],x_test[cv])))

In [58]:
print([np.mean(in_sample_accuracies), np.mean(out_sample_accuracies)])

[48.333333333333336, 25.0]


In [60]:
conf_perc = np.mean(confustion_matrices,axis=0).T/np.sum(np.mean(confustion_matrices,axis=0),axis=1) 
#  there has to be a better way to do row wise division 
print(conf_perc.T)

ValueError: operands could not be broadcast together with shapes (5,5) (4,4) 

#### Final Model

In [68]:
maxlen = yelp_data.maxlen
hidden_dim = 128
nb_classes = len(yelp_data.docs_label_index)

In [69]:
# model specification
sequence = Input(shape=yelp_data.X_doc_seq.shape[1:3], dtype='float32')
forwards = LSTM(hidden_dim,dropout_W=0.1,dropout_U=0.1)(sequence)
backwards = LSTM(hidden_dim,dropout_W=0.1,dropout_U=0.1,go_backwards=True)(sequence)
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.1)(merged)
output = Dense(nb_classes, activation='softmax')(after_dp)
model = Model(input=sequence, output=output)

In [70]:
model.compile('adam', 'categorical_crossentropy')

In [None]:
model.summary()

In [None]:
# data split
x_train, x_test, y_train, y_test = cross_validation.train_test_split(yelp_data.X_doc_seq,yelp_data.Y_doc_seq,test_size=0.2)

# create appropirate matrix (hot encoded) response
y_train, y_test = [indicator_to_matrix(x,yelp_data.docs_label_index)  for x in (y_train, y_test)]

In [37]:
# model specs
batch_size = 32
num_epoch = 10
# train model
model.fit(x_train, y_train,
          batch_size=batch_size,
          nb_epoch=num_epoch,
          validation_data=[x_test, y_test])

Train on 200 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f875530dd10>

In [39]:
accuracy(model,x_test,y_test)

40.0

In [34]:
from sklearn.metrics import confusion_matrix

predictions = model.predict(x_test)
y_pred_vec = [ pred.argmax() for idx, pred in enumerate(predictions)]
y_test_vec = [ pred.argmax() for idx, pred in enumerate(y_test)]

confustion_matrix = confusion_matrix(y_test_vec,y_pred_vec)
print(confustion_matrix) # class balance

[[1 0 1 3 1]
 [1 0 0 5 1]
 [2 0 0 2 2]
 [4 0 1 7 0]
 [3 0 2 8 6]]


In [37]:
#class_balance = {1: 0, 2:0, 3:0, 4:0, 5:0}
class_balance_test = {}

for i in y_test_vec:
    if(i in class_balance_test): class_balance_test[i] += 1
    else: class_balance_test[i] = 1
class_balance_test

{0: 9, 1: 5, 2: 5, 3: 16, 4: 15}

#### Save and Load Model

In [None]:
model.save_weights('yelp_bi_lstm.h5')

In [None]:
import json
jsonfile = open('yelp_bi_lstm.json', 'w')
json.dump(model_as_json_string, jsonfile)
jsonfile.write('\\n')

In [None]:
import json
with open('yelp_bi_lstm.json') as data_file:
    model_as_json_string = json.load(data_file)

In [None]:
from keras.models import model_from_json
current_model = model_from_json(model_as_json_string)
current_model.load_weights('yelp_bi_lstm.h5')