In [2]:
import numpy as np, pandas as pd, tensorflow as tf

  return f(*args, **kwds)


# Dataset Loading

#### Classify on Stars

In [3]:
# Amazon Fine Food Reviews dataset, obtained from
# https://www.kaggle.com/snap/amazon-fine-food-reviews
REVIEWS_CSV = 'data/Reviews.csv'

reviews = pd.read_csv(REVIEWS_CSV)
# Column names, non-null counts and dtypes of the raw frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
Id                        568454 non-null int64
ProductId                 568454 non-null object
UserId                    568454 non-null object
ProfileName               568438 non-null object
HelpfulnessNumerator      568454 non-null int64
HelpfulnessDenominator    568454 non-null int64
Score                     568454 non-null int64
Time                      568454 non-null int64
Summary                   568427 non-null object
Text                      568454 non-null object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [4]:
# Remove the two columns that are dropped before any further processing.
# A column that is already absent (e.g. this cell was run twice) is
# reported instead of raising.
for unused_column in ('ProfileName', 'Summary'):
    try:
        del reviews[unused_column]
    except KeyError:
        print('No such column')

In [5]:
# A helpfulness numerator larger than its denominator is assumed to be a
# data-entry error: keep those rows aside for inspection and drop them
# from the working frame.
numerator = reviews['HelpfulnessNumerator']
denominator = reviews['HelpfulnessDenominator']

faulty_rows = reviews[numerator > denominator]
reviews = reviews[numerator <= denominator]

faulty_rows

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Text
44736,44737,B001EQ55RW,A2V0I904FH7ABY,3,2,4,1212883200,It was almost a 'love at first bite' - the per...
64421,64422,B000MIDROQ,A161DK06JJMCYF,3,1,5,1224892800,My son loves spaghetti so I didn't hesitate or...


In [6]:
import math
# [CITE] https://www.evanmiller.org/how-not-to-sort-by-average-rating.html
# "The lower bound of Wilson score confidence interval for a Bernoulli parameter"
def lbc(positive_votes, total_votes, z=1.96):
    """Lower bound of the Wilson score confidence interval for a Bernoulli parameter.

    Parameters
    ----------
    positive_votes : number of positive (helpful) votes.
    total_votes    : total number of votes cast (positive + negative).
    z              : normal quantile of the desired confidence level; the
                     default 1.96 corresponds to a 95% interval and reproduces
                     the constants 1.9208, 0.9604 and 3.8416 of the cited formula.

    Returns
    -------
    float in [0, 1]; 0.0 when there are no votes at all.
    """
    if total_votes == 0:
        return 0.0

    negative_votes = total_votes - positive_votes
    z_squared = z * z

    centre = (positive_votes + z_squared / 2) / total_votes
    # The variance term is positive * negative / total. Using total * negative
    # here over-estimates the spread and pushes the bound below zero.
    spread = z * math.sqrt(
        (positive_votes * negative_votes) / total_votes + z_squared / 4
    ) / total_votes

    lower_bound = (centre - spread) / (1 + z_squared / total_votes)
    # Mathematically the bound is never negative; clamp away float round-off.
    return max(0.0, lower_bound)

### Modify Dataset

In [7]:
%%time

# Add the Lbc column
lbc_for_row = lambda row: lbc(row['HelpfulnessNumerator'], row['HelpfulnessDenominator'])
reviews['Lbc'] = reviews.apply(lbc_for_row, axis=1)

# Make Score zero indexed
reviews['Score'] = reviews['Score']-1

CPU times: user 8.83 s, sys: 11 ms, total: 8.84 s
Wall time: 8.84 s


## Reduce Dataset Size

In [8]:
# Shuffle the reviews and optionally keep only a fraction of them.
# SAMPLE_FRACTION = 1 keeps every row; lower it (e.g. 0.1) to experiment on a
# subset. The fixed seed makes the shuffle, and therefore everything derived
# from `data`, reproducible across kernel restarts.
SAMPLE_FRACTION = 1
SAMPLE_SEED = 42

data = reviews.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
data.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Text,Lbc
43047,43048,B004P0SRSS,A264CH3T20YH3Z,3,3,4,1316131200,Also found these while touring Europe. Love t...,0.438494
279394,279395,B0039556DY,A255LN7YXS7QAE,0,0,4,1295481600,I drink only flavored caffeinated coffee. I s...,0.0
412922,412923,B000E4C2LW,A3TXDO9392M8NJ,1,1,4,1275350400,I know what you're thinking. Corn flakes are ...,0.206543
326560,326561,B002AQ0OW6,A3RCJ8SLVUKT7U,4,4,0,1290729600,Have ordered boxes on 3 occassions. Every box ...,0.5101
329343,329344,B000V762EA,A255ZAR2NXQY7W,0,0,4,1346803200,"I was at Gilt (in Portland, OR) enjoying some ...",0.0


## Preprocessing

In [9]:
# Quantile (0.85 = 85th percentile) of review length that is inspected here.
quartile = .85

# Character count of every review, computed once and reused below.
text_lengths = data['Text'].str.len()
review_length = int(text_lengths.quantile(q=quartile))

# Reviews whose length is exactly the quantile value; show the first as an example.
longest = data[text_lengths == review_length]
print("Length of quartile", review_length)
longest['Text'].tolist()[0]

Length of quartile 715


"These are a great alternative to Maruchan, which are fried ramen noodles.  If you are like me and  love ramen noodles, but hate the fact that there's a ton of fat (which is 14 grams per pack and 1660 mg of sodium for chicken flavor), try these.  I drain the water from my noodles and season them.  I do not eat it as soup.  I use half of the seasoning pack and a couple of spritzes of liquid aminos (tastes like soy sauce).  It ends up having 1 gram of fat and about 50% less sodium, and is very tasty.  Now those like me who like to cut some fat and salt can have their ramen noodles and eat them, too!  ; D.<br /> Ohh Haapppyyy Daayyy<br /><br />Now if only someone will make a Snickers with one gram of fat.  : ("

### Tokenize the dataset

In [17]:
%%time
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_TOP_WORDS = 120000
MAX_ART_LEN = int(data['Text'].str.len().quantile(q=quartile)) # maximum and minimum number of words 
                                                               #  based on a quartile of review length

tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(data.Text)
sequences = tokenizer.texts_to_sequences(data.Text)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

X = pad_sequences(sequences, maxlen=MAX_ART_LEN)

print('Shape of data tensor:', X.shape)
print(np.max(X))

Found 133038 unique tokens. Distilled to 120000 top words.
Shape of data tensor: (568452, 715)
119999
CPU times: user 43 s, sys: 799 ms, total: 43.8 s
Wall time: 43.8 s


In [18]:
from sklearn.model_selection import train_test_split

#######################
## Ordinal Selection ##
ordinal = True
#######################

# Choose the target encoding together with the matching loss and output width.
if ordinal:
    # One scalar target per review: Score divided by 5.
    # NOTE(review): if Score has already been zero-indexed (0-4), the targets
    # top out at 0.8 -- confirm that /5 rather than /4 is intended.
    y = data['Score']/5
    loss = 'mean_squared_error'
    NUM_CLASSES = 1
else:
    # One-hot encoded target, one column per Score value.
    y = keras.utils.to_categorical(data['Score'])
    loss = 'categorical_crossentropy'
    NUM_CLASSES = y.shape[1]

# Split it into train / test subsets (80/20), stratified on Score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=data['Score'],
    random_state=42,
)

print(X_train.shape,y_train.shape)
print(np.sum(y_train,axis=0))

(454761, 715) (454761,)
289519.2


### Load the embedding

In [19]:
%%time

EMBED_SIZE = 100
# the embed size should match the file you load glove from
embeddings_index = {}
f = open('embeddings/glove.6B.100d.txt')
# save key/array pairs of the embeddings
#  the key of the dictionary is the word, the array is the embedding
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

# Define the embeding layer
from keras.layers import Embedding

gru_embedding = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],
                            input_length=MAX_ART_LEN,
                            trainable=True)

lstm_embedding = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],
                            input_length=MAX_ART_LEN,
                            trainable=True)

Found 400000 word vectors.
(133039, 100)
CPU times: user 7.43 s, sys: 104 ms, total: 7.53 s
Wall time: 7.55 s


## Define Cache/Fit Function
Load the cached model, or fit a new one if no cached model is found.

In [20]:
import os.path
from keras.models import load_model
import json

def cache_fit(model_name: str, model, *args, **kwargs):
    """Fit `model` once and cache it on disk; reload the cached copy afterwards.

    The cache consists of two files in the current working directory:
    ``{model_name}_model.h5`` (the saved model) and
    ``{model_name}_history.json`` (the training history). Only `model_name`
    identifies the cache entry, so changing the architecture or the fit
    arguments does NOT invalidate an existing archive -- delete the file or
    pick a new name to retrain.

    Parameters
    ----------
    model_name : prefix used for the archive and history file names.
    model      : object exposing ``fit(*args, **kwargs)`` and ``save(path)``.
    *args, **kwargs : forwarded unchanged to ``model.fit``.

    Returns
    -------
    `model` itself after fitting when no archive exists, otherwise whatever
    ``load_model`` returns for the archived file (the `model` argument is then
    left untouched and no history file is written).
    """
    archive_name = f'{model_name}_model.h5'
    history_name = f'{model_name}_history.json'

    if os.path.isfile(archive_name):
        print('Model found on disk. Reloading.')
        return load_model(archive_name)

    print(f'Model {model_name} not found in archive. Training new model.')
    hist = model.fit(*args, **kwargs)
    model.save(archive_name)
    with open(history_name, 'w') as f:
        # History values may be NumPy scalars (e.g. float32), which json cannot
        # encode natively; `default=float` converts them instead of raising a
        # TypeError after the whole training run has finished.
        json.dump(hist.history, f, default=float)
    return model

## Network 1

In [23]:
%%time

from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers import Conv1D, MaxPooling1D # Convolution Layers
from keras.layers import Dense                # Dense Layers
from keras.layers import GRU                  # Recurrent Layers

gru = Sequential()
gru.add(gru_embedding)
gru.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
gru.add(MaxPooling1D(pool_size=2))
gru.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
gru.add(MaxPooling1D(pool_size=2))
gru.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
gru.add(MaxPooling1D(pool_size=2))
gru.add(BatchNormalization())
gru.add(GRU(100,dropout=0.25, recurrent_dropout=0.2))
gru.add(Dense(64))
gru.add(Dense(64))
gru.add(Dense(NUM_CLASSES, activation='sigmoid'))
gru.compile(loss=loss,
              optimizer='Adam', 
              metrics=['accuracy'])
gru.summary()

gru = cache_fit(
    'gru', gru, 
    X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=32
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 715, 100)          13303900  
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 715, 32)           9632      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 357, 32)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 357, 64)           6208      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 178, 64)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 178, 64)           12352     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 89, 64)            0         
__________

# Results 
#### 10% of Dataset
Train on 45476 samples, validate on 11369 samples

Epoch 1/4
45476/45476 [==============================] - 451s 10ms/step - loss: 1.0291 - acc: 0.6508 - val_loss: 0.8893 - val_acc: 0.6783

Epoch 2/4
45476/45476 [==============================] - 448s 10ms/step - loss: 0.8846 - acc: 0.6804 - val_loss: 0.8173 - val_acc: 0.6968

Epoch 3/4
45476/45476 [==============================] - 453s 10ms/step - loss: 0.8355 - acc: 0.6923 - val_loss: 0.7954 - val_acc: 0.7038

Epoch 4/4
45476/45476 [==============================] - 442s 10ms/step - loss: 0.8052 - acc: 0.6997 - val_loss: 0.7769 - val_acc: 0.7107

CPU times: user 1h 44min 42s, sys: 27min 2s, total: 2h 11min 45s
Wall time: 29min 56s
```
[[ 594    2  138   26  282]
 [ 187    5  183   58  165]
 [  87    4  265  182  316]
 [  32    1  152  315 1107]
 [  58    0  103  206 6901]]
```
#### 25% of Dataset
Train on 113690 samples, validate on 28423 samples

Epoch 1/4
113690/113690 [==============================] - 785s 7ms/step - loss: 0.9327 - acc: 0.6701 - val_loss: 0.8315 - val_acc: 0.6937

Epoch 2/4
113690/113690 [==============================] - 785s 7ms/step - loss: 0.8090 - acc: 0.7014 - val_loss: 0.7651 - val_acc: 0.7136

Epoch 3/4
113690/113690 [==============================] - 784s 7ms/step - loss: 0.7749 - acc: 0.7120 - val_loss: 0.7545 - val_acc: 0.7154

Epoch 4/4
113690/113690 [==============================] - 784s 7ms/step - loss: 0.7517 - acc: 0.7191 - val_loss: 0.7304 - val_acc: 0.7266

CPU times: user 1h 13min 8s, sys: 3min 12s, total: 1h 16min 21s
Wall time: 52min 17s
```
[[ 2006    61    99    24   411]
 [  683   137   236    63   373]
 [  491   129   450   293   768]
 [  191    43   255   731  2820]
 [  350    23   141   318 17327]]
 ```
 #### 50% of Dataset
 Train on 227380 samples, validate on 56846 samples
 
Epoch 1/4
227380/227380 [==============================] - 1547s 7ms/step - loss: 0.8767 - acc: 0.6832 - val_loss: 0.7632 - val_acc: 0.7135

Epoch 2/4
227380/227380 [==============================] - 1550s 7ms/step - loss: 0.7758 - acc: 0.7119 - val_loss: 0.7332 - val_acc: 0.7257

Epoch 3/4
227380/227380 [==============================] - 1554s 7ms/step - loss: 0.7447 - acc: 0.7217 - val_loss: 0.7117 - val_acc: 0.7352

Epoch 4/4
227380/227380 [==============================] - 1555s 7ms/step - loss: 0.7239 - acc: 0.7285 - val_loss: 0.7185 - val_acc: 0.7317

CPU times: user 2h 30min, sys: 7min 19s, total: 2h 37min 20s
Wall time: 1h 43min 26s
```
[[ 3375   348   204    40  1231]
 [  917   517   424   120  1007]
 [  426   332   933   578  1988]
 [  152    47   351  1209  6286]
 [  257    46   131   364 35563]]
 ```
 #### 100% of Dataset
 Train on 454761 samples, validate on 113691 samples
 
Epoch 1/4
454761/454761 [==============================] - 3094s 7ms/step - loss: 0.8326 - acc: 0.6952 - val_loss: 0.7404 - val_acc: 0.7233

Epoch 2/4
454761/454761 [==============================] - 3096s 7ms/step - loss: 0.7582 - acc: 0.7169 - val_loss: 0.7318 - val_acc: 0.7235

Epoch 3/4
454761/454761 [==============================] - 3098s 7ms/step - loss: 0.7566 - acc: 0.7174 - val_loss: 0.7054 - val_acc: 0.7344

Epoch 4/4
454761/454761 [==============================] - 3099s 7ms/step - loss: 0.7325 - acc: 0.7254 - val_loss: 0.6979 - val_acc: 0.7371

CPU times: user 4h 57min 53s, sys: 13min 48s, total: 5h 11min 42s
Wall time: 3h 26min 27s
```
[[ 8280   309   496   103  1266]
 [ 2909   560  1150   323  1012]
 [ 1727   466  2599  1514  2222]
 [  695   101  1245  3969 10121]
 [ 1443    63   622  2097 68399]]
 ```

## Network 2

In [22]:
%%time

from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers import Conv1D, MaxPooling1D # Convolution Layers
from keras.layers import Dense                # Dense Layers
from keras.layers import LSTM                 # Recurrent Layers

lstm = Sequential()
lstm.add(lstm_embedding)
lstm.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
lstm.add(MaxPooling1D(pool_size=2))
lstm.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
lstm.add(MaxPooling1D(pool_size=2))
lstm.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
lstm.add(MaxPooling1D(pool_size=2))
lstm.add(BatchNormalization())
lstm.add(LSTM(100,dropout=0.25, recurrent_dropout=0.2))
lstm.add(Dense(64))
lstm.add(Dense(64))
lstm.add(Dense(NUM_CLASSES, activation='sigmoid'))
lstm.compile(loss=loss,
              optimizer='Adam', 
              metrics=['accuracy'])
lstm.summary()

lstm = cache_fit(
    'lstm', lstm, 
    X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=32
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 715, 100)          13303900  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 715, 32)           9632      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 357, 32)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 357, 64)           6208      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 178, 64)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 178, 64)           12352     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 89, 64)            0         
__________

# Results 
#### 10% of Dataset
Train on 45476 samples, validate on 11369 samples

Epoch 1/4
45476/45476 [==============================] - 237s 5ms/step - loss: 0.9874 - acc: 0.6574 - val_loss: 0.9331 - val_acc: 0.6636

Epoch 2/4
45476/45476 [==============================] - 236s 5ms/step - loss: 0.8625 - acc: 0.6846 - val_loss: 0.8611 - val_acc: 0.6892

Epoch 3/4
45476/45476 [==============================] - 237s 5ms/step - loss: 0.8107 - acc: 0.6965 - val_loss: 0.8028 - val_acc: 0.6973

Epoch 4/4
45476/45476 [==============================] - 236s 5ms/step - loss: 0.7780 - acc: 0.7080 - val_loss: 0.8506 - val_acc: 0.6994

CPU times: user 56min 14s, sys: 14min 44s, total: 1h 10min 59s
Wall time: 15min 49s
```
[[ 616   11   42   10  363]
 [ 198   10   64   39  287]
 [ 104    2   99   77  572]
 [  42    1   33   68 1463]
 [  58    0   25   27 7158]]
```
#### 25% of Dataset
Train on 113690 samples, validate on 28423 samples

Epoch 1/4
113690/113690 [==============================] - 274s 2ms/step - loss: 0.9015 - acc: 0.6761 - val_loss: 0.8047 - val_acc: 0.6987

Epoch 2/4
113690/113690 [==============================] - 273s 2ms/step - loss: 0.7932 - acc: 0.7021 - val_loss: 0.7815 - val_acc: 0.7025

Epoch 3/4
113690/113690 [==============================] - 273s 2ms/step - loss: 0.7555 - acc: 0.7145 - val_loss: 0.8046 - val_acc: 0.6960

Epoch 4/4
113690/113690 [==============================] - 273s 2ms/step - loss: 0.7293 - acc: 0.7245 - val_loss: 0.7831 - val_acc: 0.7154

CPU times: user 26min 3s, sys: 1min 13s, total: 27min 16s
Wall time: 18min 13s
```
[[ 1551    62    85    23   880]
 [  509   110   230    70   573]
 [  237    91   373   326  1104]
 [   78    28   177   599  3158]
 [  128    13    83   233 17702]]
 ```
 #### 50% of Dataset
 Train on 227380 samples, validate on 56846 samples
 
Epoch 1/4
227380/227380 [==============================] - 543s 2ms/step - loss: 0.8495 - acc: 0.6899 - val_loss: 0.7786 - val_acc: 0.7036

Epoch 2/4
227380/227380 [==============================] - 545s 2ms/step - loss: 0.7544 - acc: 0.7170 - val_loss: 0.7779 - val_acc: 0.6993

Epoch 3/4
227380/227380 [==============================] - 545s 2ms/step - loss: 0.7212 - acc: 0.7283 - val_loss: 0.7270 - val_acc: 0.7259

Epoch 4/4
227380/227380 [==============================] - 544s 2ms/step - loss: 0.6999 - acc: 0.7352 - val_loss: 0.7443 - val_acc: 0.7299

CPU times: user 50min 21s, sys: 2min 18s, total: 52min 40s
Wall time: 36min 18s
```
[[ 3708    70   204    67  1149]
 [ 1245   127   454   194   965]
 [  618   104   753   865  1917]
 [  173    18   270  1570  6014]
 [  313     4   102   606 35336]]
 ```
 #### 100% of Dataset
 Train on 454761 samples, validate on 113691 samples
 
Epoch 1/4
454761/454761 [==============================] - 1085s 2ms/step - loss: 0.8098 - acc: 0.7001 - val_loss: 0.7436 - val_acc: 0.7193

Epoch 2/4
454761/454761 [==============================] - 1089s 2ms/step - loss: 0.7243 - acc: 0.7271 - val_loss: 0.7504 - val_acc: 0.7266

Epoch 3/4
454761/454761 [==============================] - 1089s 2ms/step - loss: 0.6966 - acc: 0.7364 - val_loss: 0.7103 - val_acc: 0.7338

Epoch 4/4
454761/454761 [==============================] - 1089s 2ms/step - loss: 0.6782 - acc: 0.7429 - val_loss: 0.6988 - val_acc: 0.7354

CPU times: user 1h 44min 8s, sys: 4min 53s, total: 1h 49min 1s
Wall time: 1h 12min 31s
```
[[ 6899   726  1663    93  1073]
 [ 1535   781  2684   249   705]
 [  728   329  4397  1602  1472]
 [  266    47  2311  4983  8524]
 [  792    44  1657  3577 66554]]
 ```
 ## Training Embedding
 #### 2 Epochs
 Train on 454761 samples, validate on 113691 samples
 
Epoch 1/2
454761/454761 [==============================] - 1038s 2ms/step - loss: 0.7174 - acc: 0.7310 - val_loss: 0.6536 - val_acc: 0.7576

Epoch 2/2
454761/454761 [==============================] - 1035s 2ms/step - loss: 0.5844 - acc: 0.7799 - val_loss: 0.5877 - val_acc: 0.7829

CPU times: user 35min 26s, sys: 1min 36s, total: 37min 2s
Wall time: 34min 43s
```
[[ 8332   882   598    50   592]
 [ 2046  1847  1444   209   408]
 [  791   827  4332  1484  1094]
 [  286   140  1557  6656  7492]
 [  762   130   839  3054 67839]]
 ```
 #### 4 Epochs
Train on 454761 samples, validate on 113691 samples

Epoch 1/4
454761/454761 [==============================] - 1039s 2ms/step - loss: 0.7166 - acc: 0.7314 - val_loss: 0.6305 - val_acc: 0.7638

Epoch 2/4
454761/454761 [==============================] - 1036s 2ms/step - loss: 0.5836 - acc: 0.7805 - val_loss: 0.5858 - val_acc: 0.7810

Epoch 3/4
454761/454761 [==============================] - 1036s 2ms/step - loss: 0.5079 - acc: 0.8119 - val_loss: 0.5684 - val_acc: 0.7955

Epoch 4/4
454761/454761 [==============================] - 1036s 2ms/step - loss: 0.4453 - acc: 0.8384 - val_loss: 0.5913 - val_acc: 0.7899

CPU times: user 1h 10min 36s, sys: 3min 19s, total: 1h 13min 56s
Wall time: 1h 9min 17s
```
[[ 7640  1949   308    59   498]
 [ 1124  3690   665   112   363]
 [  604  1982  4115   965   862]
 [  274   498  1659  7581  6119]
 [  734   537  1011  3568 66774]]
```

In [28]:
import os.path
from keras.models import load_model
import json

def cache_fit(model_name: str, model, *args, **kwargs):
    """Fit `model` once and cache it on disk; reload the cached copy afterwards.

    NOTE: this notebook defines `cache_fit` in more than one cell; whichever
    definition ran last is the one in effect. Keep a single definition when
    tidying up.

    The cache consists of two files in the current working directory:
    ``{model_name}_model.h5`` (the saved model) and
    ``{model_name}_history.json`` (the training history). Only `model_name`
    identifies the cache entry, so changing the architecture or the fit
    arguments does NOT invalidate an existing archive.

    Parameters
    ----------
    model_name : prefix used for the archive and history file names.
    model      : object exposing ``fit(*args, **kwargs)`` and ``save(path)``.
    *args, **kwargs : forwarded unchanged to ``model.fit``.

    Returns
    -------
    `model` itself after fitting when no archive exists, otherwise whatever
    ``load_model`` returns for the archived file (the `model` argument is then
    left untouched and no history file is written).
    """
    archive_name = f'{model_name}_model.h5'
    history_name = f'{model_name}_history.json'

    if os.path.isfile(archive_name):
        print('Model found on disk. Reloading.')
        return load_model(archive_name)

    print(f'Model {model_name} not found in archive. Training new model.')
    hist = model.fit(*args, **kwargs)
    model.save(archive_name)
    with open(history_name, 'w') as f:
        # History values may be NumPy scalars (e.g. float32), which json cannot
        # encode natively; `default=float` converts them instead of raising a
        # TypeError after the whole training run has finished.
        json.dump(hist.history, f, default=float)
    return model

In [29]:
# NOTE(review): `rnn3`, `y_train_ordinal` and `y_test_ordinal` are not defined
# in any cell visible above -- confirm they exist on a fresh kernel before
# relying on this cell (Restart & Run All).
model = cache_fit(
    'rnn3',
    rnn3,
    X_train,
    y_train_ordinal,
    validation_data=(X_test, y_test_ordinal),
    epochs=2,
    batch_size=32,
)

Model found on disk. Reloading.
