In [2]:
import numpy as np, pandas as pd, tensorflow as tf

  return f(*args, **kwds)


# Dataset Loading

#### Classify on Stars

In [3]:
## Amazon Reviews
## https://www.kaggle.com/snap/amazon-fine-food-reviews
reviews = pd.read_csv('data/Reviews.csv')
reviews.info()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 6, saw 2


In [3]:
try:
    del reviews['ProfileName']
except KeyError:
    print('No such column')
    
try:
    del reviews['Summary']
except KeyError:
    print('No such column')

In [4]:
#Remove rows where numerator > denominator. Assume this is entry error in dataset.
faulty_rows = reviews[reviews['HelpfulnessNumerator'] > reviews['HelpfulnessDenominator']]
reviews = reviews[reviews['HelpfulnessNumerator'] <= reviews['HelpfulnessDenominator']]

faulty_rows

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Text
44736,44737,B001EQ55RW,A2V0I904FH7ABY,3,2,4,1212883200,It was almost a 'love at first bite' - the per...
64421,64422,B000MIDROQ,A161DK06JJMCYF,3,1,5,1224892800,My son loves spaghetti so I didn't hesitate or...


In [5]:
import math
# [CITE] https://www.evanmiller.org/how-not-to-sort-by-average-rating.html
# "The lower bound of Wilson score confidence interval for a Bernoulli parameter"
def lbc(positive_votes, total_votes):
    negative_votes = total_votes - positive_votes
    if total_votes == 0:
        return 0.0
    lower_bound = ((positive_votes + 1.9208) / (total_votes) - 1.96 * math.sqrt((total_votes * negative_votes) / (total_votes) + 0.9604) / 
        (total_votes)) / (1 + 3.8416 / (total_votes))
    return lower_bound

### Modify Dataset

In [6]:
%%time

# Add the Lbc column
lbc_for_row = lambda row: lbc(row['HelpfulnessNumerator'], row['HelpfulnessDenominator'])
reviews['Lbc'] = reviews.apply(lbc_for_row, axis=1)

# Make Score zero indexed
reviews['Score'] = reviews['Score']-1

CPU times: user 14.6 s, sys: 80.2 ms, total: 14.7 s
Wall time: 14.7 s


## Reduce Dataset Size

In [7]:
data = reviews.sample(frac=.25)
data.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Text,Lbc
139166,139167,B0057OR5IO,A2K722XACWXW5G,0,1,1,1337817600,"All four of these had ""sell by"" dates which ha...",-0.170084
387988,387989,B007RTR8UM,A28I19Q54MYXGV,0,1,2,1340841600,The is a very low cost conditioner that goes o...,-0.170084
555251,555252,B002ESSASK,A1RJDQPF8G9WEP,0,6,2,1307404800,We've been using this coffee for the past 2 mo...,-0.33025
443392,443393,B000MXJR7C,AP66BP6OX6WD1,0,1,0,1339718400,"Please try to avoid , we were disappointed<br ...",-0.170084
8140,8141,B0019GVYR2,ALI6SW10L0ZMC,0,4,4,1324252800,Real salt is good and good for you. Doesnt tak...,-0.311735


## Preprocessing

In [8]:
quartile = .85
review_length = int(data['Text'].str.len().quantile(q=quartile))

longest = data[data['Text'].str.len() == review_length]
print("Length of quartile", review_length)
longest['Text'].tolist()[0]

Length of quartile 714


"I highly recommend ALL Wellness products!! My Shih Tzu's were plaqued with severe allergy problems,, Spent $$$$ after $$$$ at vet for relief for them, nothing seemed to work.. I did extensive research on dog food and found out the mainstream brands are all crap, nothing but by products, additives, nothing natural about any of them.. Did research on Wellness and Blue Buffalo, look them up, you will find out for yourself.. Their products are all natural, no additives, no preservatives, 100% meat from animals that are NOT fed growth hormones, etc.. My dogs have been on this food, both wet and dry and they are thriving and the picture of health, bright eyes, shiny coats, full of energy and NO MORE allergies.."

### Tokenize the dataset

In [9]:
%%time
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_TOP_WORDS = 120000
MAX_ART_LEN = int(data['Text'].str.len().quantile(q=quartile)) # maximum and minimum number of words 
                                                               #  based on a quartile of review length

tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(data.Text)
sequences = tokenizer.texts_to_sequences(data.Text)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

X = pad_sequences(sequences, maxlen=MAX_ART_LEN)

y_ohe = keras.utils.to_categorical(data['Score'])
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y_ohe.shape)
print(np.max(X))

Using TensorFlow backend.


Found 72469 unique tokens. Distilled to 72469 top words.
Shape of data tensor: (142113, 714)
Shape of label tensor: (142113, 5)
72469
CPU times: user 20.6 s, sys: 522 ms, total: 21.1 s
Wall time: 21.2 s


In [10]:
from sklearn.model_selection import train_test_split
# Split it into train / test subsets
X_train, X_test, y_train_ohe, y_test_ohe = train_test_split(X, y_ohe, test_size=0.2,
                                                            stratify=data['Score'], 
                                                            random_state=42)
NUM_CLASSES = y_ohe.shape[1]
print(X_train.shape,y_train_ohe.shape)
print(np.sum(y_train_ohe,axis=0))

(113690, 714) (113690, 5)
[ 10426.   5908.   8574.  16131.  72651.]


### Load the embeding

In [11]:
%%time

EMBED_SIZE = 100
# the embed size should match the file you load glove from
embeddings_index = {}
f = open('embeddings/glove.6B/glove.6B.100d.txt')
# save key/array pairs of the embeddings
#  the key of the dictionary is the word, the array is the embedding
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

# Define the embeding layer
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],
                            input_length=MAX_ART_LEN,
                            trainable=False)

Found 400000 word vectors.
(72470, 100)
CPU times: user 12.3 s, sys: 375 ms, total: 12.7 s
Wall time: 12.7 s


## Network 1

In [12]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D # Convolution Layers
from keras.layers import Dense                # Dense Layers
from keras.layers import GRU                  # Recurrent Layers

rnn1 = Sequential()
rnn1.add(embedding_layer)
rnn1.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
rnn1.add(MaxPooling1D(pool_size=2))
rnn1.add(GRU(100,dropout=0.2, recurrent_dropout=0.2))
rnn1.add(Dense(NUM_CLASSES, activation='sigmoid'))
rnn1.compile(loss='categorical_crossentropy',
              optimizer='Adam', 
              metrics=['accuracy'])
print(rnn1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 714, 100)          7247000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 714, 32)           6432      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 357, 32)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               39900     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 7,293,837
Trainable params: 46,837
Non-trainable params: 7,247,000
_________________________________________________________________
None


In [13]:
%%time

rnn1.fit(X_train, y_train_ohe, validation_data=(X_test, y_test_ohe), epochs=4, batch_size=32)

Train on 113690 samples, validate on 28423 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 4h 28min 29s, sys: 1h 8min 6s, total: 5h 36min 36s
Wall time: 1h 27min 41s


<keras.callbacks.History at 0x12c944908>

In [14]:
y_hat = np.argmax(rnn1.predict(X_test), axis=1)

In [15]:
from sklearn.metrics import confusion_matrix

y_test = np.argmax(y_test_ohe, axis=1)

cm = confusion_matrix(y_test, y_hat)
print(cm)

[[ 2007    31   124    19   425]
 [  715    52   306    60   344]
 [  448    41   550   317   788]
 [  212    13   342   665  2801]
 [  488     4   162   288 17221]]


# Results 
#### 10% of Dataset
Train on 45476 samples, validate on 11369 samples

Epoch 1/4
45476/45476 [==============================] - 451s 10ms/step - loss: 1.0291 - acc: 0.6508 - val_loss: 0.8893 - val_acc: 0.6783

Epoch 2/4
45476/45476 [==============================] - 448s 10ms/step - loss: 0.8846 - acc: 0.6804 - val_loss: 0.8173 - val_acc: 0.6968

Epoch 3/4
45476/45476 [==============================] - 453s 10ms/step - loss: 0.8355 - acc: 0.6923 - val_loss: 0.7954 - val_acc: 0.7038

Epoch 4/4
45476/45476 [==============================] - 442s 10ms/step - loss: 0.8052 - acc: 0.6997 - val_loss: 0.7769 - val_acc: 0.7107

CPU times: user 1h 44min 42s, sys: 27min 2s, total: 2h 11min 45s
Wall time: 29min 56s
```
[[ 594    2  138   26  282]
 [ 187    5  183   58  165]
 [  87    4  265  182  316]
 [  32    1  152  315 1107]
 [  58    0  103  206 6901]]
```
#### 25% of Dataset

## Network 2

In [None]:
from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers import Conv1D, MaxPooling1D # Convolution Layers
from keras.layers import Dense                # Dense Layers
from keras.layers import LSTM                 # Recurrent Layers

rnn2 = Sequential()
rnn2.add(embedding_layer)
rnn2.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
rnn2.add(MaxPooling1D(pool_size=2))
rnn2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
rnn2.add(MaxPooling1D(pool_size=2))
rnn2.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
rnn2.add(MaxPooling1D(pool_size=2))
rnn2.add(BatchNormalization())
rnn2.add(LSTM(100,dropout=0.25, recurrent_dropout=0.2))
rnn2.add(Dense(64))
rnn2.add(Dense(64))
rnn2.add(Dense(NUM_CLASSES, activation='sigmoid'))
rnn2.compile(loss='categorical_crossentropy',
              optimizer='Adam', 
              metrics=['accuracy'])
print(rnn2.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 714, 100)          7247000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 714, 32)           9632      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 357, 32)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 357, 64)           6208      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 178, 64)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 178, 64)           12352     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 89, 64)            0         
__________

In [None]:
%%time

rnn2.fit(X_train, y_train_ohe, validation_data=(X_test, y_test_ohe), epochs=4, batch_size=32)

Train on 113690 samples, validate on 28423 samples
Epoch 1/4
Epoch 2/4

In [None]:
y_hat = np.argmax(rnn2.predict(X_test), axis=1)

In [None]:
from sklearn.metrics import confusion_matrix

y_test = np.argmax(y_test_ohe, axis=1)

cm = confusion_matrix(y_test, y_hat)
print(cm)

# Results 
#### 10% of Dataset
Train on 45476 samples, validate on 11369 samples

Epoch 1/4
45476/45476 [==============================] - 237s 5ms/step - loss: 0.9874 - acc: 0.6574 - val_loss: 0.9331 - val_acc: 0.6636

Epoch 2/4
45476/45476 [==============================] - 236s 5ms/step - loss: 0.8625 - acc: 0.6846 - val_loss: 0.8611 - val_acc: 0.6892

Epoch 3/4
45476/45476 [==============================] - 237s 5ms/step - loss: 0.8107 - acc: 0.6965 - val_loss: 0.8028 - val_acc: 0.6973

Epoch 4/4
45476/45476 [==============================] - 236s 5ms/step - loss: 0.7780 - acc: 0.7080 - val_loss: 0.8506 - val_acc: 0.6994

CPU times: user 56min 14s, sys: 14min 44s, total: 1h 10min 59s
Wall time: 15min 49s
```
[[ 616   11   42   10  363]
 [ 198   10   64   39  287]
 [ 104    2   99   77  572]
 [  42    1   33   68 1463]
 [  58    0   25   27 7158]]
```
#### 25% of Dataset