In [1]:
import gensim
import keras
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

import matplotlib as mpl
%matplotlib inline
from matplotlib import pyplot as plt
from keras.utils import plot_model 
from IPython.display import Image
import tensorflow as tf
np.random.seed(1003)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
gensim.__version__

'3.4.0'

In [7]:
tf.__version__

'1.13.1'

In [8]:
keras.__version__

'2.2.4'

In [9]:
**Specify Hyperparameters**

SyntaxError: invalid syntax (<ipython-input-9-637ab76b6519>, line 1)

In [2]:
# These are some hyperparameters that can be tuned
MAX_SENT_LEN = 30 # 90% of corpus
MAX_VOCAB_SIZE = 20000
LSTM_DIM = 128
EMBEDDING_DIM = 300
BATCH_SIZE = 32
N_EPOCHS = 10

In [None]:
**Load Data**

In [3]:
# Read the text files of positive and negative sentences
with open('neg.txt', 'r', errors='ignore') as f:
    neg = f.readlines()
    
with open('pos.txt', 'r', errors='ignore') as f:
    pos = f.readlines()

In [6]:
print('Number of negative sentences:', len(neg))
print('Number of positive sentences:', len(pos))

Number of negative sentences: 400000
Number of positive sentences: 400000


In [4]:
# Create a dataframe to store the sentence and polarity as 2 columns
df = pd.DataFrame(columns=['sentence', 'polarity'])
df['sentence'] = neg + pos
df['polarity'] = [0]*len(neg) + [1]*len(pos)
df = df.sample(frac=1, random_state=10) # Shuffle the rows
df.reset_index(inplace=True, drop=True)

In [8]:
df.head(10)

Unnamed: 0,sentence,polarity
0,That means the actual capacity is more like 20...,0
1,My only problem is that it is MADE IN CHINA.\n,0
2,The box that the item arrived in was kind of m...,0
3,It needed you to press fairly hard to register...,1
4,Now my makeup stays on longer and oil no longe...,0
5,This is the worst canned coffee I've ever had.\n,0
6,The secondary can be either incredibly telepat...,0
7,"I use these for cooking, eating, and especiall...",1
8,There's a gap between it and the ear when it's...,1
9,I guess they made their money and don't care.\n,0


In [5]:
# Pre-processing involves removal of puctuations and converting text to lower case
word_seq = [text_to_word_sequence(sent) for sent in df['sentence']]
print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq], 90))

90th Percentile Sentence Length: 23.0


In [6]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq])

print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 93830


In [11]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'and': 3,
 'it': 4,
 'a': 5,
 'to': 6,
 'this': 7,
 'is': 8,
 'of': 9,
 'for': 10,
 'my': 11,
 'in': 12,
 'that': 13,
 'with': 14,
 'on': 15,
 'have': 16,
 'not': 17,
 'but': 18,
 'you': 19,
 'was': 20,
 'as': 21,
 'so': 22,
 'one': 23,
 'are': 24,
 'they': 25,
 'be': 26,
 'like': 27,
 'use': 28,
 'just': 29,
 'very': 30,
 'if': 31,
 'all': 32,
 'or': 33,
 'at': 34,
 'had': 35,
 'out': 36,
 'these': 37,
 "it's": 38,
 'would': 39,
 'when': 40,
 'up': 41,
 'can': 42,
 'good': 43,
 'from': 44,
 'great': 45,
 'them': 46,
 'will': 47,
 'get': 48,
 'no': 49,
 'well': 50,
 'more': 51,
 'me': 52,
 'has': 53,
 'your': 54,
 'about': 55,
 'than': 56,
 'product': 57,
 'phone': 58,
 'an': 59,
 'time': 60,
 'do': 61,
 'only': 62,
 "don't": 63,
 'after': 64,
 'bought': 65,
 'other': 66,
 'really': 67,
 'much': 68,
 'used': 69,
 'even': 70,
 'because': 71,
 'also': 72,
 'we': 73,
 'there': 74,
 'too': 75,
 'what': 76,
 'off': 77,
 'some': 78,
 'case': 79,
 'work': 80,
 'does': 81,

In [7]:
# Convert the sequence of words to sequnce of indices
X = tokenizer.texts_to_sequences([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq])
X = pad_sequences(X, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

y = df['polarity']

In [13]:
X[:3]

array([[  13,  741,    1, 1052, 1342,    8,   51,   27, 4286, 3432,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [  11,   62,  175,    8,   13,    4,    8,   93,   12,  959,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [   1,  246,   13,    1,  217,  482,   12,   20,  419,    9, 9130,
        5760,   13,    4,   20,  580,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]], dtype=int32)

In [8]:
# Split the dataset to train, validate, and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [15]:
X_train.shape

(648000, 30)

In [9]:
%%time
# Load the word2vec embeddings 
import pickle
embeddings = gensim.models.Word2Vec.load('w2v_amazon_data.pkl')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 325 ms, sys: 117 ms, total: 442 ms
Wall time: 549 ms


In [17]:
print('Number of words in this pre-trained w2v model:', len(embeddings.vocab))
print('Dimension of w2v:', embeddings.vector_size)

AttributeError: 'Word2Vec' object has no attribute 'vocab'

In [10]:
# Create an embedding matrix containing only the word's in our vocabulary
# If the word does not have a pre-trained embedding, then randomly initialize the embedding
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, EMBEDDING_DIM)) # +1 is because the matrix indices start with 0

for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector
        
del embeddings

  import sys


In [19]:
**Keras Sequential API**

SyntaxError: invalid syntax (<ipython-input-19-d670019f8ac3>, line 1)

In [11]:
# Build a sequential model by stacking neural net units 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=False, name='word_embedding_layer', 
                          mask_zero=True))

model.add(LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'))

model.add(Dense(1, activation='sigmoid', name='output_layer'))

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_embedding_layer (Embedd (None, None, 300)         28149300  
_________________________________________________________________
lstm_layer (LSTM)            (None, 128)               219648    
_________________________________________________________________
output_layer (Dense)         (None, 1)                 129       
Total params: 28,369,077
Trainable params: 219,777
Non-trainable params: 28,149,300
_________________________________________________________________


In [13]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(X_train[:2000], y_train[:2000],
          batch_size=BATCH_SIZE,
          epochs=1,
          validation_data=(X_val[:200], y_val[:200]))

Instructions for updating:
Use tf.cast instead.
Train on 2000 samples, validate on 200 samples
Epoch 1/1
