In [1]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
reviews = pd.read_csv('IMDB Dataset.csv')
tokenizer = TreebankWordTokenizer()

In [3]:
val = 12500
reviews = reviews[:val]

In [4]:
import time
start_time = time.time()

import string
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

review_lines = []

for review in reviews['review']:
    tokens = TreebankWordTokenizer().tokenize(review)
    ## Convert to Lowercase
    tokens = (w.lower() for w in tokens)
    ## Remove punctuation 
    table = str.maketrans('','',string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    ## Filter out Stop-Words
    stop_words = set(stopwords.words('english'))
    words = [w for w in stripped if not w in stop_words]
    
    review_lines.append(words)

    
print("--- %s seconds ---" % (time.time() - start_time))    

--- 13.470518827438354 seconds ---


In [5]:
## Convert to Word2Vec embeddings
from gensim.models import Word2Vec 

train_vector = Word2Vec(review_lines, min_count=1, size=100, window=2, workers=4)

words = list(train_vector.wv.vocab)
print ('Vocabulary : %d' % len(words))

Vocabulary : 69566


In [6]:
## Define a max review length 
max_length = max([len(s.split()) for s in reviews['review']])

In [7]:
## Mapping the word embeddings to a 2D vector
embedded_matrix = np.zeros((len(words), 100))

for index, word in enumerate(words):
    if index > len(words):
        continue
    embedded_vector = train_vector[word]
    if embedded_vector is not None:
        ## Word not found in embedding index will be zero
        embedded_matrix[index] = embedded_vector
        

  import sys


In [8]:
embedded_matrix.shape

(69566, 100)

In [9]:
## Get Sentiments
sentiment = []

for i in reviews['sentiment']:
    if i == 'positive':
        sentiment.append(1)
    else:
        sentiment.append(0)
        
sentiment = np.array(sentiment)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Vectorize into a 2D tensor
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
encoded_docs = tokenizer.texts_to_sequences(review_lines)

## Pad encoded_docs
word_index = tokenizer.word_index
pad_reviews = pad_sequences(encoded_docs, maxlen=max_length)

Using TensorFlow backend.


In [11]:
## Randomly Shuffle reviews
indices = np.arange(pad_reviews.shape[0])
np.random.shuffle(indices)
pad_reviews = pad_reviews[indices]
sentiment = sentiment[indices]

In [15]:
## Train the embedding matrix in the model setting trainable = False, as embedding is already learned

from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

## Define model
model = Sequential()
embedding_layer = Embedding(len(words)+1, 
                           100,
                           embeddings_initializer = Constant(embedded_matrix),
                           trainable = False,
                           input_length = max_length)
model.add(embedding_layer)
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
#model.add(GRU(32, dropout =0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

## Compile Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1830, 100)         6956700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 7,074,077
Trainable params: 117,377
Non-trainable params: 6,956,700
_________________________________________________________________


In [17]:
print ('Train....')

model.fit(pad_reviews, sentiment,
         batch_size = 128, 
         epochs = 10,
         verbose = 2,
         validation_split = 0.2) 

Train....
Train on 10000 samples, validate on 2500 samples
Epoch 1/10


KeyboardInterrupt: 