In [1]:
#https://github.com/javaidnabi31/Word-Embeddding-Sentiment-Classification/blob/master/lstm-gru-sentiment-analysis.ipynb
#https://github.com/javaidnabi31/Word-Embeddding-Sentiment-Classification

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame()
df = pd.read_csv('/Users/KumarSanjeev/Desktop/movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [3]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

review_lines = list()
lines = df['review'].values.tolist()

for line in lines:   
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word    
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words    
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)

In [4]:
len(review_lines)

50000

In [5]:

import gensim 

EMBEDDING_DIM = 100
# train word2vec model
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
# vocab size
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))



Vocabulary size: 134121


In [6]:

# let us try some utility functions of gensim word2vec more details here 

model.wv.most_similar('horrible')#, topn =1)

[('terrible', 0.9223604202270508),
 ('awful', 0.8547260761260986),
 ('horrendous', 0.7724288702011108),
 ('pathetic', 0.7651054859161377),
 ('sucks', 0.761432409286499),
 ('dreadful', 0.7544825673103333),
 ('atrocious', 0.7536700963973999),
 ('horrid', 0.7454558610916138),
 ('ridiculous', 0.7207680940628052),
 ('laughable', 0.7020524740219116)]

In [7]:

#Let’s see the result of semantically reasonable word vectors (king - man + woman)
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('romeo', 0.8792762160301208),
 ('princess', 0.8715932965278625),
 ('prince', 0.8560853004455566),
 ('bride', 0.8518150448799133),
 ('juliet', 0.8507338166236877),
 ('onionpeeling', 0.8435370326042175),
 ('musee', 0.841188371181488),
 ('cortland', 0.8298543095588684),
 ('jerol', 0.8290295600891113),
 ('tristran', 0.827930748462677)]

In [9]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [10]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [11]:

# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [12]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews) 

# pad sequences
max_length = max([len(s.split()) for s in total_reviews])

# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')


  from ._conv import register_converters as _register_converters


In [13]:
print(vocab_size)

125602


In [14]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Using TensorFlow backend.


Build model...
Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2678, 100)         12560200  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 12,573,001
Trainable params: 12,573,001
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print('Train...')

model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)


Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
 - 581s - loss: 0.6937 - acc: 0.5009 - val_loss: 0.6935 - val_acc: 0.5000
Epoch 2/25
 - 579s - loss: 0.6934 - acc: 0.4968 - val_loss: 0.6939 - val_acc: 0.5000
Epoch 3/25
 - 1193s - loss: 0.6934 - acc: 0.4945 - val_loss: 0.6932 - val_acc: 0.5000
Epoch 4/25
 - 561s - loss: 0.6933 - acc: 0.5038 - val_loss: 0.6932 - val_acc: 0.5000
Epoch 5/25
 - 2707s - loss: 0.6932 - acc: 0.5038 - val_loss: 0.6932 - val_acc: 0.5000
Epoch 6/25
 - 651s - loss: 0.6933 - acc: 0.4938 - val_loss: 0.6932 - val_acc: 0.5000
Epoch 7/25


In [None]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

In [None]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

In [None]:

#let us check how the model predicts
classes = model.predict(X_test_pad[:10], batch_size=128)
for i in range (0,10):
    if(classes[i] > 0.5 and y_test[i] == 1 or (classes[i] <= 0.5 and y_test[i] == 0)):
        print( classes[i], y_test[i], " Right prdiction")
    else :
        print( classes[i], y_test[i], " Wrong prdiction")

In [None]:
from keras.datasets import imdb
from keras.models import Sequential
from tensorflow.python.keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print('Build model...')

model = Sequential()
model.add(Embedding(top_words, 100, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

In [None]:
print('Train...')

model.fit(X_train, y_train, batch_size=128, epochs=25, validation_data=(X_test, y_test), verbose=2)

In [None]:
score, acc = model.evaluate(X_test, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: %.2f%%" % (acc*100))