In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import numpy as np

import xgboost as xgb
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.metrics import accuracy_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, BatchNormalization, Embedding, Flatten

In [3]:
# Number of tokens every review is padded/truncated to before it is fed to the network.
MAX_SEQUENCE_LENGTH = 100
# Width of each word vector; used for the embedding matrix and the Embedding layer.
EMBEDDING_DIM = 16

In [4]:
# Load the movie-review dataset, decoding the file as latin-1.
# NOTE(review): the path is relative, so it resolves against the notebook's working directory.
data = pd.read_csv('../../chapter 8/data/movie_reviews.csv', encoding='latin-1')

In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
data.shape

(25000, 2)

In [6]:
# Lower-case every review up front; the cleaning patterns applied next are written in lower case.
data.SentimentText = data.SentimentText.str.lower()

In [7]:
# Ordered (pattern, replacement) rules applied by clean_str. Order matters: the URL
# and HTML rules must run before the punctuation rule strips ':', '/', '&' and ';',
# and the contraction rules rely on the apostrophe surviving the punctuation rule.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"https?\://\S+", ''),                          # drop URLs
        (r'\<a href', ' '),                              # opening of an HTML anchor tag
        (r'&amp;', ''),                                  # HTML-escaped ampersand
        (r'<br />', ' '),                                # HTML line break
        (r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' '),         # punctuation -> space
        (r'\d', ''),                                     # digits (raw string: no invalid escape)
        (r"can't", "cannot"),                            # expand contractions
        (r"it's", "it is"),
    )
)


def clean_str(string):
    """Strip URLs, HTML remnants, punctuation and digits from a review.

    The input is expected to be lower-cased already, because the patterns are
    written in lower case. Whitespace is not collapsed, so removed spans may
    leave consecutive spaces behind.

    Args:
        string: the review text as a ``str``.

    Returns:
        The cleaned text.
    """
    for pattern, replacement in _CLEAN_RULES:
        string = pattern.sub(replacement, string)
    return string

In [8]:
# Clean every review with clean_str; str() coerces any non-string cell to text first.
data.SentimentText = data.SentimentText.apply(lambda x: clean_str(str(x)))

In [9]:
# Ten most frequent whitespace-separated tokens across all reviews
# (presumably inspected to pick corpus-specific stop words such as 'movie' and 'film').
pd.Series(' '.join(data['SentimentText']).split()).value_counts().head(10)

movie    43558
film     39095
it       30659
one      26509
is       20355
like     20270
good     15099
the      13913
time     12682
even     12656
dtype: int64

In [10]:
# NLTK's English stop words plus a few very frequent, corpus-specific terms.
stop_words = set(stopwords.words('english') + ['movie', 'film', 'time'])


def remove_stop_words(r):
    """Split a review into sentences and drop stop words from each sentence.

    Returns a list holding one list of kept tokens per sentence.
    """
    kept_sentences = []
    for sente in sent_tokenize(r):
        kept_tokens = [word for word in word_tokenize(sente) if word not in stop_words]
        kept_sentences.append(kept_tokens)
    return kept_sentences


# Replace each review string by its nested list of stop-word-free tokens.
data['SentimentText'] = data['SentimentText'].apply(remove_stop_words)

In [10]:
# Train Word2Vec on the first tokenized sentence of each review (one token list per review).
# A review that produced no sentences contributes an empty token list instead of
# raising IndexError on `[0]`.
first_sentences = [sentences[0] if sentences else [] for sentences in data['SentimentText']]

# NOTE(review): `iter` and `size` are the keyword names used by gensim releases before 4.0
# (later renamed `epochs` / `vector_size`) — confirm against the installed version.
model = Word2Vec(
        first_sentences,
        iter=10,
        size=EMBEDDING_DIM,  # must match the width of the embedding matrix built from these vectors
        window=5,
        min_count=5,
        workers=10)

In [11]:
# Export the learned vectors as text so they can be re-read when the embedding matrix is built.
model.wv.save_word2vec_format('movie_embedding.txt', binary=False)

In [11]:
def combine_text(text):
    """Join the tokens of the first sentence of a tokenized review into one string.

    Args:
        text: sequence of sentences, each itself a sequence of ``str`` tokens.

    Returns:
        The tokens of ``text[0]`` joined by single spaces, or ``np.nan`` when
        ``text`` is empty, not indexable, or its first item cannot be joined.
    """
    try:
        return ' '.join(text[0])
    # Only the failures an empty or malformed review can cause are mapped to NaN;
    # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    except (IndexError, KeyError, TypeError):
        return np.nan

In [12]:
# Collapse each tokenized review back into one space-separated string (NaN where that fails).
data.SentimentText = data.SentimentText.apply(combine_text)

In [13]:
# Drop every row that contains a missing value, which includes reviews combine_text mapped to NaN.
data = data.dropna(how='any')

In [14]:
# Fit a Keras tokenizer on the cleaned reviews; num_words=5000 caps the vocabulary used
# when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(data['SentimentText']))
sequences = tokenizer.texts_to_sequences(data['SentimentText'])
# word_index covers every token seen during fitting, not only the top num_words
# (the count printed below is far larger than 5000).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 77348 unique tokens.


In [15]:
# Pad/truncate every integer sequence to MAX_SEQUENCE_LENGTH so all reviews have the same length.
reviews = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
def load_embedding(filename, word_index , num_words, embedding_dim):
    """Build an embedding matrix from a text-format word-vector file.

    Every data line of the file is expected to look like ``word v1 v2 ... vN``,
    separated by whitespace. Blank lines are skipped, and so is a leading
    word2vec-style header line (``vocab_size dim``), recognised as a first line
    whose field count is not ``embedding_dim + 1``.

    Args:
        filename: path to the UTF-8 encoded vector file.
        word_index: mapping of word -> integer row position in the matrix.
        num_words: number of rows of the returned matrix; words whose position
            is >= ``num_words`` are ignored.
        embedding_dim: number of columns of the returned matrix.

    Returns:
        A float numpy array of shape ``(num_words, embedding_dim)``. Rows of
        words that do not appear in the file stay all-zero.
    """
    embeddings_index = {}
    expected_fields = embedding_dim + 1
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(filename, encoding="utf-8") as vector_file:
        for line_number, line in enumerate(vector_file):
            values = line.split()
            if not values:
                continue
            # The header is not a word vector; storing it would register the
            # vocabulary size as a "word" with a one-element vector.
            if line_number == 0 and len(values) != expected_fields:
                continue
            # Parse to floats here rather than relying on an implicit
            # string -> float cast at assignment time.
            embeddings_index[values[0]] = np.asarray(values[1:], dtype=np.float64)

    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, pos in word_index.items():
        if pos >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[pos] = embedding_vector
    return embedding_matrix

In [17]:
# Build the embedding matrix with len(word_index) rows from the exported vectors.
# NOTE(review): load_embedding skips positions >= num_words; if word_index positions start
# at 1 (presumably, as with the Keras Tokenizer), the word at position len(word_index) gets
# no row — confirm whether len(word_index) + 1 was intended.
embedding_matrix = load_embedding('movie_embedding.txt', word_index, len(word_index), EMBEDDING_DIM)

In [18]:
# Hold out 20% of the reviews for evaluation. Labels are one-hot encoded with pd.get_dummies,
# and random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(reviews, pd.get_dummies(data.Sentiment), test_size=0.2, random_state=9)

In [59]:
# Feed-forward sentiment classifier on top of the frozen, pre-trained word vectors.
inp = Input((MAX_SEQUENCE_LENGTH,))
# Size the Embedding layer from the weight matrix itself so the layer's vocabulary size
# and vector width can never disagree with the weights loaded into it.
embedding_layer = Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False)(inp)
# Intermediate tensors get their own name; `model` is reserved for the final Model object.
hidden = Flatten()(embedding_layer)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.10)(hidden)
hidden = Dense(units=256, activation='relu')(hidden)
hidden = Dense(units=64, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
predictions = Dense(units=2, activation='softmax')(hidden)
model = Model(inputs = inp, outputs = predictions)

# The head is a two-unit softmax trained on one-hot targets, i.e. a categorical problem,
# so the matching loss is categorical cross-entropy (binary cross-entropy is meant for
# independent sigmoid outputs).
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics = ['acc'])

In [60]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 16)           1237568   
_________________________________________________________________
flatten_3 (Flatten)          (None, 1600)              0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 1600)              6400      
_________________________________________________________________
dropout_5 (Dropout)          (None, 1600)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               409856    
_________________________________________________________________
dense_8 (Dense)              (None, 64)                16448     
__________

In [61]:
# Train for 10 epochs with mini-batches of 256; the held-out test split is also used as validation data.
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs=10, batch_size=256)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2101ae46470>

In [62]:
# Per-class model outputs for every held-out review.
preds = model.predict(X_test)

In [63]:
# accuracy_score takes (y_true, y_pred); argmax turns the one-hot labels and the
# per-class model outputs into class ids.
accuracy_score(np.argmax(y_test.values, 1), np.argmax(preds, 1))

0.7634

In [64]:
# Confusion matrix of actual vs. predicted class ids; margins=True adds the row/column totals.
y_actual = pd.Series(np.argmax(y_test.values, axis=1), name='Actual')
y_pred = pd.Series(np.argmax(preds, axis=1), name='Predicted')
pd.crosstab(y_actual, y_pred, margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1774,679,2453
1,504,2043,2547
All,2278,2722,5000


In [65]:
# Inspect one held-out review: decode its token ids back to text and compare the
# predicted class with the actual one.
review_num = 111
print("Review: \n"+tokenizer.sequences_to_texts([X_test[review_num]])[0])
# argmax is 0 or 1, so its truthiness maps class index 1 to "Positive" and 0 to "Negative".
sentiment = "Positive" if np.argmax(preds[review_num]) else "Negative"
print("\nPredicted sentiment = "+ sentiment)
sentiment = "Positive" if np.argmax(y_test.values[review_num]) else "Negative"
print("\nActual sentiment = "+ sentiment)

Review: 
love love love another absolutely superb performance miss beginning end one big treat n't rent buy

Predicted sentiment = Positive

Actual sentiment = Positive
