In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os, re
import nltk
BASE_DIR = '../input/'
LABELED_TRAIN_DF = BASE_DIR + 'labeled_train_clean_reviews.csv'
TEST_DF = BASE_DIR + 'test_clean_reviews.csv'

In [2]:
labeled_train = pd.read_csv(LABELED_TRAIN_DF, header = 0)
test = pd.read_csv(TEST_DF, header = 0)
labeled_train["review"] = labeled_train["review"].astype(str)
test["review"] = test["review"].astype(str)
print "Read %d labeled train reviews" \
          "and %d test reviews" % (labeled_train["review"].size, test["review"].size)

Read 25000 labeled train reviewsand 25000 test reviews


Data leakage

Check if test["sentiment"] is correct

In [3]:
test["sentiment"] = test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)

Prepare

In [4]:
train_clean_reviews = labeled_train["review"].tolist()
test_clean_reviews = test["review"].tolist()

all_clean_reviews = train_clean_reviews + test_clean_reviews

In [5]:
print(np.random.choice(all_clean_reviews, 1))

['based on actual events of  one  nine  zero  five   silent film the battleship potemkin concerns an imperial russian ship on which abominable conditions lead to a mutiny  shocked by conditions on the ship  citizens of the port city odessa rally to the mutineers  support  and in consequence find themselves at the mercy of imperial forces  who attack the civilian supporters with savage force potemkin is a film in which individual characters are much less important than the groups and crowds of which they are members  and it achieves its incredible power by showing the clash of the groups and crowds in a series of extraordinarily visualized and edited sequences  amazingly  each of these sequences manage to top the previous one  and the film actually builds in power as it moves from the mutiny to the citizen s rally to the massacre on the odessa steps  the latter of which is among the most famous sequences in all of film history  filming largely where the real events actually occurred  di

In [40]:
# We vectorize the text corpus by turning each text into a sequence of integers
# Each integer is the index of a token in the dictionary
from keras.preprocessing.text import Tokenizer

MAX_NUM_WORDS_FOR_KERAS_TOKENIZER = 500
#
# num_words: the maximum number of words to keep, based on frequency.
keras_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS_FOR_KERAS_TOKENIZER)
#
# fit_on_texts accepts a list of strings, a generator of strings or 
# a list of list of strings. In the last case, it assumes each entry of the lists to be a token.
# Here we provide a list of strings.
keras_tokenizer.fit_on_texts(train_clean_reviews)
word_index = keras_tokenizer.word_index
print('Keras Tokenizer found %s unique tokens' % len(word_index))

Keras Tokenizer found 74217 unique tokens


In [41]:
embedded_train = keras_tokenizer.texts_to_matrix(train_clean_reviews, mode='count')
embedded_test = keras_tokenizer.texts_to_matrix(test_clean_reviews, mode='count')

In [42]:
print(len(embedded_train), len(embedded_train[0]))
print(len(embedded_test), len(embedded_test[0]))
print(embedded_train[0])
print(embedded_test[0])

(25000, 500)
(25000, 500)
[ 0. 19. 10. 10. 11.  9. 16.  8.  4. 10. 11.  2.  4.  2.  2.  4.  5.  4.
  3.  4.  2.  1.  2.  2.  2.  9.  2.  3.  1.  1.  4.  2.  0.  0.  2.  3.
  4.  0.  3.  1.  1.  0.  4.  3.  2.  0.  2.  1.  0.  2.  0.  1.  1.  0.
  0.  3.  0.  1.  0.  0.  0.  1.  1.  3.  0.  1.  2.  1.  0.  1.  0.  0.
  1.  2.  0.  0.  0.  1.  3.  0.  5.  0.  0.  0.  2.  2.  0.  0.  2.  0.
  1.  1.  4.  1.  0.  2.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  3.  0.
  1.  1.  0.  1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  2.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  3.  0.  0.  2.  1.
  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  2.  1.  2.  0.
  0.  1.  1.  0.  0.  0. 

In [44]:
# Reshape data into a 25000 x 5000 x 1 array for feedint 1D Covolution
embedded_train = np.reshape(embedded_train, (len(embedded_train), MAX_NUM_WORDS_FOR_KERAS_TOKENIZER, 1))
embedded_test = np.reshape(embedded_test, (len(embedded_test), MAX_NUM_WORDS_FOR_KERAS_TOKENIZER, 1))

In [45]:
print(len(embedded_train), len(embedded_train[0]), len(embedded_train[0][0]))
print(len(embedded_test), len(embedded_test[0]), len(embedded_test[0][0]))
print(embedded_train[0])
print(embedded_test[0])

(25000, 500, 1)
(25000, 500, 1)
[[ 0.]
 [19.]
 [10.]
 [10.]
 [11.]
 [ 9.]
 [16.]
 [ 8.]
 [ 4.]
 [10.]
 [11.]
 [ 2.]
 [ 4.]
 [ 2.]
 [ 2.]
 [ 4.]
 [ 5.]
 [ 4.]
 [ 3.]
 [ 4.]
 [ 2.]
 [ 1.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 9.]
 [ 2.]
 [ 3.]
 [ 1.]
 [ 1.]
 [ 4.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 0.]
 [ 3.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 4.]
 [ 3.]
 [ 2.]
 [ 0.]
 [ 2.]
 [ 1.]
 [ 0.]
 [ 2.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 3.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 3.]
 [ 0.]
 [ 1.]
 [ 2.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 3.]
 [ 0.]
 [ 5.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 4.]
 [ 1.]
 [ 0.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 3.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [

In [31]:
# Split train_sequences into train and validation. Ratio: 80/20
VALIDATION_SPLIT = 0.2
np.random.seed(1234)

# 
perm = np.random.permutation(len(embedded_train))
index_train = perm[:int(len(embedded_train)*(1-VALIDATION_SPLIT))]
index_val = perm[int(len(embedded_train)*(1-VALIDATION_SPLIT)):]

x_train = embedded_train[index_train]
x_val = embedded_train[index_val]
y_train = labeled_train["sentiment"][index_train].tolist()
y_val = labeled_train["sentiment"][index_val].tolist()

print('Randomly split %d pad sequences for training, %d for validation' % (len(x_train) ,len(x_val)))

Randomly split 20000 pad sequences for training, 5000 for validation


In [32]:
x_test = embedded_test
y_test = test["sentiment"]

In [6]:
# MAX_FEATURES = 5000
# # Bag of Words
# vectorizer = CountVectorizer(analyzer = "word",   \
#                          tokenizer = None,    \
#                          preprocessor = None, \
#                          stop_words = None,   \
#                          max_features = MAX_FEATURES)

In [7]:
# train_data_features = vectorizer.fit_transform(train_clean_reviews)
# test_data_features = vectorizer.transform(test_clean_reviews)
# train_data_features = train_data_features.toarray()
# test_data_features = test_data_features.toarray()

In [8]:
# print(len(train_data_features), len(train_data_features[0]))
# print(len(test_data_features), len(test_data_features[0]))

(25000, 5000)
(25000, 5000)


In [9]:
# # Reshape data into a 25000 x 5000 x 1 array for feedint 1D Covolution
# train_data_features = np.reshape(train_data_features, (len(train_data_features), MAX_FEATURES, 1))
# test_data_features = np.reshape(test_data_features, (len(test_data_features), MAX_FEATURES, 1))

In [10]:
# print(len(train_data_features), len(train_data_features[0]), len(train_data_features[0][0]))
# print(len(test_data_features), len(test_data_features[0]), len(test_data_features[0][0]))
# print(train_data_features[0])

(25000, 5000, 1)
(25000, 5000, 1)
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [11]:
# train_data_features = np.expand_dims(train_data_features, axis=2)
# print(len(train_data_features), len(train_data_features[0]), len(train_data_features[0][0]))
# print(len(test_data_features), len(test_data_features[0]), len(test_data_features[0][0]))
# print(train_data_features[0][0])

In [12]:
# # Split train_sequences into train and validation. Ratio: 80/20
# VALIDATION_SPLIT = 0.2
# np.random.seed(1234)

# # 
# perm = np.random.permutation(len(train_data_features))
# index_train = perm[:int(len(train_data_features)*(1-VALIDATION_SPLIT))]
# index_val = perm[int(len(train_data_features)*(1-VALIDATION_SPLIT)):]

# x_train = train_data_features[index_train]
# x_val = train_data_features[index_val]
# y_train = labeled_train["sentiment"][index_train].tolist()
# y_val = labeled_trained_train["sentiment"][index_val].tolist()

# print('Randomly split %d pad sequences for training, %d for validation' % (len(x_train) ,len(x_val)))


Randomly split 20000 pad sequences for training, 5000 for validation


In [13]:
# x_test = test_data_features
# y_test = test["sentiment"]

In [34]:
print(len(x_train), len(x_train[0]), len(x_train[0][0]))
print(len(x_val), len(x_val[0]), len(x_val[0][0]))

(20000, 5000, 1)
(5000, 5000, 1)


## Model Architecture

In [35]:
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, Conv1D, MaxPooling1D ,GlobalMaxPooling1D
from keras.initializers import Constant

In [36]:
def cnn_rnn_model(dropout_cnn=0.5, dropout_rnn=0.2, num_filters=64, kernel_size=2):
    model = Sequential()
    
    LSTM_UNITS = 32
    LSTM_DROPOUT = 0.
    LSTM_RECCURENT_DROPOUT = 0.

    output_layer = Dense(1, activation='sigmoid')
    
    model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1
                    , input_shape=(5000, 1)))
    # Cannot use GlobalMaxPooling since you're feeding it into RNN
    model.add(MaxPooling1D(2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_cnn))
    model.add(Bidirectional(LSTM(LSTM_UNITS, 
                                 dropout=LSTM_DROPOUT, 
                                 recurrent_dropout=LSTM_RECCURENT_DROPOUT
                                )))
    model.add(Dropout(dropout_rnn))
    model.add(output_layer)
    
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    
    return model

In [37]:
model = cnn_rnn_model()

In [38]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 4999, 64)          192       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2499, 64)          0         
_________________________________________________________________
dense_4 (Dense)              (None, 2499, 32)          2080      
_________________________________________________________________
dropout_3 (Dropout)          (None, 2499, 32)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                16640     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total para

In [39]:
# batch_size: number of samples per gradient update
history = model.fit(x_train, y_train, batch_size=64, epochs=6, validation_data=[x_val, y_val])

Train on 20000 samples, validate on 5000 samples
Epoch 1/6
  832/20000 [>.............................] - ETA: 40:54 - loss: 0.7031 - acc: 0.4952

KeyboardInterrupt: 

In [None]:
y_test_pred = model.predict(x_test)

In [None]:
y_test_pred_binary = map(lambda predict: 1 if predict > 0.5 else 0, y_test_pred)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [None]:
print("The AUC score for CNN-BiLSTM model is : %.5f." %roc_auc_score(y_test, y_test_pred_binary))

In [None]:
# Write the test results
output = pd.DataFrame(data={"id": test["id"], "sentiment": y_test_pred_binary})
output.to_csv(os.path.join('../', 'output', "cnn_bilstm_bow_6epoch.csv"), index=False, quoting=3)
print "Wrote to cnn_bilstm_bow_6epoch.csv"

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, color='orange', label='Train')
plt.plot(epochs, val_loss, color='green', label='Validation')
plt.title('Loss: cnn_bilstm_bow_6epoch')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('../figures/cnn_bilstm_bow_6epoch_loss.png', dpi=900)

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']

plt.plot(epochs, acc, color='orange', label='Train')
plt.plot(epochs, val_acc, color='blue', label='Validation')
plt.title('Accuracy: cnn_bilstm_bow_6epoch')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('../figures/cnn_bilstm_bow_6epoch_accuracy.png', dpi=900)