In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import pickle
import os, re
import nltk
# Folder that holds every input file used by this notebook.
BASE_DIR = '../input/'
# Tab-separated review files read in the next cell.
LABELED_TRAIN_DF = os.path.join(BASE_DIR, 'labeledTrainData.tsv')
UNLABELED_TRAIN_DF = os.path.join(BASE_DIR, 'unlabeledTrainData.tsv')
TEST_DF = os.path.join(BASE_DIR, 'testData.tsv')
# Show what is actually present in the input folder.
print(os.listdir(BASE_DIR))

['word2vec_model_300dim_40minwords_10context', 'sampleSubmission.csv', 'labeledTrainData.tsv', 'test_submission.csv', 'testData.tsv', 'unlabeledTrainData.tsv']


In [2]:
# Shared parsing options for the three review files: first row is the header,
# fields are tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps quote
# characters as literal text instead of treating them as field delimiters.
_TSV_OPTIONS = dict(header=0, delimiter='\t', quoting=3)

labeled_train = pd.read_csv(LABELED_TRAIN_DF, **_TSV_OPTIONS)
unlabeled_train = pd.read_csv(UNLABELED_TRAIN_DF, **_TSV_OPTIONS)
test = pd.read_csv(TEST_DF, **_TSV_OPTIONS)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Read %d labeled train reviews, %d unlabeled train reviews, "
      "and %d test reviews" % (labeled_train["review"].size,
                               unlabeled_train["review"].size,
                               test["review"].size))

Read 25000 labeled train reviews, 50000 unlabeled train reviews, and 25000 test reviews


Data leakage

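Check if test["sentiment"] is correct: the labels below are derived from the rating embedded in each test id (rating >= 5 -> 1, otherwise 0).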

In [3]:
def _sentiment_from_id(review_id):
    """Return 1 when the number after the underscore in the id is >= 5, else 0."""
    # Ids are quoted in the raw file, so strip the quotes before splitting.
    rating = int(review_id.strip('"').split("_")[1])
    return int(rating >= 5)

test["sentiment"] = test["id"].map(_sentiment_from_id)

Credits: Kaggle tutorial

In [4]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Spelled-out replacements used when digits are kept (remove_numbers=False).
_DIGIT_WORDS = (
    ('0', ' zero '), ('1', ' one '), ('2', ' two '), ('3', ' three '),
    ('4', ' four '), ('5', ' five '), ('6', ' six '), ('7', ' seven '),
    ('8', ' eight '), ('9', ' nine '),
)


def review_to_clean_review(review, remove_numbers=True, stem_words=False):
    """Convert a raw review into a cleaned, lower-cased string.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_numbers: If True, every digit is replaced by a space.
            If False, every digit is replaced by its English word
            (e.g. '7' -> ' seven ').
        stem_words: If True, each word is stemmed with the English
            Snowball stemmer and the words are re-joined with single spaces.

    Returns:
        The cleaned review as a string.
    """
    # 1. Remove HTML using lxml parser, ranked best by bs4
    review_text = BeautifulSoup(review, "lxml").get_text()
    #
    # TODO: Clean the text! stemming?
    # https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings#L92
    # https://www.kaggle.com/currie32/the-importance-of-cleaning-text
    #
    # 2. Remove non-letters and non-numbers
    review_text = re.sub("[^a-zA-Z0-9]", " ", review_text)
    #
    # 3. Remove digits, or spell them out
    if remove_numbers:
        review_text = re.sub("[0-9]", " ", review_text)
    else:
        for digit, word in _DIGIT_WORDS:
            review_text = review_text.replace(digit, word)

    # 4. Normalise case
    review_text = review_text.lower()

    # 5. Optionally stem. The stemmed words must be written back into
    #    review_text, otherwise the stemming is silently discarded.
    if stem_words:
        stemmer = SnowballStemmer('english')
        review_text = " ".join(stemmer.stem(word) for word in review_text.split())

    # 6. Return a cleaned string
    return review_text

def review_to_wordlist(review, remove_stopwords=False, remove_numbers=True, stem_words=False):
    """Convert a document to a list of cleaned words.

    Args:
        review: Raw review text.
        remove_stopwords: If True, drop NLTK English stop words.
        remove_numbers: Forwarded to review_to_clean_review.
        stem_words: Forwarded to review_to_clean_review.

    Returns:
        A list of words.
    """
    # 1. Clean review, split it into words. Options are passed by keyword so
    #    each flag reaches the parameter of the same name.
    words = review_to_clean_review(
        review, remove_numbers=remove_numbers, stem_words=stem_words).split()
    #
    # 2. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 3. Return a list of words
    return words

def review_to_sentences(review, tokenizer, remove_stopwords=False, remove_numbers=True):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Review text, either a byte string (decoded as UTF-8) or an
            already-decoded text string.
        tokenizer: Object with a tokenize(text) method that returns the raw
            sentences of the text.
        remove_stopwords: Forwarded to review_to_wordlist.
        remove_numbers: Forwarded to review_to_wordlist.

    Returns:
        A list of sentences, where each sentence is a list of words.
    """
    # Only byte strings need decoding; calling .decode on text either fails
    # (Python 3) or triggers an implicit ASCII encode (Python 2 unicode).
    if isinstance(review, bytes):
        review = review.decode('utf8')
    #
    # 1. Use the tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Convert every non-empty sentence to a list of words
    sentences = []
    for raw_sentence in raw_sentences:
        if raw_sentence:
            sentences.append(
                review_to_wordlist(raw_sentence, remove_stopwords, remove_numbers))
    #
    # Return the list of sentences (a list of lists)
    return sentences

In [5]:
from gensim.models import Word2Vec
# Settings that identify the saved word2vec model.
EMBEDDING_DIM = 300  # Word vector dimensionality
MIN_WORD_COUNT = 40  # Minimum word count. Kaggle set to 40, to avoid attaching too much importance to individual movie titles.
NUM_THREADS = 4  # Number of threads to run in parallel
CONTEXT = 10  # Context window size
DOWNSAMPLING = 1e-3  # Downsample setting for frequent words

# The model file name encodes the dimensionality, minimum count and context.
WORD2VEC_MODEL_FILE = "".join([
    BASE_DIR,
    "word2vec_model_",
    str(EMBEDDING_DIM), "dim_",
    str(MIN_WORD_COUNT), "minwords_",
    str(CONTEXT), "context",
])

word2vec_model = Word2Vec.load(WORD2VEC_MODEL_FILE)

# The model is not trained any further in this notebook, so init_sims is
# called to make it more memory-efficient.
word2vec_model.init_sims(replace=True)

print("Number of words in the word2vec model vocabulary: %d" % len(word2vec_model.wv.vocab))

Number of words in the word2vec model vocabulary: 16490


Prepare

In [6]:
def _clean_review_batch(reviews, label, report_every=5000):
    """Clean every review in `reviews`, printing progress as it goes.

    Args:
        reviews: Sized iterable of raw review strings.
        label: Prefix for the progress message ("Train" or "Test").
        report_every: Print a progress line every this many reviews.

    Returns:
        A list of cleaned review strings, in input order. Digits are spelled
        out rather than removed (remove_numbers=False).
    """
    total = len(reviews)
    cleaned = []
    for position, review in enumerate(reviews):
        cleaned.append(review_to_clean_review(review, remove_numbers=False))
        if position % report_every == 0:
            print("%s review %d of %d" % (label, position, total))
    return cleaned


# Getting clean reviews from training set
train_clean_reviews = _clean_review_batch(labeled_train["review"], "Train")

# Getting clean reviews from testing set
test_clean_reviews = _clean_review_batch(test["review"], "Test")

all_clean_reviews = train_clean_reviews + test_clean_reviews

Train review 0 of 25000


KeyboardInterrupt: 

In [None]:
# print(train_clean_reviews[0:2])
# print(test_clean_reviews[0:2])

In [None]:
# We vectorize the text corpus by turning each text into a sequence of integers
# Each integer is the index of a token in the dictionary
from keras.preprocessing.text import Tokenizer

MAX_NUM_WORDS_FOR_KERAS_TOKENIZER = 200000

# num_words is the maximum number of words to keep, based on frequency.
keras_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS_FOR_KERAS_TOKENIZER)

# Build the vocabulary from the cleaned train and test reviews.
# fit_on_texts is given a list of strings here.
keras_tokenizer.fit_on_texts(all_clean_reviews)
word_index = keras_tokenizer.word_index
print('Keras Tokenizer found %s unique tokens' % len(word_index))

# Encode each cleaned review as a sequence of integer token indices.
train_sequences, test_sequences = (
    keras_tokenizer.texts_to_sequences(reviews)
    for reviews in (train_clean_reviews, test_clean_reviews)
)

In [None]:
# Peek at a random sample of the tokenizer vocabulary. The keys are copied
# into a list because np.random.choice needs a 1-D sequence, not a dict view.
print(np.random.choice(list(word_index), 1000))

In [None]:
# We pad all text sequences to the same length.
# By default zeros are padded at the front.
from keras.preprocessing.sequence import pad_sequences

# Fixed length that every review sequence is padded to.
MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN = 500

train_pad_sequences, test_pad_sequences = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN)
    for sequences in (train_sequences, test_sequences)
)

In [None]:
# Prepare word embedding matrix

# Number of rows in the matrix: one per token index, capped at the tokenizer limit.
# The "+ 1" leaves room for index 0 (presumably unused by word_index; confirm
# against the Keras Tokenizer documentation).
num_words = min(MAX_NUM_WORDS_FOR_KERAS_TOKENIZER, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
words_not_found = []
for word, i in word_index.items():
    if i >= num_words:
        # word_index can hold more entries than the cap; such indices have no
        # row in the matrix and would raise an IndexError.
        continue
    if word in word2vec_model.wv.vocab:
        embedding_matrix[i] = word2vec_model.wv.get_vector(word)
    else:
        words_not_found.append(word)
# Null word embeddings are rows left as zero vectors because the word has no
# word2vec vector.
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

In [None]:
print(np.random.choice(words_not_found, 1000))

In [None]:
# Split train_sequences into train and validation. Ratio: 80/20
VALIDATION_SPLIT = 0.2
np.random.seed(1234)  # fixed seed so the split is reproducible

# Shuffle the row positions once, then cut at the 80% mark.
perm = np.random.permutation(len(train_sequences))
split_at = int(len(train_sequences) * (1 - VALIDATION_SPLIT))
index_train = perm[:split_at]
index_val = perm[split_at:]

x_train = train_pad_sequences[index_train]
x_val = train_pad_sequences[index_val]
# perm holds row positions, so select labels positionally with .iloc rather
# than by index label.
y_train = labeled_train["sentiment"].iloc[index_train].tolist()
y_val = labeled_train["sentiment"].iloc[index_val].tolist()

print('Randomly split %d pad sequences for training, %d for validation' % (len(x_train) ,len(x_val)))


In [None]:
# Test-set model inputs and the sentiment labels stored on the test frame.
x_test, y_test = test_pad_sequences, test["sentiment"]

## Model Architecture

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Conv1D, MaxPooling1D ,GlobalMaxPooling1D

In [None]:
def layer1_cnn(dropout=0.2, num_filters=64, kernel_size=2):
    """Build and compile a single-convolution binary classifier.

    The embedding layer is initialised from the module-level embedding_matrix
    and is not trainable.

    Args:
        dropout: Rate of the dropout layer placed after the 32-unit dense layer.
        num_filters: Number of Conv1D filters.
        kernel_size: Conv1D kernel size.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    model = Sequential()

    frozen_embedding = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN,
        trainable=False)
    sigmoid_head = Dense(1, activation='sigmoid')

    # Layers in the order they are stacked; the sigmoid head goes last.
    stack = [
        frozen_embedding,
        Conv1D(filters=num_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1),
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),
        Dropout(dropout),
        sigmoid_head,
    ]
    for layer in stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def layer3_cnn(dropout=0.2, num_filters=64):
    """Build and compile a three-convolution binary classifier.

    Three Conv1D layers (kernel size 2), the first two each followed by
    max-pooling of size 2, then global max-pooling, a 32-unit dense layer,
    dropout, and a sigmoid output. The embedding layer is initialised from the
    module-level embedding_matrix and is not trainable.

    Args:
        dropout: Rate of the dropout layer before the output. Defaults to 0.2,
            the value that was previously hard-coded.
        num_filters: Number of filters in each Conv1D layer. Defaults to 64,
            the value that was previously hard-coded.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    model = Sequential()

    embedding_layer = Embedding(
            num_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN,
            trainable=False)
    output_layer = Dense(1, activation='sigmoid')

    model.add(embedding_layer)
    model.add(Conv1D(filters=num_filters, kernel_size=2, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(filters=num_filters, kernel_size=2, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(filters=num_filters, kernel_size=2, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout))
    model.add(output_layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Build the single-convolution model with its default arguments and list its layers.
layer1_cnn_model = layer1_cnn()
layer1_cnn_model.summary()

In [None]:
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer1_cnn_model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

In [None]:
# Build the three-convolution model and list its layers.
layer3_cnn_model = layer3_cnn()
layer3_cnn_model.summary()

In [None]:
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer3_cnn_model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

In [None]:
# Same single-convolution architecture, with the dropout rate raised to 0.5.
layer1_cnn_model_dropout5 = layer1_cnn(dropout=0.5)
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer1_cnn_model_dropout5.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

In [None]:
# Run each trained model on the test inputs; these raw outputs are
# thresholded to 0/1 by to_binary in a later cell.
y_test_pred_layer1_cnn = layer1_cnn_model.predict(x_test)
y_test_pred_layer1_cnn_dropout5 = layer1_cnn_model_dropout5.predict(x_test)
y_test_pred_layer3_cnn = layer3_cnn_model.predict(x_test)

In [None]:
def to_binary(predicts):
    """Threshold predictions at 0.5.

    Args:
        predicts: Iterable of predictions; each element is compared with 0.5.

    Returns:
        A list with 1 for every element strictly greater than 0.5 and 0
        otherwise. A real list is returned (not a lazy map object) so the
        result has a length and can be iterated more than once on Python 3.
    """
    return [1 if predict > 0.5 else 0 for predict in predicts]

In [None]:
# Replace each model's raw test outputs with 0/1 labels (to_binary thresholds at 0.5).
y_test_pred_layer1_cnn = to_binary(y_test_pred_layer1_cnn)
y_test_pred_layer1_cnn_dropout5 = to_binary(y_test_pred_layer1_cnn_dropout5)
y_test_pred_layer3_cnn = to_binary(y_test_pred_layer3_cnn)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [None]:
def print_auc_score(model_name, y_test_pred):
    """Print the ROC AUC of `y_test_pred` against the module-level `y_test`.

    Args:
        model_name: Label inserted into the printed message.
        y_test_pred: Predictions for the test set, in the same order as y_test.

    NOTE(review): the calling cells pass 0/1 labels produced by to_binary;
    ROC AUC is usually computed from continuous scores -- confirm this is intended.
    """
    auc = roc_auc_score(y_test, y_test_pred)
    print("The AUC score for %s is : %.4f." % (model_name, auc))

In [None]:
# Report the test AUC of the three models trained so far, in training order.
for model_label, test_labels in (
        ("layer1 cnn", y_test_pred_layer1_cnn),
        ("layer1 cnn dropout 0.5", y_test_pred_layer1_cnn_dropout5),
        ("layer3 cnn", y_test_pred_layer3_cnn)):
    print_auc_score(model_label, test_labels)

In [None]:
# NOTE(review): the variable name says "32f" but num_filters=128 is passed --
# confirm which one is intended.
layer1_cnn_32f_5d = layer1_cnn(dropout=0.5, num_filters=128)
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer1_cnn_32f_5d.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

In [None]:
def layer1_cnn_d(dropout=0.2, num_filters=64, kernel_size=2):
    """Build and compile a single-convolution classifier with two dropout layers.

    Same stack as layer1_cnn, plus an extra dropout layer between the global
    max-pooling and the 32-unit dense layer. The embedding layer is initialised
    from the module-level embedding_matrix and is not trainable.

    Args:
        dropout: Rate used by both dropout layers.
        num_filters: Number of Conv1D filters.
        kernel_size: Conv1D kernel size.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    model = Sequential()

    frozen_embedding = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN,
        trainable=False)
    sigmoid_head = Dense(1, activation='sigmoid')

    # Layers in the order they are stacked; the sigmoid head goes last.
    stack = [
        frozen_embedding,
        Conv1D(filters=num_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1),
        GlobalMaxPooling1D(),
        Dropout(dropout),
        Dense(32, activation='relu'),
        Dropout(dropout),
        sigmoid_head,
    ]
    for layer in stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Two-dropout variant with its default arguments.
layer1_cnn_d_0 = layer1_cnn_d()
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer1_cnn_d_0.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

In [None]:
# Predict on the test set, threshold to 0/1 labels, then report the AUC.
y_test_pred_layer1_cnn_d_0 = to_binary(layer1_cnn_d_0.predict(x_test))
print_auc_score("layer1 cnn_d", y_test_pred_layer1_cnn_d_0)

In [None]:
# Two-dropout variant with both dropout rates raised to 0.5, trained for 20 epochs.
layer1_cnn_d_5d = layer1_cnn_d(dropout=0.5)
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer1_cnn_d_5d.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_val, y_val))

In [None]:
# Predict on the test set, threshold to 0/1 labels, then report the AUC.
y_test_pred_layer1_cnn_d_5d = layer1_cnn_d_5d.predict(x_test)
y_test_pred_layer1_cnn_d_5d = to_binary(y_test_pred_layer1_cnn_d_5d)
# The label names the dropout rate so this line can be told apart from the
# dropout-0.2 "layer1 cnn_d" result printed earlier.
print_auc_score("layer1 cnn_d dropout 0.5", y_test_pred_layer1_cnn_d_5d)

In [None]:
def layer3_cnn_1p(dropout=0.2):
    """Build and compile a three-convolution classifier with one pooling step.

    Three consecutive Conv1D layers (64 filters, kernel size 2) feed a single
    global max-pooling layer, followed by 64- and 32-unit dense layers, each
    with dropout, and a sigmoid output. The embedding layer is initialised from
    the module-level embedding_matrix and is not trainable.

    Args:
        dropout: Rate used by both dropout layers.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    model = Sequential()

    NUM_FILTERS = 64

    frozen_embedding = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH_FOR_KERAS_RNN,
        trainable=False)
    sigmoid_head = Dense(1, activation='sigmoid')

    model.add(frozen_embedding)
    # Three identical convolutions stacked with no pooling in between.
    for _ in range(3):
        model.add(Conv1D(filters=NUM_FILTERS, kernel_size=2, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    # Two dense layers of decreasing width, each followed by dropout.
    for units in (64, 32):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout))
    model.add(sigmoid_head)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Three-convolution, single-pooling model with dropout 0.5, trained for 20 epochs.
layer3_cnn_1p_0 = layer3_cnn_1p(dropout=0.5)
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents.
layer3_cnn_1p_0.fit(x_train, y_train, batch_size=64, epochs=20, validation_data=(x_val, y_val))