# Data pre-processing

In [66]:
import numpy as np
import re
import itertools
from collections import Counter

"""
The processing scripts below are adapted from
https://github.com/dennybritz/cnn-text-classification-tf
"""

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' ` becomes a space;
      2. the clitics 's, 've, n't, 're, 'd and 'll are split off with a leading space;
      3. , ! ( ) ? are surrounded by spaces so they become separate tokens;
      4. runs of whitespace collapse to one space, and the result is stripped
         and lower-cased.

    Returns the cleaned string; tokens are separated by single spaces.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Literal substitutions, so plain str.replace is enough. The order is kept
    # from the original implementation.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # The previous replacement strings " \( ", " \) " and " \? " were invalid
    # escape sequences and left a literal backslash in front of the token
    # (e.g. "\("). Emit the bare punctuation mark instead.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(files):
    """
    Loads data from files, splits the data into words and generates labels.

    Each entry of `files` is a path to a text file holding one example per
    line; the position of the file in the list is the class of its examples.

    Returns [data, label] where `data` is a list of token lists (one per
    line, cleaned with clean_str and split on single spaces) and `label` is
    a list of one-hot lists of length len(files), aligned with `data`.
    """
    num_labels = len(files)
    data = []
    label = []
    for class_index, path in enumerate(files):
        # The context manager closes the handle even if reading fails.
        with open(path, "r") as handle:
            lines = [line.strip() for line in handle]
        data += lines

        one_hot = [0] * num_labels
        one_hot[class_index] = 1
        # A fresh list per example, so mutating one label row cannot
        # silently change every other row of the same class.
        label += [list(one_hot) for _ in lines]

    data = [clean_str(sent).split(" ") for sent in data]
    return [data, label]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    `sentences` is a sequence of token lists; `padding_word` is appended to
    the end of each shorter sentence. The input lists are not modified.

    Returns a new list of padded sentences (an empty list for empty input).
    """
    # max() raises ValueError on an empty sequence; there is nothing to pad.
    if len(sentences) == 0:
        return []

    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.

    Indices follow the sorted order of the distinct words.
    Returns [vocabulary, vocabulary_inv]: a dict from word to index and the
    list of words ordered by index.
    """
    # Distinct words across every sentence, in sorted order (index -> word).
    distinct_words = set(itertools.chain.from_iterable(sentences))
    vocabulary_inv = sorted(distinct_words)

    # Inverse lookup (word -> index).
    vocabulary = {}
    for index, word in enumerate(vocabulary_inv):
        vocabulary[word] = index
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.

    Every word is replaced by vocabulary[word]; a word missing from the
    vocabulary raises KeyError.
    Returns [x, y] as numpy arrays built from the index rows and the labels.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[word] for word in sentence])
    return [np.array(index_rows), np.array(labels)]


def load_data(files):
    """
    Loads and preprocesses the data found in `files`.

    Pipeline: read and tokenize the files, pad every sentence to a common
    length, build the vocabulary from the padded sentences, then map words
    and labels to arrays.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    raw_sentences, labels = load_data_and_labels(files)
    padded = pad_sentences(raw_sentences)
    vocab_pair = build_vocab(padded)
    vocabulary = vocab_pair[0]
    vocabulary_inv = vocab_pair[1]
    arrays = build_input_data(padded, labels, vocabulary)
    return [arrays[0], arrays[1], vocabulary, vocabulary_inv]


# RNN training

In [None]:
from __future__ import print_function
import numpy as np
import random
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb

import data_helpers as dh

print('Loading data...')
# Specify your files first; the position of a file in the list is its class.
files = ["rt-polarity.pos","rt-polarity.neg"]
[x, y, vocabulary, vocabulary_inv] = load_data(files)

# Random 60/40 train/test split.
# The split is drawn from the stdlib `random` module, which the NumPy seed
# does not cover, so seed it explicitly to make the split repeatable.
random.seed(1337)
traning_size = int(0.6 * len(x))
trainIndex = sorted(random.sample(range(len(x)), traning_size))
# Set iteration order is not guaranteed; sort so the test rows keep file order.
testIndex = sorted(set(range(len(x))).difference(trainIndex))

X_train = x[trainIndex]
X_test  = x[testIndex]

y_train = y[trainIndex]
y_test  = y[testIndex]

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)


print('Build model...')
model = Sequential()
# Embedding over the whole vocabulary (including the padding token);
# input length is the padded sentence length.
model.add(Embedding(len(vocabulary), 128, input_length=x.shape[1], dropout=0.2))
model.add(LSTM(128, return_sequences=True, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
# One output unit per class. Labels are one-hot and mutually exclusive, so
# use softmax with categorical cross-entropy. With sigmoid and binary
# cross-entropy the reported accuracy is computed per output element rather
# than per example.
model.add(Dense(y.shape[1]))
model.add(Activation('softmax'))

batch_size = 32

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
print(X_train.shape)
print(y_train.shape)
# NOTE(review): the held-out set is used both for per-epoch validation and
# for the final score below, so the reported test accuracy is not from an
# independent evaluation set.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...
6397 train sequences
4265 test sequences
X_train shape: (6397, 56)
X_test shape: (4265, 56)
Build model...
Train...
(6397, 56)
(6397, 2)
Train on 6397 samples, validate on 4265 samples
Epoch 1/10