In [3]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

Using TensorFlow backend.


In [4]:
# load doc into memory
def load_doc(filename):
	"""Read a whole text file and return its contents as a single string.

	Args:
		filename: path of the file to read (opened in text mode, read only).

	Returns:
		The complete text of the file.

	Raises:
		OSError: if the file cannot be opened (e.g. FileNotFoundError).
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc, vocab):
	"""Reduce a document to its in-vocabulary words.

	The text is split on whitespace, every punctuation character is removed
	from each piece, and only pieces found in ``vocab`` are kept.

	Args:
		doc: raw document text.
		vocab: container of allowed words (membership is tested with ``in``).

	Returns:
		The surviving words joined by single spaces.
	"""
	# translation table that deletes every punctuation character
	strip_punct = str.maketrans('', '', punctuation)
	kept = []
	for raw in doc.split():
		word = raw.translate(strip_punct)
		# drop anything that is not a known vocabulary entry
		if word in vocab:
			kept.append(word)
	return ' '.join(kept)

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	"""Load and clean every document of one split found in a directory.

	Files whose name starts with 'cv9' form the test split; all other files
	form the training split.

	Args:
		directory: folder containing the review files.
		vocab: allowed words, forwarded to ``clean_doc``.
		is_trian: truthy to select the training split, falsy to select the
			test split (the spelling is kept as-is for compatibility).

	Returns:
		A list with one cleaned, space-joined token string per selected file,
		in the order returned by ``listdir``.
	"""
	def belongs_to_split(filename):
		# 'cv9*' files are held out for testing
		held_out = filename.startswith('cv9')
		# training keeps the non-held-out files, testing keeps the held-out ones
		return held_out != bool(is_trian)

	return [
		clean_doc(load_doc(directory + '/' + filename), vocab)
		for filename in listdir(directory)
		if belongs_to_split(filename)
	]

In [5]:
# read the vocabulary file and keep its whitespace-separated entries as a set
vocab_filename = 'data/output/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# training split of each class (third argument True selects training files)
negative_docs = process_docs('data/txt_sentoken/neg', vocab, True)
positive_docs = process_docs('data/txt_sentoken/pos', vocab, True)
# negatives come first, then positives
train_docs = negative_docs + positive_docs

In [6]:
# create the tokenizer used to turn words into integer indices
tokenizer = Tokenizer()
# fit the tokenizer on the training documents only; the same fitted tokenizer
# is reused later in this notebook to encode the test documents
tokenizer.fit_on_texts(train_docs)

In [7]:
# longest training document, measured in whitespace-separated tokens;
# every sequence is padded to this length further down
max_length = max(len(doc.split()) for doc in train_docs)
print("Max Sequence Length: {}".format(max_length))

Max Sequence Length: 1317


In [8]:
# sequence encode: each training document becomes a list of integer word indices
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad sequences at the end ('post') so every row has max_length entries
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# define training labels: 0 = negative, 1 = positive, in the same order as
# train_docs (negatives first). Counts come from the loaded lists instead of
# a hard-coded 900 so the labels follow the data actually on disk.
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))
# guards against stale state: negative_docs/positive_docs are rebound when the
# test reviews are loaded, so re-running this cell out of order must fail loudly
assert len(ytrain) == len(Xtrain), "labels and samples are misaligned; re-run from the cell that loads the training reviews"

print(Xtrain)
print(Xtrain.shape)

[[16192  1284  1297 ...     0     0     0]
 [  317    60     1 ...     0     0     0]
 [  537     4   295 ...     0     0     0]
 ...
 [  186     7    88 ...     0     0     0]
 [  462    76   974 ...     0     0     0]
 [  346   329   716 ...     0     0     0]]
(1800, 1317)


In [9]:
# load all test reviews (third argument False selects the 'cv9*' files)
positive_docs = process_docs('data/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('data/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs

# sequence encode with the tokenizer fitted on the training documents
encoded_docs = tokenizer.texts_to_sequences(test_docs)

# pad sequences to the same length as the training data
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# define test labels: 0 = negative, 1 = positive, in the same order as
# test_docs (negatives first). Counts come from the loaded lists instead of
# a hard-coded 100 so the labels follow the data actually on disk.
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))
assert len(ytest) == len(Xtest), "labels and samples are misaligned"

print(Xtest)
print(Xtest.shape)

[[  38   49 1452 ...    0    0    0]
 [ 561 2124 1056 ...    0    0    0]
 [  20  116   42 ...    0    0    0]
 ...
 [ 644   86  535 ...    0    0    0]
 [1022  115   36 ...    0    0    0]
 [6160   67 1365 ...    0    0    0]]
(200, 1317)


In [10]:
# define vocabulary size for the Embedding layer: the number of distinct words
# known to the tokenizer plus one extra slot. The padded arrays above use 0 as
# the fill value, so presumably word indices start at 1 and 0 is reserved for
# padding (confirm against the Keras Tokenizer documentation).
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Size: {}".format(vocab_size))

Vocab Size: 25768


In [11]:
def build_model(embedding_dim=100, filters=32, kernel_size=8, pool_size=2, hidden_units=10):
    """Build the (uncompiled) 1D-CNN binary classifier.

    The layers are: Embedding -> Conv1D (relu) -> MaxPooling1D -> Flatten ->
    Dense (relu) -> Dense(1, sigmoid). The defaults reproduce the original
    hard-coded architecture, so ``build_model()`` behaves as before.

    Reads the notebook-level globals ``vocab_size`` and ``max_length`` for the
    embedding's input dimension and input length.

    Args:
        embedding_dim: size of each word embedding vector.
        filters: number of convolution filters.
        kernel_size: width of the convolution window, in tokens.
        pool_size: window size of the max-pooling layer.
        hidden_units: number of units in the hidden dense layer.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(hidden_units, activation='relu'))
    # single sigmoid unit: one value in (0, 1) per document
    model.add(Dense(1, activation='sigmoid'))

    return model

model = build_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
_________________________________________________________________


In [12]:
# compile network: binary cross-entropy pairs with the single sigmoid output
# unit and the 0/1 labels; accuracy is tracked as an additional metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Disabled callback setup (checkpointing, early stopping, TensorBoard logging).
# NOTE(review): as written it cannot simply be uncommented — `verbose` is not
# defined anywhere in this notebook, `batch_size` and `callbacks_list` are never
# passed to fit(), and the callbacks monitor 'val_loss' while the fit() call
# below supplies no validation data.
#file_path="weights.hdf5"
#batch_size = 512

#checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=verbose, save_best_only=True, mode='min', period=1)
#early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
#tensorboard = TensorBoard(log_dir='logs', histogram_freq=1)

#callbacks_list = [checkpoint, early, tensorboard]

# train for a fixed 10 epochs on the full training set, without callbacks
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

In [None]:
# evaluate on the held-out test set; the result is unpacked as the loss
# followed by the accuracy metric requested in compile()
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# accuracy is multiplied by 100 to display it as a percentage
print('Test Accuracy: %f' % (acc*100))