In [1]:
import re
import nltk
import numpy as np

In [2]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Activation, Embedding

Using TensorFlow backend.


In [3]:
np.random.seed(7)

## Tag the training data

In [4]:
def clean_sentence(s):
    c = s.lower().strip()
    return re.sub('[^a-z ]', '', c)

In [5]:
def translate_word_class(tag):
    if tag in ['JJ', 'JJR', 'JJS']:
        return 1 #'adjective'
    if tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return 2 #'noun'
    if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 3 #'verb'
    if tag in ['CC', 'IN']:
        return 4 #'link'
    return 5 #'other'

In [6]:
def map_tagging(sentence):
    translation = list()
    for word, tag in sentence:
        translation.append((word, translate_word_class(tag)))
    return translation

In [25]:
def map_encoding(sentence):
    encoded = list()
    for word, tag in sentence:
        encodedInt = one_hot(word,30000)[0]
        encoded.append([encodedInt,tag])
    return encoded        

In [26]:
END = 6
def sentence_labeling(sentence):
    labels = list()
    for word, tag in sentence[1:]:
        labels.append(tag)
    labels.append(END)
    return labels

In [27]:
def tag_words(sentences):
    tagged_words = list()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_words.append(nltk.pos_tag(words))
    return tagged_words

In [28]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [None]:
def create_Ymatrix(numbers,size,var_index):
    matrix = np.zeros((size))
    i = 0
    for number in numbers:
        matrix[i] = number[var_index]
        i=i+1
    return matrix

In [None]:
def create_Xmatrix(numbers,size,var_index):
    matrix = np.zeros((size,1))
    sub_matrix = np.zeros((1))
    i = 0
    for number in numbers:
        sub_matrix[0] = number[var_index]
        matrix[i][0] = sub_matrix
        i=i+1
    return matrix

In [29]:
train_sentences = list()
with open('./data_set/training_set70.txt') as train:
    for line in train:
        train_sentences.append(clean_sentence(line))

test_sentences = list()
with open('./data_set/test_set25.txt') as train:
    for line in train:
        test_sentences.append(clean_sentence(line))

In [30]:
train_sentences[:5]

['correct', 'dick agreed', 'sorry wrong', 'where is mary', 'he is happy now']

In [31]:
tagged_sentences = tag_words(train_sentences)
tagged_sentences_test = tag_words(test_sentences)

In [32]:
tagged_sentences[:5]
tagged_sentences_test[:5]

[[('sam', 'JJ'), ('didnt', 'NNS'), ('like', 'IN'), ('elections', 'NNS')],
 [('dick', 'NN'),
  ('and', 'CC'),
  ('sam', 'NN'),
  ('were', 'VBD'),
  ('disappointed', 'JJ')],
 [('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('apple', 'NN')],
 [('yes', 'RB'), ('thats', 'NNS'), ('correct', 'VBP')],
 [('no', 'DT'), ('its', 'PRP$'), ('wrong', 'NN')]]

In [33]:
my_tagged_sentences = list(map(map_tagging, tagged_sentences))
test_tagged_sentences = list(map(map_tagging, tagged_sentences_test))

In [34]:
my_tagged_sentences[:5]

[[('correct', 2)],
 [('dick', 2), ('agreed', 3)],
 [('sorry', 2), ('wrong', 1)],
 [('where', 5), ('is', 3), ('mary', 1)],
 [('he', 5), ('is', 3), ('happy', 1), ('now', 5)]]

In [35]:
x = flatten(map(map_encoding, my_tagged_sentences))
test = flatten(map(map_encoding, test_tagged_sentences))

In [52]:
x_train = create_Xmatrix(x,len(x),0)
y_train = create_Ymatrix(x,len(x),1)
x_test = create_Xmatrix(test,len(test),0)
y_test = create_Ymatrix(test,len(test),1)

print('X_train shape:', x_train.shape)
print('X_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
print(y_test)

X_train shape: (328, 1)
X_test shape: (132, 1)
y_train shape: (328,)
y_test shape: (132,)
[ 1.  2.  4.  2.  2.  4.  2.  3.  1.  5.  3.  5.  2.  5.  2.  3.  5.  5.
  2.  2.  3.  5.  2.  4.  5.  2.  2.  4.  2.  3.  5.  2.  5.  2.  3.  3.
  5.  3.  5.  1.  2.  3.  5.  2.  2.  4.  2.  3.  5.  5.  2.  2.  3.  5.
  2.  5.  3.  5.  3.  4.  5.  2.  2.  3.  1.  4.  3.  2.  3.  5.  2.  1.
  2.  3.  5.  2.  2.  5.  3.  2.  4.  5.  1.  3.  5.  2.  2.  3.  1.  4.
  5.  2.  2.  4.  1.  3.  5.  5.  2.  2.  5.  3.  4.  5.  2.  5.  3.  5.
  5.  5.  2.  5.  3.  5.  2.  4.  5.  2.  5.  4.  3.  3.  2.  4.  5.  2.
  5.  3.  5.  3.  5.  3.]


In [57]:
print('Build model...')
model = Sequential()
model.add(Embedding(30000,256, dropout=0.2))
model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=328, nb_epoch=10,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=328)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 328 samples, validate on 132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.137393459678
Test accuracy: 0.0606060624123
