In [25]:
import re
import nltk
import random
import numpy as np

In [2]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [3]:
np.random.seed(7)
nb_word_class = 6

## Tag the training data

In [4]:
def clean_sentence(s):
    c = s.lower().strip()
    return re.sub('[^a-z ]', '', c)

In [5]:
def translate_word_class(tag):
    if tag in ['JJ', 'JJR', 'JJS']:
        return 1 #'adjective'
    if tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return 2 #'noun'
    if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 3 #'verb'
    if tag in ['CC', 'IN']:
        return 4 #'link'
    return 0 #'other'

In [6]:
def map_tagging(sentence):
    translation = list()
    for word, tag in sentence:
        translation.append((word, translate_word_class(tag)))
    return translation

In [7]:
def map_encoding(sentence):
    encoded = list()
    for word, tag in sentence:
        encodedInt = one_hot(word,30000)[0]
        encoded.append([encodedInt,tag])
    return encoded        

In [8]:
END = 5
def sentence_labeling(sentence):
    labels = list()
    for word, tag in sentence[1:]:
        labels.append(tag)
    labels.append(END)
    return labels

In [9]:
def tag_words(sentences):
    tagged_words = list()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_words.append(nltk.pos_tag(words))
    return tagged_words

In [10]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [11]:
def create_Ymatrix(numbers):
    matrix = np.zeros((len(numbers),1))
    i = 0
    for number in numbers:
        matrix[i] = number
        i=i+1
    return matrix

In [12]:
def create_Xmatrix(numbers,size,var_index):
    matrix = np.zeros((size,1))
    sub_matrix = np.zeros((1))
    i = 0
    for number in numbers:
        sub_matrix[0] = number[var_index]
        matrix[i][0] = sub_matrix
        i=i+1
    return matrix

In [13]:
train_sentences = list()
with open('./data_set/training_set70.txt') as train:
    for line in train:
        train_sentences.append(clean_sentence(line))

test_sentences = list()
with open('./data_set/test_set25.txt') as train:
    for line in train:
        test_sentences.append(clean_sentence(line))


In [14]:
tagged_sentences = tag_words(train_sentences)
tagged_sentences_test = tag_words(test_sentences)

In [15]:
tagged_sentences[:5]
tagged_sentences_test[:5]

[[('sam', 'JJ'), ('didnt', 'NNS'), ('like', 'IN'), ('elections', 'NNS')],
 [('dick', 'NN'),
  ('and', 'CC'),
  ('sam', 'NN'),
  ('were', 'VBD'),
  ('disappointed', 'JJ')],
 [('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('apple', 'NN')],
 [('yes', 'RB'), ('thats', 'NNS'), ('correct', 'VBP')],
 [('no', 'DT'), ('its', 'PRP$'), ('wrong', 'NN')]]

In [16]:
my_tagged_sentences = list(map(map_tagging, tagged_sentences))
test_tagged_sentences = list(map(map_tagging, tagged_sentences_test))

In [32]:
my_tagged_sentences[:5]
my_tagged_sentences.reverse()
print(my_tagged_sentences[2])

[('dick', 2), ('had', 3), ('a', 0), ('great', 1), ('time', 2), ('at', 4), ('the', 0), ('park', 2)]


In [18]:
x = flatten(map(map_encoding, my_tagged_sentences))
test = flatten(map(map_encoding, test_tagged_sentences))


In [19]:
create_Ymatrix(flatten(map(sentence_labeling,my_tagged_sentences)))

array([[ 5.],
       [ 3.],
       [ 5.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 1.],
       [ 0.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
       [ 2.],
       [ 0.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 4.],
       [ 0.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 2.],
       [ 5.],
       [ 0.],
       [ 3.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 0.],
       [ 5.],
       [ 2.],
       [ 3.],
       [ 3.],
       [ 5.],
       [ 3.],
       [ 1.],
       [ 5.],
       [ 3.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
       [ 3.],
       [ 0.],
       [ 2.],
       [ 5.],
      

In [20]:
x_train = create_Xmatrix(x,len(x),0)
y_train = to_categorical(flatten(map(sentence_labeling,my_tagged_sentences)),nb_word_class)
x_test = create_Xmatrix(test,len(test),0)
y_test = to_categorical(flatten(map(sentence_labeling,test_tagged_sentences)),nb_word_class)

#print('X_train shape:', x_train.shape)
#print('X_test shape:', x_test.shape)
#print('y_train shape:', y_train.shape)
#print('y_test shape:', y_test.shape)
#print(y_train)
#print(x_test)

In [21]:
print('Build model...')
model = Sequential()
model.add(Embedding(30000,256, dropout=0.2))
model.add(LSTM(8, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(nb_word_class))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=328, nb_epoch=20,
         validation_data=(x_test, y_test),shuffle=False)
score, acc = model.evaluate(x_test, y_test,
                            batch_size=328)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 328 samples, validate on 132 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 1.7603045702
Test accuracy: 0.674242436886


In [27]:
reverse_train_sentences = list(train_sentences)
reverse_train_sentences.reverse()

random_train_sentences = list(train_sentences)
random.shuffle(random_train_sentences)


['sandra went back to the bedroom', 'colin didnt like fish tacos', 'dick got a job', 'timmys room is a mess', 'she found her phone in the car', 'sorry thats not it', 'gina loved her grandma', 'anns fridge was broken', 'ann and tim loved astrology', 'sam quits her job', 'correct', 'they had a great tea party', 'tony was scared of the ocean', 'yes that is correct', 'sandra grabbed the milk there', 'colin drinks beer at the bar', 'yes thats right', 'mary put down the apple', 'john went to the garden', 'tom and dick loved their trucks', 'tony enjoyed going on the cruise', 'neil played hockey', 'where is the football', 'larry became careful', 'daniel left the apple', 'scott gave him a high five', 'mary travelled to the bathroom', 'sarah decided to move to europe', 'mary moved to the kitchen', 'his momther was concerned', 'mary got the football there', 'don hated elections', 'jason took a hot shower', 'dick didnt like theme parks', 'gary was a brave child', 'ann ate an apple', 'tim was enter