In [2]:
import re
import nltk
import random
import numpy as np

In [3]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
np.random.seed(1000)
nb_word_class = 5
batch_size = 328

## Tag the training data

In [5]:
def clean_sentence(s):
    c = s.lower().strip()
    return re.sub('[^a-z ]', '', c)

In [6]:
def translate_word_class(tag):
    if tag in ['JJ', 'JJR', 'JJS']:
        return 1 #'adjective'
    if tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return 2 #'noun'
    if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 3 #'verb'
    if tag in ['CC', 'IN']:
        return 4 #'link'
    return 0 #'other'

In [7]:
def map_tagging(sentence):
    translation = list()
    for word, tag in sentence:
        translation.append((word, translate_word_class(tag)))
    return translation

In [8]:
def map_encoding(sentence):
    encoded = list()
    for word, tag in sentence:
        encodedInt = one_hot(word,30000)[0]
        encoded.append([encodedInt,tag])
    return encoded        

In [9]:
#END = 5
def sentence_labeling(sentence):
    labels = list()
    for word, tag in sentence[1:]:
        labels.append(tag)
    #labels.append(END)
    return labels

In [10]:
def tag_words(sentences):
    tagged_words = list()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_words.append(nltk.pos_tag(words))
    return tagged_words

In [11]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [12]:
def create_Ymatrix(numbers):
    matrix = np.zeros((len(numbers),1))
    i = 0
    for number in numbers:
        matrix[i] = number
        i=i+1
    return matrix

In [13]:
def create_Xmatrix(numbers,size,var_index):
    matrix = np.zeros((size,1))
    sub_matrix = np.zeros((1))
    i = 0
    for number in numbers[1:]:
        sub_matrix[0] = number[var_index]
        matrix[i][0] = sub_matrix
        i=i+1
    return matrix

In [14]:
def remove_ending_word(tagged_sentences):
    remove_ending_words = list()
    for words in tagged_sentences:
        remove_ending_words.append(words[:-1])
    return remove_ending_words

In [46]:
train_sentences = list()
with open('./data_set/training_set75.txt') as train:
    for line in train:
        train_sentences.append(clean_sentence(line))

test_sentences = list()
with open('./data_set/test_set75.txt') as train:
    for line in train:
        test_sentences.append(clean_sentence(line))


In [47]:
train_sentences.reverse()

tagged_sentences = tag_words(train_sentences)
tagged_sentences_test = tag_words(test_sentences)

In [48]:
max(map(len, tagged_sentences))

11

In [49]:
print(len(tagged_sentences))
tagged_sentences_test[:5]

75


[[('no', 'DT'), ('its', 'PRP$'), ('wrong', 'NN')],
 [('he', 'PRP'), ('joined', 'VBD'), ('a', 'DT'), ('gang', 'NN')],
 [('sam', 'NN'), ('sold', 'VBD'), ('her', 'PRP'), ('car', 'NN')],
 [('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('apple', 'NN')],
 [('he', 'PRP'), ('is', 'VBZ'), ('not', 'RB'), ('careful', 'JJ')]]

In [50]:
my_tagged_sentences = list(map(map_tagging, tagged_sentences))
test_tagged_sentences = list(map(map_tagging, tagged_sentences_test))

In [51]:
my_tagged_sentences[:5]
my_tagged_sentences_no_ending=remove_ending_word(my_tagged_sentences)
test_tagged_sentences_no_ending=remove_ending_word(test_tagged_sentences)
print(my_tagged_sentences_no_ending[:7])

[[('she', 0), ('waited', 3), ('for', 4), ('an', 0), ('hour', 2), ('to', 0), ('ride', 3), ('the', 0), ('largest', 1), ('roller', 2)], [('rachel', 2), ('received', 3), ('a', 0), ('call', 2), ('from', 4), ('the', 0), ('doctor', 2), ('that', 4), ('she', 0)], [('rob', 2), ('felt', 3), ('refreshed', 3), ('after', 4), ('his', 0)], [('she', 0), ('decided', 3), ('that', 4), ('she', 0), ('would', 0), ('travel', 3)], [('cara', 2), ('told', 3), ('the', 0), ('owner', 2), ('that', 4), ('the', 0), ('food', 2), ('was', 3)], [('kate', 3), ('went', 3), ('online', 1), ('and', 4), ('ordered', 3), ('her', 0)], [('rachel', 2), ('adpoted', 3), ('a', 0), ('cat', 2), ('over', 4), ('ten', 0)]]


In [52]:
x = flatten(map(map_encoding, my_tagged_sentences_no_ending))
print(x[:7])
test = flatten(map(map_encoding, test_tagged_sentences_no_ending))


[[21864, 0], [26355, 3], [23631, 4], [21515, 0], [15555, 2], [27792, 0], [10503, 3]]


In [53]:
x_train = create_Xmatrix(x,len(x),0)
y_train = to_categorical(flatten(map(sentence_labeling,my_tagged_sentences)),nb_word_class)
x_test = create_Xmatrix(test,len(test),0)
y_test = to_categorical(flatten(map(sentence_labeling,test_tagged_sentences)),nb_word_class)
print(len(x))

306


In [58]:
print('Build model...')
model = Sequential()
model.add(Embedding(30000,256, dropout=0.2))
model.add(LSTM(16, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(nb_word_class))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=len(x), nb_epoch=15,
         validation_data=(x_test, y_test),shuffle=False)
score, acc = model.evaluate(x_test, y_test,
                            batch_size=len(x))
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 306 samples, validate on 366 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.58075562852
Test accuracy: 0.595628434517


Build model...
Train...


Exception: Input arrays should have the same number of samples as target arrays. Found 306 input samples and 231 target samples.