In [None]:
import re
import nltk
import random
import numpy as np

In [None]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.utils.np_utils import to_categorical

In [None]:
# Seed every random source this notebook draws from, so that a fresh
# "Restart & Run All" produces the same results each time.
SEED = 7
np.random.seed(SEED)
random.seed(SEED)  # stdlib generator: random.shuffle is applied to the training sentences later on
# NOTE(review): the Keras backend may hold its own RNG state that is not seeded here -- confirm.

# Number of target classes: the word-class codes 0-4 returned by
# translate_word_class plus the end-of-sentence label END (5).
nb_word_class = 6

## Tag the training data

In [None]:
def clean_sentence(s):
    """Normalise one raw line of text.

    The line is lower-cased and stripped of leading/trailing whitespace,
    after which every character that is not ``a``-``z`` or a space is
    deleted. Stripping happens before the deletion, so spaces that end up
    at the edges once punctuation is removed are kept.
    """
    normalised = s.lower().strip()
    not_letter_or_space = re.compile('[^a-z ]')
    return not_letter_or_space.sub('', normalised)

In [None]:
def translate_word_class(tag):
    """Collapse a POS tag into one of five coarse integer word classes.

    Returns:
        1 for adjectives, 2 for nouns, 3 for verbs, 4 for linking words
        (coordinating conjunctions / prepositions) and 0 for every other tag.
    """
    word_classes = (
        (1, ('JJ', 'JJR', 'JJS')),                       # adjective
        (2, ('NN', 'NNS', 'NNP', 'NNPS')),               # noun
        (3, ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),  # verb
        (4, ('CC', 'IN')),                               # link
    )
    for code, members in word_classes:
        if tag in members:
            return code
    return 0  # other

In [None]:
def map_tagging(sentence):
    """Replace the POS tag of every (word, tag) pair by its coarse class.

    Args:
        sentence: iterable of (word, tag) pairs.

    Returns:
        A list of (word, word_class) tuples in the same order, where
        word_class is the integer given by translate_word_class.
    """
    return [(token, translate_word_class(pos)) for token, pos in sentence]

In [None]:
def map_encoding(sentence):
    """Turn every (word, tag) pair into an [integer_id, tag] list.

    The integer id is the first element of what Keras' ``one_hot`` returns
    for the word with a vocabulary size of 30000; the tag is passed through
    unchanged.
    """
    return [[one_hot(token, 30000)[0], word_class] for token, word_class in sentence]

In [None]:
# Label given to the last word of a sentence: there is no following word
# whose class could be used as the target.
END = 5
def sentence_labeling(sentence):
    """Build the next-word-class target for every word of a sentence.

    The label of word ``i`` is the tag of word ``i + 1``; the final word is
    labelled ``END``. The result always has exactly one label per word so it
    stays aligned with the rows produced by ``map_encoding``.

    Args:
        sentence: list of (word, tag) pairs.

    Returns:
        A list of integer labels with ``len(sentence)`` entries. An empty
        sentence (e.g. from a blank input line) yields an empty list rather
        than a lone ``END``, which would otherwise shift every later label
        against its feature row.
    """
    if len(sentence) == 0:
        return []
    labels = [tag for _, tag in sentence[1:]]
    labels.append(END)
    return labels

In [None]:
def tag_words(sentences):
    """Tokenise and POS-tag each sentence with NLTK.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list with one entry per sentence, each entry being the result of
        ``nltk.pos_tag`` applied to that sentence's ``nltk.word_tokenize``
        tokens.
    """
    return [nltk.pos_tag(nltk.word_tokenize(text)) for text in sentences]

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list.

    Only one level of nesting is removed; ``l`` may be any iterable
    (including a ``map`` object) of iterables.
    """
    merged = []
    for group in l:
        merged.extend(group)
    return merged

In [None]:
def create_Ymatrix(numbers):
    """Stack a sequence of numbers into a float column matrix.

    Args:
        numbers: sized sequence of numeric values.

    Returns:
        A NumPy array of shape ``(len(numbers), 1)`` whose row ``k`` holds
        ``numbers[k]``.
    """
    column = np.zeros((len(numbers), 1))
    for row, value in enumerate(numbers):
        column[row] = value
    return column

In [None]:
def create_Xmatrix(numbers,size,var_index):
    """Extract one field of every record into a float column matrix.

    Args:
        numbers: iterable of indexable records (e.g. ``[word_id, tag]``).
        size: number of rows to allocate; rows beyond the number of records
            stay zero. Must be at least the number of records, otherwise
            the row assignment raises ``IndexError``.
        var_index: index of the field to copy out of each record.

    Returns:
        A NumPy array of shape ``(size, 1)`` whose row ``k`` holds
        ``numbers[k][var_index]``.
    """
    matrix = np.zeros((size, 1))
    for row, record in enumerate(numbers):
        # Store the scalar directly. Routing it through a 1-element array
        # depends on implicit array-to-scalar conversion, which NumPy has
        # deprecated.
        matrix[row, 0] = record[var_index]
    return matrix

In [None]:
def load_sentences(path):
    """Read a text file and return its lines as cleaned sentences.

    Every line of the file is passed through ``clean_sentence``; the result
    keeps one list entry per line, in file order.
    """
    with open(path) as handle:
        return [clean_sentence(line) for line in handle]


# Both splits go through the same loader, so they are cleaned identically.
train_sentences = load_sentences('./data_set/training_set70.txt')
test_sentences = load_sentences('./data_set/test_set25.txt')


In [None]:
# POS-tag the cleaned sentences of the training and the test split.
tagged_sentences = tag_words(sentences=train_sentences)
tagged_sentences_test = tag_words(sentences=test_sentences)

In [None]:
# Only the last expression of a cell is rendered, so both previews are
# bundled into one tuple: (first five training sentences, first five test
# sentences).
tagged_sentences[:5], tagged_sentences_test[:5]

In [None]:
# Reduce the POS tags of every sentence to the coarse integer word classes.
my_tagged_sentences = [map_tagging(tagged) for tagged in tagged_sentences]
test_tagged_sentences = [map_tagging(tagged) for tagged in tagged_sentences_test]

In [None]:
# Put the training sentences in reverse file order. The list is rebuilt
# from `tagged_sentences` instead of being reversed in place, so running
# this cell a second time cannot flip the order back (the model is later
# fitted with shuffle=False, so the order matters).
my_tagged_sentences = [map_tagging(tagged) for tagged in reversed(tagged_sentences)]
print(my_tagged_sentences[2])

In [None]:
# Encode every word as [integer_id, word_class] and concatenate all
# sentences of a split into one flat list of words.
x = flatten(map_encoding(tagged) for tagged in my_tagged_sentences)
test = flatten(map_encoding(tagged) for tagged in test_tagged_sentences)


In [None]:
# Preview the training labels (one per word) as a single column matrix.
create_Ymatrix(flatten(sentence_labeling(tagged) for tagged in my_tagged_sentences))

In [None]:
# Features: field 0 (the integer word id) of every encoded word, one row per word.
x_train = create_Xmatrix(numbers=x, size=len(x), var_index=0)
x_test = create_Xmatrix(numbers=test, size=len(test), var_index=0)

# Targets: the per-word labels of every sentence, passed through
# to_categorical with nb_word_class classes.
y_train = to_categorical(
    flatten(sentence_labeling(tagged) for tagged in my_tagged_sentences),
    nb_word_class)
y_test = to_categorical(
    flatten(sentence_labeling(tagged) for tagged in test_tagged_sentences),
    nb_word_class)

# Uncomment to inspect the resulting arrays:
#print('X_train shape:', x_train.shape)
#print('X_test shape:', x_test.shape)
#print('y_train shape:', y_train.shape)
#print('y_test shape:', y_test.shape)
#print(y_train)
#print(x_test)

In [None]:
# Hyper-parameters that have to agree between several calls below.
VOCAB_SIZE = 30000  # must match the vocabulary size passed to one_hot in map_encoding
BATCH_SIZE = 328    # used for both fitting and evaluation

# NOTE(review): `dropout=` on Embedding, `dropout_W`/`dropout_U` and
# `nb_epoch` look like Keras 1 argument names; they are kept as-is because
# the notebook also imports keras.utils.np_utils -- confirm the installed
# Keras version before renaming them.
print('Build model...')
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 256, dropout=0.2))
model.add(LSTM(8, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(nb_word_class))
# Every target row is one-hot (exactly one class per word), so the output
# layer has to produce a distribution over the classes: softmax rather than
# independent sigmoids, to match the categorical_crossentropy loss.
model.add(Activation('softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# shuffle=False keeps the word order of the flattened sentences.
# The test split doubles as validation data here, so the evaluation below
# reports on data that was already used for per-epoch validation.
model.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epoch=20,
          validation_data=(x_test, y_test), shuffle=False)
score, acc = model.evaluate(x_test, y_test,
                            batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Two reordered copies of the training sentences; the original list is
# left untouched.
reverse_train_sentences = train_sentences[::-1]  # last line of the file first

random_train_sentences = train_sentences[:]
random.shuffle(random_train_sentences)  # shuffles the copy in place
